In [8]:
import pandas as pd
# Never truncate columns when a DataFrame is displayed
pd.options.display.max_columns = None

In [15]:
# Read every tweet from disk; the file is stored in pandas' "split" JSON orientation
df = pd.read_json(path_or_buf="all_tweets.json", orient="split")

In [16]:
# Count missing values per column
df.isna().sum(axis=0)

id_str            0
screen_name       0
created_at        0
lang              0
source            0
retweet_count     0
favorite_count    0
is_retweet        0
full_text         0
dtype: int64

In [18]:
# Print column dtypes, non-null counts and memory usage for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264542 entries, 0 to 264541
Data columns (total 9 columns):
id_str            264542 non-null int64
screen_name       264542 non-null object
created_at        264542 non-null datetime64[ns]
lang              264542 non-null object
source            264542 non-null object
retweet_count     264542 non-null int64
favorite_count    264542 non-null int64
is_retweet        264542 non-null bool
full_text         264542 non-null object
dtypes: bool(1), datetime64[ns](1), int64(3), object(4)
memory usage: 18.4+ MB


In [19]:
# Store lang and source as pandas categoricals instead of plain object columns
df = df.astype({"lang": "category", "source": "category"})

In [20]:
# Print the summary again so the new category dtypes and memory usage are visible.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264542 entries, 0 to 264541
Data columns (total 9 columns):
id_str            264542 non-null int64
screen_name       264542 non-null object
created_at        264542 non-null datetime64[ns]
lang              264542 non-null category
source            264542 non-null category
retweet_count     264542 non-null int64
favorite_count    264542 non-null int64
is_retweet        264542 non-null bool
full_text         264542 non-null object
dtypes: bool(1), category(2), datetime64[ns](1), int64(3), object(2)
memory usage: 15.1+ MB


In [35]:
# Index labels of tweets whose text has "RT @" anywhere vs. only at the very start
full_text = df["full_text"]
contains_rt = list(df.index[full_text.str.contains("RT @").values])
startswith_rt = list(df.index[full_text.str.startswith("RT @").values])

In [28]:
# Number of rows flagged as retweets (True counts as 1 when summing a bool column)
int(df["is_retweet"].sum())

53438

In [38]:
# Index labels of tweets that contain "RT @" somewhere but do not start with it
contains_diff = list(set(contains_rt) - set(startswith_rt))

In [47]:
# Show the text of the first ten such tweets to see what they look like
for tweet_text in df.loc[contains_diff[:10], "full_text"]:
    print(tweet_text)

Yup RT @TheCoffee13: @JHarden13 are you watching game 6?
!!! RT @MarathonZO: Yo S/O @NipseyHussle showing love &amp; lacing me n @JHarden13 wit the Crenshaw gear..#TMC
Lol i cant cook RT @iamjeannerivera: @JHarden13 Cook for her!
We might shoot to the A this weekend!! RT @BlindFolksFilms: @PRIVALEDGE @JHarden13  Gotta Link up with yahll Soon Fam
Lol I did RT @FSDomino: @Edelman11 dude, did you just see Chara laugh in Smiths face?
Thank you! RT @arrbes7822: @JHarden13 you're soooo happy &amp;cute  ih the  Google+exclusive http://t.co/nOx7ub6bCp
Thanks bro! RT @DonteStallworth: @JHarden13 nice! Enjoy that one my dude.
Most def! We all would. RT @Pradajames: @JHarden13 I bet she'd LOVE an NBA season.
!!!!!!!! RT @lablab_conie_g: @JHarden13 Just give her your UNDYING LOVE! :D
!! RT @Adam24Williams: @JHarden13 a paid cruise to somewhere nice.


The discrepancy between tweets that merely contain `RT @` and those that start with it is due to quoted tweets: retweets in which the user prepends their own text in response to the original. I'm comfortable dropping these for now because, judging from the samples above, they carry little meaning of their own (e.g., statements of agreement or thanks rather than original tweets).

In [59]:
# Subset without retweets: keep only the rows whose is_retweet flag is False.
# NOTE(review): assumes is_retweet also flags the quoted tweets inspected above - confirm.
not_retweet = ~df["is_retweet"]
no_retweets = df[not_retweet]

In [60]:
# Drop the is_retweet column; it is False for every row kept in no_retweets.
# The result is assigned rather than using inplace=True: no_retweets was produced
# by boolean indexing on df, and mutating such a frame in place can raise
# SettingWithCopyWarning. columns= already names the axis, so axis=1 is omitted.
no_retweets = no_retweets.drop(columns=["is_retweet"])

In [61]:
# Print dtypes, non-null counts and memory usage of the retweet-free subset.
no_retweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 211104 entries, 0 to 264541
Data columns (total 8 columns):
id_str            211104 non-null int64
screen_name       211104 non-null object
created_at        211104 non-null datetime64[ns]
lang              211104 non-null category
source            211104 non-null category
retweet_count     211104 non-null int64
favorite_count    211104 non-null int64
full_text         211104 non-null object
dtypes: category(2), datetime64[ns](1), int64(3), object(2)
memory usage: 11.9+ MB


In [63]:
# Count missing values per column in the retweet-free subset
no_retweets.isna().sum(axis=0)

id_str            0
screen_name       0
created_at        0
lang              0
source            0
retweet_count     0
favorite_count    0
full_text         0
dtype: int64

In [65]:
# Renumber the rows 0..n-1; filtering left gaps in the original index labels
no_retweets = no_retweets.reset_index(drop=True)

In [66]:
# Print the summary again; the index line shows the result of the reset.
no_retweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211104 entries, 0 to 211103
Data columns (total 8 columns):
id_str            211104 non-null int64
screen_name       211104 non-null object
created_at        211104 non-null datetime64[ns]
lang              211104 non-null category
source            211104 non-null category
retweet_count     211104 non-null int64
favorite_count    211104 non-null int64
full_text         211104 non-null object
dtypes: category(2), datetime64[ns](1), int64(3), object(2)
memory usage: 10.3+ MB


In [67]:
# Persist the cleaned subset in the same "split" JSON layout used for the input file
no_retweets.to_json(path_or_buf="no_retweets.json", orient="split")