# Tweets to dataframe

In [50]:
import pandas as pd
import numpy as np
from textblob import TextBlob 
import re 
from textblob.sentiments import NaiveBayesAnalyzer
import time

In [51]:
class TweetAnalyzer():
    """
    Functionality for analyzing and categorizing content from tweets.
    """

    def clean_tweet(self, tweet):
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

    def analyze_sentiment(self, tweet):
        return TextBlob(self.clean_tweet(tweet), analyzer=NaiveBayesAnalyzer())
        
    def tweets_to_data_frame(self, tweets):
        df = pd.DataFrame(data=[tweet['full_text'] for tweet in tweets], columns=['full_text'])
        df['id'] = np.array([tweet['id'] for tweet in tweets])
        df['date'] = np.array([tweet['created_at'] for tweet in tweets])
        df['city'] = [tweet['place']['full_name'] for tweet in tweets]
        df['country_code'] = [tweet['place']['country_code'] for tweet in tweets]
        df['country'] = [tweet['place']['country'] for tweet in tweets]
        df['coordinates'] = [tweet['coordinates']['coordinates'] for tweet in tweets]

        return df

In [52]:
all_tweets = []
with open('./data/tweet_ids/hydrated/tweets.jsonl', 'r') as json_file:
    json_list = list(json_file)

for json_str in json_list:
    try:
      result = json.loads(json_str)
      all_tweets.append(result)
    except:
        pass


In [55]:
all_tweets = [tweet for tweet in all_tweets if type(tweet['place']) == dict]


In [57]:
ta = TweetAnalyzer()
df = ta.tweets_to_data_frame(all_tweets)

In [59]:
df = df[df['country_code'] == 'US']

In [62]:
df['state'] = [city[-2:] for city in df['city']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [63]:
valid_states = ['OH', 'CA', 'MA', 'FL', 'IL', 'MD', 'NC', 'NY', 'AZ',
       'LA', 'TX', 'UT', 'GA', 'NV', 'MI', 'NJ', 'IN', 'ME', 'KS', 'VA',
       'MN', 'TN', 'PA', 'SC', 'WI', 'NM', 'OR', 'MO', 'WA', 'DC',
       'AL', 'CT', 'ID', 'KY', 'MS', 'CO', 'OK', 'HI', 'AR', 'VT', 'RI',
       'NH', 'MT', 'DE', 'NE',  'SD', 'IA', 'ND', 'WV',  'AK',
        'WY']
len(valid_states)

51

In [64]:
df = df[df['state'].isin(valid_states)]

In [10]:
df['full_text'][0]

'#statewaterheaters #getitin #corona #keepingpeoplesafe #plumber #pickeringtonohio new water heater swap out @ Pickerington, Ohio https://t.co/fMcPtH88m4'

In [11]:
ta.analyze_sentiment(df['full_text'][1]).sentiment

Sentiment(classification='neg', p_pos=0.29355729258623176, p_neg=0.7064427074137692)

In [12]:
a,b,c = ta.analyze_sentiment(df['full_text'][1]).sentiment

In [13]:
print(a,b,c)

neg 0.29355729258623176 0.7064427074137692


In [14]:
df.shape[0]/2

32297.5

In [15]:
df = df.reset_index()

In [16]:
df.loc[0:32316,:].to_csv('./data/tweets/cleaned_tweets_first_half.csv',index=False)
df.loc[32317:,:].to_csv('./data/tweets/cleaned_tweets_second_half.csv',index=False)

In [69]:
df1 = pd.read_csv('./data/tweets/cleaned_tweets_first_half.csv')
df2 = pd.read_csv('./data/tweets/cleaned_tweets_second_half.csv')


In [17]:
df = pd.read_csv('./data/tweets/cleaned_tweets_second_half.csv')

s= time.time()
for i, tweet in enumerate(df['full_text']):
    analysis = ta.analyze_sentiment(tweet).sentiment
    df.loc[i,'classification'] = analysis[0]
    df.loc[i,'p_pos'] = analysis[1]
    df.loc[i,'p_neg'] = analysis[2]
    if i % 100 == 0:
        print(f'{i} of {df.shape[0]}, time elapsed: {(time.time() - s) / 60} minutes')
        df.to_csv('./data/tweets/analyzed_tweets_second_half.csv',index=False)
        
df.to_csv('./data/tweets/analyzed_tweets_second_half.csv',index=False)

0 of 32278, time elapsed: 0.06798480351765951 minutes
100 of 32278, time elapsed: 7.538872714837392 minutes
200 of 32278, time elapsed: 15.742492512861888 minutes
300 of 32278, time elapsed: 27.578842163085938 minutes
400 of 32278, time elapsed: 38.30125726064046 minutes
500 of 32278, time elapsed: 50.40722532669703 minutes
600 of 32278, time elapsed: 63.411704977353416 minutes
700 of 32278, time elapsed: 76.50852807362874 minutes
800 of 32278, time elapsed: 91.61251735289892 minutes
900 of 32278, time elapsed: 104.46022142569224 minutes
1000 of 32278, time elapsed: 111.25717825094858 minutes
1100 of 32278, time elapsed: 118.08280838330587 minutes
1200 of 32278, time elapsed: 124.8664674282074 minutes
1300 of 32278, time elapsed: 131.6899699131648 minutes
1400 of 32278, time elapsed: 138.5281332095464 minutes
1500 of 32278, time elapsed: 145.31273971001306 minutes
1600 of 32278, time elapsed: 152.13685722351073 minutes
1700 of 32278, time elapsed: 158.9136512994766 minutes
1800 of 3227

14800 of 32278, time elapsed: 1081.2313802599906 minutes
14900 of 32278, time elapsed: 1087.6046767870585 minutes
15000 of 32278, time elapsed: 1094.0213617563247 minutes
15100 of 32278, time elapsed: 1100.3978122591973 minutes
15200 of 32278, time elapsed: 1106.8155329187712 minutes
15300 of 32278, time elapsed: 1113.1895693937938 minutes
15400 of 32278, time elapsed: 1119.610774966081 minutes
15500 of 32278, time elapsed: 1126.0321133891741 minutes
15600 of 32278, time elapsed: 1132.4153486172358 minutes
15700 of 32278, time elapsed: 1138.8348956346513 minutes
15800 of 32278, time elapsed: 1146.326697476705 minutes
15900 of 32278, time elapsed: 1154.5686872045198 minutes
16000 of 32278, time elapsed: 1162.4211968183517 minutes
16100 of 32278, time elapsed: 1170.179374229908 minutes
16200 of 32278, time elapsed: 1178.1116020997365 minutes
16300 of 32278, time elapsed: 1185.9337579727173 minutes
16400 of 32278, time elapsed: 1193.845204750697 minutes
16500 of 32278, time elapsed: 1201.

29300 of 32278, time elapsed: 2120.9610553145408 minutes
29400 of 32278, time elapsed: 2127.764416106542 minutes
29500 of 32278, time elapsed: 2134.5054225722947 minutes
29600 of 32278, time elapsed: 2141.251318466663 minutes
29700 of 32278, time elapsed: 2147.9617732803026 minutes
29800 of 32278, time elapsed: 2154.7009014089904 minutes
29900 of 32278, time elapsed: 2161.411905090014 minutes
30000 of 32278, time elapsed: 2168.2205836375556 minutes
30100 of 32278, time elapsed: 2175.0515328645706 minutes
30200 of 32278, time elapsed: 2181.8325490077336 minutes
30300 of 32278, time elapsed: 2188.6327471256254 minutes
30400 of 32278, time elapsed: 2195.394830417633 minutes
30500 of 32278, time elapsed: 2202.2001981218655 minutes
30600 of 32278, time elapsed: 2209.0114501317344 minutes
30700 of 32278, time elapsed: 2215.759770317872 minutes
30800 of 32278, time elapsed: 2222.5008282780645 minutes
30900 of 32278, time elapsed: 2229.2021904349326 minutes
31000 of 32278, time elapsed: 2236.0

In [18]:
df.head()

Unnamed: 0,index,full_text,id,date,city,country_code,country,coordinates,state,classification,p_pos,p_neg
0,79321,While admitted to St. Francis Hospital between...,1268957415929757697,Fri Jun 05 17:26:38 +0000 2020,"Chicago, IL",US,United States,"[-87.67, 42.01]",IL,pos,0.840656,0.159344
1,79325,GOOD MORNING 😀 THANK YOU LORD FOR WAKING ME UP...,1268958726611927048,Fri Jun 05 17:31:51 +0000 2020,"Whittier, CA",US,United States,"[-118.03407, 33.97542]",CA,pos,0.988976,0.011024
2,79326,🗣🗣HAPPY BELATED BDAY TO Terra Renee Matthews!!...,1268959375248568335,Fri Jun 05 17:34:25 +0000 2020,"Berkeley, MO",US,United States,"[-90.33590071, 38.74932691]",MO,pos,0.60301,0.39699
3,79331,"Just posted a video @ Corona, California https...",1268960496746729472,Fri Jun 05 17:38:53 +0000 2020,"Corona, CA",US,United States,"[-117.566, 33.8753]",CA,pos,0.731935,0.268065
4,79335,News roundup: more covid tracing apps released...,1268962295587016704,Fri Jun 05 17:46:02 +0000 2020,"Chicago, IL",US,United States,"[-87.60509849, 41.89186335]",IL,pos,0.800698,0.199302


In [85]:
df = pd.read_csv('./data/tweets/analyzed_tweets_first_half.csv')
df_2 = pd.read_csv('./data/tweets/analyzed_tweets_second_half.csv')

In [87]:
df.drop(columns='index').head()

Unnamed: 0,full_text,id,date,city,country_code,country,coordinates,state,classification,p_pos,p_neg
0,#statewaterheaters #getitin #corona #keepingpe...,1.240728e+18,Thu Mar 19 19:53:17 +0000 2020,"Pickerington, OH",US,United States,"[-82.759, 39.8841]",OH,pos,0.636339,0.363661
1,"""ain't no humans outside! (corona!)"" 😂😂😂🤣 @ Cl...",1.240728e+18,Thu Mar 19 19:53:46 +0000 2020,"Cleveland, OH",US,United States,"[-81.6937, 41.4996]",OH,neg,0.293557,0.706443
2,"current weather in Corona: moderate rain, 56°F...",1.240729e+18,Thu Mar 19 19:56:46 +0000 2020,"Corona, CA",US,United States,"[-117.58, 33.88]",CA,pos,0.881447,0.118553
3,Quarantine 🏓 training ! Adding some kicks 💥. \...,1.240729e+18,Thu Mar 19 19:58:28 +0000 2020,"Los Angeles, CA",US,United States,"[-118.243, 34.0522]",CA,pos,0.818247,0.181753
4,#quarantine #stockpiling #food #sexylatino #la...,1.240731e+18,Thu Mar 19 20:05:01 +0000 2020,"Los Angeles, CA",US,United States,"[-118.355, 34.1509]",CA,pos,0.696982,0.303018


In [75]:
df_3 = pd.concat([df,df_2])

In [88]:
df_3.drop(columns='index').to_csv('./data/tweets/analyzed_tweets.csv', index=False)

In [91]:
df = pd.read_csv('./data/tweets/analyzed_tweets.csv')

In [92]:
df.head()

Unnamed: 0,full_text,id,date,city,country_code,country,coordinates,state,classification,p_pos,p_neg
0,#statewaterheaters #getitin #corona #keepingpe...,1.240728e+18,Thu Mar 19 19:53:17 +0000 2020,"Pickerington, OH",US,United States,"[-82.759, 39.8841]",OH,pos,0.636339,0.363661
1,"""ain't no humans outside! (corona!)"" 😂😂😂🤣 @ Cl...",1.240728e+18,Thu Mar 19 19:53:46 +0000 2020,"Cleveland, OH",US,United States,"[-81.6937, 41.4996]",OH,neg,0.293557,0.706443
2,"current weather in Corona: moderate rain, 56°F...",1.240729e+18,Thu Mar 19 19:56:46 +0000 2020,"Corona, CA",US,United States,"[-117.58, 33.88]",CA,pos,0.881447,0.118553
3,Quarantine 🏓 training ! Adding some kicks 💥. \...,1.240729e+18,Thu Mar 19 19:58:28 +0000 2020,"Los Angeles, CA",US,United States,"[-118.243, 34.0522]",CA,pos,0.818247,0.181753
4,#quarantine #stockpiling #food #sexylatino #la...,1.240731e+18,Thu Mar 19 20:05:01 +0000 2020,"Los Angeles, CA",US,United States,"[-118.355, 34.1509]",CA,pos,0.696982,0.303018


In [95]:
df.to_csv('./data/tweets/analyzed_tweets_for_tableau.csv', index=False, sep = '\n', quotechar = '\t')