In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob 
import re 
from textblob.sentiments import NaiveBayesAnalyzer
import time

In [2]:
class TweetAnalyzer():
    """
    Functionality for analyzing and categorizing content from tweets.
    """
    def clean_tweet(self, tweet):
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())
    def analyze_sentiment(self, tweet):
        return TextBlob(self.clean_tweet(tweet), analyzer=NaiveBayesAnalyzer())
    def tweets_to_data_frame(self, tweets):
        df = pd.DataFrame(data=[tweet['full_text'] for tweet in tweets], columns=['full_text'])
        df['id'] = np.array([tweet['id'] for tweet in tweets])
        df['date'] = np.array([tweet['created_at'] for tweet in tweets])
        df['city'] = [tweet['place']['full_name'] for tweet in tweets]
        df['country_code'] = [tweet['place']['country_code'] for tweet in tweets]
        df['country'] = [tweet['place']['country'] for tweet in tweets]
        df['coordinates'] = [tweet['coordinates']['coordinates'] for tweet in tweets]
        return df
ta = TweetAnalyzer()

In [8]:
df = pd.read_csv('data/cleaned_tweets_first_half.csv')
s= time.time()
for i, tweet in enumerate(df['full_text']):
    analysis = ta.analyze_sentiment(tweet).sentiment
    df.loc[i,'classification'] = analysis[0]
    df.loc[i,'p_pos'] = analysis[1]
    df.loc[i,'p_neg'] = analysis[2]
    if i % 100 == 0:
        print(f'{i} of {df.shape[0]}, time elapsed: {(time.time() - s) / 60} minutes')
        df.to_csv('data/analyzed_tweets_first_half.csv',index=False)
df.to_csv('data/analyzed_tweets_first_half.csv',index=False)

0 of 32317, time elapsed: 0.07161455154418946 minutes
100 of 32317, time elapsed: 5.672915315628051 minutes
200 of 32317, time elapsed: 11.260807716846466 minutes
300 of 32317, time elapsed: 24.486242135365803 minutes
400 of 32317, time elapsed: 30.155870501200358 minutes
500 of 32317, time elapsed: 35.697546768188474 minutes
600 of 32317, time elapsed: 41.150390752156575 minutes
700 of 32317, time elapsed: 46.616823669274645 minutes
800 of 32317, time elapsed: 52.083095169067384 minutes
900 of 32317, time elapsed: 57.6159804503123 minutes
1000 of 32317, time elapsed: 63.15908446709315 minutes
1100 of 32317, time elapsed: 68.63733318646749 minutes
1200 of 32317, time elapsed: 74.10430946747462 minutes
1300 of 32317, time elapsed: 79.57755148410797 minutes
1400 of 32317, time elapsed: 85.03971330324809 minutes
1500 of 32317, time elapsed: 90.50809384981791 minutes
1600 of 32317, time elapsed: 95.97722940444946 minutes
1700 of 32317, time elapsed: 101.44578858216603 minutes
1800 of 32317

14800 of 32317, time elapsed: 874.6582193334897 minutes
14900 of 32317, time elapsed: 880.7848096847534 minutes
15000 of 32317, time elapsed: 886.8932587504387 minutes
15100 of 32317, time elapsed: 893.0075357317925 minutes
15200 of 32317, time elapsed: 899.1127063711484 minutes
15300 of 32317, time elapsed: 905.2354321519534 minutes
15400 of 32317, time elapsed: 911.3450409173965 minutes
15500 of 32317, time elapsed: 917.4438275178273 minutes
15600 of 32317, time elapsed: 923.5394740502039 minutes
15700 of 32317, time elapsed: 929.6361216346423 minutes
15800 of 32317, time elapsed: 935.73544596831 minutes
15900 of 32317, time elapsed: 941.8333589355151 minutes
16000 of 32317, time elapsed: 947.9356917182605 minutes
16100 of 32317, time elapsed: 954.0425194501877 minutes
16200 of 32317, time elapsed: 960.135777803262 minutes
16300 of 32317, time elapsed: 966.2289097825686 minutes
16400 of 32317, time elapsed: 972.3264740188916 minutes
16500 of 32317, time elapsed: 978.4427053491274 min

29300 of 32317, time elapsed: 1761.2336700876554 minutes
29400 of 32317, time elapsed: 1767.3213307817778 minutes
29500 of 32317, time elapsed: 1773.4114335338274 minutes
29600 of 32317, time elapsed: 1779.5038827498754 minutes
29700 of 32317, time elapsed: 1785.5956211328507 minutes
29800 of 32317, time elapsed: 1791.699254500866 minutes
29900 of 32317, time elapsed: 1797.6886399507523 minutes
30000 of 32317, time elapsed: 1803.2078956842422 minutes
30100 of 32317, time elapsed: 1808.7142644842465 minutes
30200 of 32317, time elapsed: 1814.2976399501165 minutes
30300 of 32317, time elapsed: 1820.2149342536927 minutes
30400 of 32317, time elapsed: 1826.3389734665552 minutes
30500 of 32317, time elapsed: 1832.5031290690104 minutes
30600 of 32317, time elapsed: 1838.6169268210729 minutes
30700 of 32317, time elapsed: 1844.7459008812905 minutes
30800 of 32317, time elapsed: 1850.8868427832922 minutes
30900 of 32317, time elapsed: 1857.0214422663053 minutes
31000 of 32317, time elapsed: 18

In [9]:
df.shape

(32317, 12)