In [1]:
import matplotlib.pyplot as plt
import re
import pandas as pd
import numpy as np
from scipy.signal import find_peaks
from typing import List
import string
from textblob import TextBlob

In [21]:
df = pd.read_csv('../data/tweets.csv')

df.head()

Unnamed: 0,author,content,country,date_time,id,language,latitude,longitude,number_of_likes,number_of_shares
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...,,12/01/2017 19:52,8.19633e+17,en,,,7900,3472
1,katyperry,@barackobama Thank you for your incredible gra...,,11/01/2017 08:38,8.19101e+17,en,,,3689,1380
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,,11/01/2017 02:52,8.19014e+17,en,,,10341,2387
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,,11/01/2017 02:44,8.19012e+17,en,,,10774,2458
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,,10/01/2017 05:22,8.18689e+17,en,,,17620,4655


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52542 entries, 0 to 52541
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   author            52542 non-null  object 
 1   content           52542 non-null  object 
 2   country           36 non-null     object 
 3   date_time         52542 non-null  object 
 4   id                52542 non-null  float64
 5   language          52542 non-null  object 
 6   latitude          1 non-null      float64
 7   longitude         1 non-null      float64
 8   number_of_likes   52542 non-null  int64  
 9   number_of_shares  52542 non-null  int64  
dtypes: float64(3), int64(2), object(5)
memory usage: 4.0+ MB


In [23]:
df.drop(columns=df.columns.difference(['content', 'date_time']), inplace=True)

In [24]:
df.head()

Unnamed: 0,content,date_time
0,Is history repeating itself...?#DONTNORMALIZEH...,12/01/2017 19:52
1,@barackobama Thank you for your incredible gra...,11/01/2017 08:38
2,Life goals. https://t.co/XIn1qKMKQl,11/01/2017 02:52
3,Me right now 🙏🏻 https://t.co/gW55C1wrwd,11/01/2017 02:44
4,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,10/01/2017 05:22


In [25]:
df['date_time'] = pd.to_datetime(df.date_time)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52542 entries, 0 to 52541
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   content    52542 non-null  object        
 1   date_time  52542 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 821.1+ KB


In [27]:
def return_my_hashtag(series):
    content, date_time = series
    regex = r'#\w+'
    results = re.findall(regex, content)
    if len(results) > 0:
        return results
    return np.nan

In [28]:
df['hashtag'] = df.apply(return_my_hashtag, axis=1)

In [29]:
df

Unnamed: 0,content,date_time,hashtag
0,Is history repeating itself...?#DONTNORMALIZEH...,2017-12-01 19:52:00,[#DONTNORMALIZEHATE]
1,@barackobama Thank you for your incredible gra...,2017-11-01 08:38:00,
2,Life goals. https://t.co/XIn1qKMKQl,2017-11-01 02:52:00,
3,Me right now 🙏🏻 https://t.co/gW55C1wrwd,2017-11-01 02:44:00,
4,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,2017-10-01 05:22:00,
...,...,...,...
52537,Life couldn't be better right now. 😊,2015-06-01 23:10:00,
52538,First Monday back in action. I'd say 21.6 mile...,2015-06-01 02:17:00,
52539,"Crime shows, buddy, snuggles = the perfect Sun...",2015-05-01 03:42:00,
52540,❄️ http://t.co/sHCFdPpGPa,2015-05-01 00:06:00,


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52542 entries, 0 to 52541
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   content    52542 non-null  object        
 1   date_time  52542 non-null  datetime64[ns]
 2   hashtag    20561 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 1.2+ MB


In [31]:
df.dropna(axis=0, inplace=True)

In [32]:
df = df.explode('hashtag')

In [34]:
df

Unnamed: 0,content,date_time,hashtag
0,Is history repeating itself...?#DONTNORMALIZEH...,2017-12-01 19:52:00,#DONTNORMALIZEHATE
5,happy 96th gma #fourmoreyears! 🎈 @ LACMA Los A...,2017-09-01 01:00:00,#fourmoreyears
28,I dare you to find a better live vocal perform...,2016-01-12 07:08:00,#jenniferholliday
29,This #Thanksgiving #IStandWithStandingRock tex...,2016-11-24 22:31:00,#Thanksgiving
29,This #Thanksgiving #IStandWithStandingRock tex...,2016-11-24 22:31:00,#IStandWithStandingRock
...,...,...,...
52516,"""These are real pajamas."" ""Huh??"" ""You asked m...",2015-12-01 08:17:00,#sleeptalking
52518,I guess you could say Buddy was pretty tired a...,2015-12-01 07:43:00,#whitegirlwasted
52526,"Well, I didn't end up getting any brownie batt...",2015-11-01 06:35:00,#20FITteen
52530,I’m going to #color2015 with health and love 💪...,2015-10-01 01:19:00,#color2015


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25818 entries, 0 to 52531
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   content    25818 non-null  object        
 1   date_time  25818 non-null  datetime64[ns]
 2   hashtag    25818 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 806.8+ KB


In [35]:
def process_tweets_and_get_sentiment(tweet):
    # remove all the RT
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hash signs
    tweet = re.sub(r'#', '', tweet)
    # remove mentions
    tweet = re.sub(r'@[A-Za-z0-9]+', '', tweet)
    # remove links
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove punctuation
    tweet = re.sub(r'['+string.punctuation+']+', ' ', tweet)

    # finally get the sentiment score
    blob = TextBlob(tweet)
    return blob.sentiment.polarity

In [36]:
df['sentiment'] = df.content.apply(process_tweets_and_get_sentiment)

In [37]:
df

Unnamed: 0,content,date_time,hashtag,sentiment
0,Is history repeating itself...?#DONTNORMALIZEH...,2017-12-01 19:52:00,#DONTNORMALIZEHATE,0.000000
5,happy 96th gma #fourmoreyears! 🎈 @ LACMA Los A...,2017-09-01 01:00:00,#fourmoreyears,0.800000
28,I dare you to find a better live vocal perform...,2016-01-12 07:08:00,#jenniferholliday,0.318182
29,This #Thanksgiving #IStandWithStandingRock tex...,2016-11-24 22:31:00,#Thanksgiving,0.000000
29,This #Thanksgiving #IStandWithStandingRock tex...,2016-11-24 22:31:00,#IStandWithStandingRock,0.000000
...,...,...,...,...
52516,"""These are real pajamas."" ""Huh??"" ""You asked m...",2015-12-01 08:17:00,#sleeptalking,0.200000
52518,I guess you could say Buddy was pretty tired a...,2015-12-01 07:43:00,#whitegirlwasted,-0.183333
52526,"Well, I didn't end up getting any brownie batt...",2015-11-01 06:35:00,#20FITteen,0.000000
52530,I’m going to #color2015 with health and love 💪...,2015-10-01 01:19:00,#color2015,0.500000
