In [1]:
import pandas as pd
import csv
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt

### Import the cleaned data from MapReduce

In [2]:
# Load the cleaned tweets written by the MapReduce job. The file has no header
# row, so pandas numbers the columns; the first field becomes the index and the
# remaining fields become columns 1 and 2. QUOTE_NONE tells the parser to treat
# quote characters as ordinary text instead of field delimiters.
pwd = os.getcwd()
tweets_path = os.path.join(pwd, "tweet_output", "part-00000.txt")
tweets = pd.read_csv(
    tweets_path,
    header=None,
    index_col=0,
    sep=",",
    quoting=csv.QUOTE_NONE,
)

In [3]:
# Sanity check: show the (rows, columns) shape of the loaded frame.
print(tweets.shape)

(1600000, 2)


In [4]:
# Inspect the last row by position. Negative indexing avoids hard-coding the
# row count, which would raise IndexError as soon as the input size changes.
tweets.iloc[-1]

1                        Thu May 21 23:36:24 PDT 2009 
2     MsTeagan and by the way I rewatched Sun Godde...
Name: 999999, dtype: object

In [5]:
# Preview the first rows of the freshly loaded frame.
tweets.head()

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Mon Apr 06 22:19:45 PDT 2009,switchfoot httptwitpiccom2y1zl Awww thats a ...
1,Mon Apr 06 22:19:49 PDT 2009,is upset that he cant update his Facebook by ...
10,Mon Apr 06 22:20:16 PDT 2009,spring break in plain city its snowing\t
100,Mon Apr 06 22:26:33 PDT 2009,Body Of Missing Northern Calif Girl Found Po...
1000,Mon Apr 06 23:28:50 PDT 2009,JonathanRKnight silver_tulip27 Um that would ...


In [6]:
# Preview the last rows of the freshly loaded frame.
tweets.tail()

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
999995,Thu May 21 23:36:19 PDT 2009,healingsinger thank you i needed that \t
999996,Thu May 21 23:36:20 PDT 2009,vactress httpbitlycADea Maybe this is more y...
999997,Thu May 21 23:36:21 PDT 2009,Brat13 Hell Windows 7 will be out of my price...
999998,Thu May 21 23:36:22 PDT 2009,jigardoshi neah i wish just reminiscing read...
999999,Thu May 21 23:36:24 PDT 2009,MsTeagan and by the way I rewatched Sun Godde...


In [7]:
# Discard the index read from the file and replace it with a fresh 0..n-1
# range; drop=True keeps the old index from being added back as a column.
tweets = tweets.reset_index(drop=True)

In [8]:
# Give the two positional columns (integer labels 1 and 2) descriptive names.
tweets = tweets.rename(columns = {1:'Date', 2:'Tweet'})

In [9]:
tweets.head()

Unnamed: 0,Date,Tweet
0,Mon Apr 06 22:19:45 PDT 2009,switchfoot httptwitpiccom2y1zl Awww thats a ...
1,Mon Apr 06 22:19:49 PDT 2009,is upset that he cant update his Facebook by ...
2,Mon Apr 06 22:20:16 PDT 2009,spring break in plain city its snowing\t
3,Mon Apr 06 22:26:33 PDT 2009,Body Of Missing Northern Calif Girl Found Po...
4,Mon Apr 06 23:28:50 PDT 2009,JonathanRKnight silver_tulip27 Um that would ...


In [10]:
# Parse the timestamp strings (e.g. "Mon Apr 06 22:19:45 PDT 2009") into naive
# datetime64 values that keep the wall-clock time as written.
#
# The "PDT" token is removed explicitly and an explicit format is supplied so
# the parse does not rely on pandas' format inference or on its implicit
# handling of time-zone abbreviations. A row carrying any other zone token no
# longer matches the format and raises instead of being parsed silently.
# NOTE(review): assumes every row uses "PDT", as in the rows previewed above —
# confirm against the full file.
#
# The dtype guard keeps the cell safe to re-run once the column is parsed.
if not pd.api.types.is_datetime64_any_dtype(tweets["Date"]):
    tweets["Date"] = pd.to_datetime(
        tweets["Date"].str.replace(" PDT ", " ", regex=False).str.strip(),
        format="%a %b %d %H:%M:%S %Y",
    )

  tweets.Date = pd.to_datetime(tweets.Date)


In [90]:
#Strip the time as the sentiment will be calculated per day
#tweets.Date = pd.to_datetime(tweets.Date).dt.date

  tweets.Date = pd.to_datetime(tweets.Date).dt.date


In [11]:
# Check the Date column after the conversion above.
tweets.head()

Unnamed: 0,Date,Tweet
0,2009-04-06 22:19:45,switchfoot httptwitpiccom2y1zl Awww thats a ...
1,2009-04-06 22:19:49,is upset that he cant update his Facebook by ...
2,2009-04-06 22:20:16,spring break in plain city its snowing\t
3,2009-04-06 22:26:33,Body Of Missing Northern Calif Girl Found Po...
4,2009-04-06 23:28:50,JonathanRKnight silver_tulip27 Um that would ...


### Sentiment analysis

In [12]:
# Module-level VADER analyzer instance; predict_sentiment and predict_score
# both call its polarity_scores method.
sentiment = SentimentIntensityAnalyzer()

In [13]:
def format_output(output_dict):
    """Turn a score dictionary into a polarity label.

    Parameters
    ----------
    output_dict : mapping
        Must contain a 'compound' entry holding a number.

    Returns
    -------
    str
        "positive" when the compound value is >= 0.05, "negative" when it
        is <= -0.05, and "neutral" for anything in between.
    """
    compound = output_dict['compound']

    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"

def get_score(output_dict):
    """Return the value stored under the 'compound' key of ``output_dict``."""
    compound = output_dict['compound']
    return compound

def predict_sentiment(text):
    """Score ``text`` with the module-level analyzer and return its label.

    The analyzer's polarity_scores result is passed straight to
    format_output, so the return value is whatever label that function
    produces for the text's scores.
    """
    return format_output(sentiment.polarity_scores(text))

def predict_score(text):
    """Score ``text`` with the module-level analyzer and return the number.

    The analyzer's polarity_scores result is passed straight to get_score,
    which extracts its 'compound' entry.
    """
    return get_score(sentiment.polarity_scores(text))

In [94]:
# Label every tweet by applying predict_sentiment element-wise; each call runs
# the analyzer on one tweet's text.
tweets["vader_prediction"] = tweets["Tweet"].apply(predict_sentiment)

In [95]:
# Preview the frame with the newly added vader_prediction column.
tweets.head()

Unnamed: 0,Date,Tweet,vader_prediction
0,2009-04-06,switchfoot httptwitpiccom2y1zl Awww thats a ...,negative
1,2009-04-06,is upset that he cant update his Facebook by ...,negative
2,2009-04-06,spring break in plain city its snowing\t,neutral
3,2009-04-06,Body Of Missing Northern Calif Girl Found Po...,negative
4,2009-04-06,JonathanRKnight silver_tulip27 Um that would ...,negative


In [14]:
# Store the compound score of every tweet by applying predict_score
# element-wise; each call runs the analyzer on one tweet's text.
tweets["vader_score"] = tweets["Tweet"].apply(predict_score)

In [15]:
# Preview the frame with the newly added vader_score column.
tweets.head()

Unnamed: 0,Date,Tweet,vader_score
0,2009-04-06 22:19:45,switchfoot httptwitpiccom2y1zl Awww thats a ...,-0.3818
1,2009-04-06 22:19:49,is upset that he cant update his Facebook by ...,-0.7269
2,2009-04-06 22:20:16,spring break in plain city its snowing\t,0.0
3,2009-04-06 22:26:33,Body Of Missing Northern Calif Girl Found Po...,-0.5267
4,2009-04-06 23:28:50,JonathanRKnight silver_tulip27 Um that would ...,-0.8519


### Time-series

In [121]:
# Mean compound score for each distinct value of the Date column.
# as_index=False keeps Date as a regular column in the result.
tweets_average = (
    tweets
    .groupby(["Date"], as_index=False)["vader_score"]
    .mean()
)

In [122]:
# Display the aggregated frame (bare last expression, shown by the notebook).
tweets_average

Unnamed: 0,Date,vader_score
0,2009-04-06 22:19:45,-0.381800
1,2009-04-06 22:19:49,-0.726900
2,2009-04-06 22:19:53,0.493900
3,2009-04-06 22:19:57,-0.454850
4,2009-04-06 22:20:00,0.000000
...,...,...
774358,2009-06-25 10:28:26,-0.137540
774359,2009-06-25 10:28:27,0.127267
774360,2009-06-25 10:28:28,-0.388700
774361,2009-06-25 10:28:30,0.194400


In [99]:
# Mean compound score per calendar day.
#
# Grouping on the "Date" column itself only yields daily rows if that column
# was previously truncated to dates; with full timestamps it produces one
# group per distinct timestamp. The day is therefore derived here, which makes
# the cell independent of earlier in-place changes to "Date". pd.to_datetime
# also accepts a column that already holds date objects.
tweets_byday = (
    tweets
    .groupby(pd.to_datetime(tweets["Date"]).dt.date)["vader_score"]
    .mean()
    .reset_index()  # turn the day index back into a "Date" column
)

In [100]:
# Display the per-day averages (bare last expression, shown by the notebook).
tweets_byday

Unnamed: 0,Date,vader_score
0,2009-04-06,0.140128
1,2009-04-07,0.156389
2,2009-04-17,0.194937
3,2009-04-18,0.17363
4,2009-04-19,0.177163
5,2009-04-20,0.165969
6,2009-04-21,0.169604
7,2009-05-01,0.169209
8,2009-05-02,0.173652
9,2009-05-03,0.165379


In [None]:
# Line plot of the averaged compound score against the Date column.
fig, ax = plt.subplots(figsize=(20, 12))

ax.plot(tweets_average["Date"], tweets_average["vader_score"])

# Fix the y-range so the curve is read against a constant scale.
ax.set_ylim(-0.5, 0.5)
ax.grid()

ax.set_xlabel("Date", fontsize=18)
ax.set_ylabel("Sentiment score", fontsize=18)

# show must be *called*; the bare name `plt.show` only evaluates to the
# function object, which the notebook then echoes as the cell's output.
plt.show()

<function matplotlib.pyplot.show(close=None, block=None)>

In [27]:
# Number of distinct values in the date column. The column was renamed from
# the integer label 1 to "Date" earlier in the notebook, so tweets[1] raises
# KeyError on a fresh top-to-bottom run. dropna=False counts a missing value
# as its own entry, matching len(Series.unique()).
tweets["Date"].nunique(dropna=False)

774363