In [1]:
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [2]:
#Read tweets from csv file
df=pd.read_csv("../Raw Data/Trump tweets 3 day groups jan-dec 2017.csv")
df.head()

Unnamed: 0,text,created_at,Unique Identifier
0,It all begins today! I will see you at 11:00 A...,1/20/2017,1
1,Today we are not merely transferring power fro...,1/20/2017,1
2,power from Washington D.C. and giving it back ...,1/20/2017,1
3,What truly matters is not which party controls...,1/20/2017,1
4,January 20th 2017 will be remembered as the da...,1/20/2017,1


In [3]:
#Changing the date format
for index, row in df.iterrows():
    #row['startdate']=row['startdate'].date()
    d= datetime.strptime(row['created_at'], '%m/%d/%Y')
    df.loc[index, 'created_at'] = d.strftime('%Y-%m-%d')
df.head()

Unnamed: 0,text,created_at,Unique Identifier
0,It all begins today! I will see you at 11:00 A...,2017-01-20,1
1,Today we are not merely transferring power fro...,2017-01-20,1
2,power from Washington D.C. and giving it back ...,2017-01-20,1
3,What truly matters is not which party controls...,2017-01-20,1
4,January 20th 2017 will be remembered as the da...,2017-01-20,1


In [4]:
#first tweet variable
#true for the first time then set to false
firstTweet = True

#list to store compound sentiment for each tweet
#once unique identifier is different averaged and emptied
compound_list =[]

#Array to store average compound sentiments
sentiments =[]
count =0

#Iterate over all the tweet data stored in dataframe
for index, row in df.iterrows():
    
    #Run Vader Analysis on each tweet
    sentence =row["text"]
    
    #Getting average compund sentiment for three days
    #Three days tweets have same unique identifier
    new_id =row["Unique Identifier"]
    
    #If not first tweet 
        ##Check if the old and new unique identifiers are different
            ####If different save the average compound sentiment
            ####Empty the compound list
            ####Add the average compound sentiment to an array
            ####Set the counter to 0
            ####Set the old_date  value
    #Else 
    ##Set firstTweet to false
    ##Set the ol_date for first set of sentiments 
    if(firstTweet == False):               
        if(old_id != new_id):
            compound= np.mean(compound_list)
            compound_list =[]
            sentiments.append({"Compound":compound,
                "Date" : old_date,
                "Count" : count})
            count =0
            old_date = row["created_at"]
    else:
        firstTweet = False
        old_date = row["created_at"]
    
    #Add all the tweet to compound list
    #Increment the counter
    #Change the old_id and old_values     
    compound_list.append(analyzer.polarity_scores(sentence)["compound"])
    count +=1
    old_id = new_id
        
#Add the final set of average compund sentiment
sentiments.append({"Compound":compound,
                "Date" : old_date,
                "Count" : count})

In [5]:
#Put all data into dataframe sentiments_df
sentiments_df = pd.DataFrame.from_dict(sentiments)
sentiments_df =sentiments_df[['Date','Compound','Count']]
sentiments_df.tail()

Unnamed: 0,Date,Compound,Count
111,2017-12-19,0.587037,16
112,2017-12-22,0.302471,21
113,2017-12-25,0.60749,10
114,2017-12-28,0.180156,25
115,2017-12-31,0.180156,1


In [6]:
#Read tweets approval ratings from csv file
df1=pd.read_csv("../Raw Data/approval data clean values only.csv")
df1.head()

Unnamed: 0,Start Date,End Date,Average of adjusted_approve,Average of adjusted_disapprove
0,1/20/2017,1/22/2017,45.90613,43.14093
1,1/23/2017,1/25/2017,43.71903,40.23624
2,1/26/2017,1/28/2017,42.90613,49.14093
3,1/29/2017,1/31/2017,43.90613,50.14093
4,2/1/2017,2/3/2017,44.90613,48.14093


In [7]:
#Changing the date format
#Keeping only one date and renaming it to 'Date'
#Filtering the data to keep values only from 2017-01-20 to 2017-12-31
for index, row in df1.iterrows():
    d= datetime.strptime(row['Start Date'], '%m/%d/%Y')
    df1.loc[index, 'Start Date'] = d.strftime('%Y-%m-%d')
df1.rename( columns={"Start Date": "Date"}, inplace=True)
df1=df1.loc[(df1['Date'] > '2017-01-19') & (df1['Date'] < '2018-01-01')]
approval_df=df1[['Date','Average of adjusted_approve','Average of adjusted_disapprove']]
approval_df.tail()

Unnamed: 0,Date,Average of adjusted_approve,Average of adjusted_disapprove
104,2017-12-13,37.614145,57.895265
105,2017-12-16,36.90613,57.14093
106,2017-12-19,36.90613,56.14093
107,2017-12-28,40.90613,53.14093
108,2017-12-31,39.594297,54.633113


In [8]:
#Performing merge based on the 'Date' column
#Removing any rows having NaN values or no values
correlate_df =sentiments_df.merge(approval_df,on='Date',how="outer")
correlate_df =correlate_df[['Date','Compound','Average of adjusted_approve','Average of adjusted_disapprove']]
correlate_df.rename( columns={"Average of adjusted_approve": "Approve","Average of adjusted_disapprove": "Disapprove"}, inplace=True)
correlate_df = correlate_df.dropna(axis=0, how='any')
correlate_df.tail()

Unnamed: 0,Date,Compound,Approve,Disapprove
109,2017-12-13,0.541981,37.614145,57.895265
110,2017-12-16,0.342439,36.90613,57.14093
111,2017-12-19,0.587037,36.90613,56.14093
114,2017-12-28,0.180156,40.90613,53.14093
115,2017-12-31,0.180156,39.594297,54.633113


In [9]:
len(correlate_df)

109

In [11]:
correlate_df.to_csv("../Clean Data/TweetVsApproval.csv")