### Import necessary libraries

In [1]:
import pandas as pd
import geopy
from geopy.geocoders import Nominatim
import numpy as np
import scipy.stats as st
import math

### Read the sentiment file for the tweets collected from the US

In [2]:
# Load the merged sentiment file. The path is a raw string so the Windows-style
# backslashes stay literal ("\C" is not a valid escape sequence in a normal string).
UStweets = pd.read_csv(r"Data Dump\CleanTweets\FP1_Merged_US_Sentiments.csv")
UStweets.head()

Unnamed: 0,id,created_at,truncated,url,location_US_State,geo_US_State,place_US_State,createDate,clean_txt,TB_polarity,TB_subjectivity,AF_polarity,TB_sentiment
0,1217234682309464064,2020-01-14T23:58:58,False,[],New Hampshire,,,2020-01-14,A PROVEN LIAR Trump said HE D PAY LEGAL FEES ...,0.2,0.2,-5.0,Neutral
1,1217197393948692482,2020-01-14T21:30:47,False,[],Florida,,,2020-01-14,2 things are often true there is enough evide...,0.45,0.716667,5.0,Positive
2,1217197457270104066,2020-01-14T21:31:02,True,[https://t.co/5R55ma3Efz],New Jersey,,,2020-01-14,Whether you are a Democrat or Republican Trump...,0.0,0.0,1.0,Neutral
3,1217197459857866753,2020-01-14T21:31:03,False,[],Virginia,,,2020-01-14,pubs You don t say Trump Adviser Caught on Ta...,0.0,0.0,-2.0,Neutral
4,1217197467441213441,2020-01-14T21:31:05,False,[],Texas,,,2020-01-14,So the shooting in Pensacola WAS a terrorist ...,-0.8,1.0,-3.0,Strongly Negative


### For some of the tweets, the location is at the county level; find the corresponding state

In [3]:
def fncFindStateByCoordinates(geo):
    """Geocode a place description (e.g. a county name) and return its US state.

    Parameters:
        geo: text query passed to the Nominatim geocoder.

    Returns:
        The second-to-last comma-separated component of the geocoded address
        when the last component is "United States"; otherwise NaN.

    NaN (rather than None or an exception) is returned when nothing is found,
    because its string form is "nan", which callers in this notebook already
    treat as a missing value.
    """
    geolocator = Nominatim(user_agent="FP1_Convert_Tweets")
    location = geolocator.geocode(geo)
    if location is None:
        # The geocoder found no match for the query.
        return float("nan")
    addressParts = [part.strip() for part in str(location).split(',')]
    if len(addressParts) >= 2 and addressParts[-1] == "United States":
        return addressParts[-2]
    # Address resolved outside the US: no state can be derived.
    return float("nan")

### The tweets had three location attributes - geo, place and location. Identify which of these is the valid one to use.

In [None]:
# Choose one state per tweet. Priority: geo, then place, then the profile
# location; the generic "United States of America" and missing values ("nan")
# are not accepted as a state.
stateList = []
for geo, place, location in zip(UStweets['geo_US_State'],
                                UStweets['place_US_State'],
                                UStweets['location_US_State']):
    # County-level geo values are resolved to their state through the geocoder.
    if str(geo).endswith("County"):
        geo = fncFindStateByCoordinates(str(geo))
    if str(geo) != "nan":
        state = str(geo)
    elif str(place) not in ("nan", "United States of America"):
        state = str(place)
    elif str(location) != "United States of America":
        state = str(location)
    else:
        state = ""
    # A missing profile location falls through the last branch as "nan".
    if state == "nan":
        state = ""
    stateList.append(state)
# Assign the list directly so values are placed by position, whatever the index is.
UStweets['State'] = stateList

### Read the Election Results to identify the states. 

In [None]:
# The results file has no header row, so column names and dtypes are given here.
# The path is a raw string because "\F" is not a valid escape sequence.
Results = pd.read_csv(
    r"Data Dump\FP1_2020_USElections_StateResults_1126.txt",
    header=None,
    names=['State', 'Tot_Population', 'FirstName', 'LastName', 'Party', 'VoteCount'],
    dtype={'State': str, 'Tot_Population': int, 'FirstName': str,
           'LastName': str, 'Party': str, 'VoteCount': int},
)
Results.head()

In [None]:
# Keep only the result rows whose candidate last name is Trump
TrumpVotes = Results.loc[Results['LastName'].eq('Trump')]

### From the USTweets, filter the rows that correspond to the states in the election results
##### Some tweets are from US territories that did not participate in the election. Others are from the US but could not be mapped to a specific state.

In [None]:
# Keep only tweets whose State appears in the election results
USStateTweets = UStweets.loc[UStweets['State'].isin(TrumpVotes['State'].tolist())]

In [None]:
# Number of tweets mapped to a state present in the election results
USStateTweets.shape[0]

### For calculating the numbers at the country level, create new dataframe with all results.

In [None]:
# Copy first: a plain assignment only aliases UStweets, so overwriting 'State'
# below would also replace the per-tweet states stored in UStweets itself.
CountryTweets = UStweets.copy()
CountryTweets['State'] = 'United States'

In [None]:
# Number of tweets in the country-level frame
CountryTweets.shape[0]

### Merge the 2 dataframes - one at the State level and the other at Country level, to form the final dataframe

In [None]:
# Stack the state-level rows on top of the country-level rows.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
finalTweets = pd.concat([USStateTweets, CountryTweets])

In [None]:
# Number of rows in the combined state + country frame
finalTweets.shape[0]

### Create new dataframe for Statistical Model and remove all unwanted columns.
##### The 2020 US presidential election was held on 3 November 2020

In [None]:
def fncFindLimits(tweetDF, preElectionInd, sentimentInd, confLevel=None):
    """Summarise tweets per (State, sentiment) with a confidence interval on each share.

    Parameters:
        tweetDF: DataFrame with columns 'State', 'preElection', 'id',
            'TB_polarity', 'AF_polarity', 'TB_subjectivity' and the column
            named by ``sentimentInd``.
        preElectionInd: value of 'preElection' selecting the rows to summarise.
        sentimentInd: name of the sentiment-label column to group by.
        confLevel: two-sided confidence level (e.g. 0.95). When omitted, the
            notebook-level variable ``confInt`` is used.

    Returns:
        DataFrame with one row per (State, sentiment) and the columns State,
        <sentimentInd>, TB_polarity, AF_polarity, TB_subjectivity (group means),
        Count, State_SampleSize, Sample_sentiment_Pct, Sample_Error, LCL, UCL.
    """
    if confLevel is None:
        confLevel = confInt

    periodTweets = tweetDF[tweetDF['preElection'] == preElectionInd]

    # Mean scores and number of tweets per (State, sentiment), flattened to
    # ordinary columns. The size of the 'id' column becomes 'Count'.
    tweetStats = (
        periodTweets.groupby(['State', sentimentInd])
        .agg({'TB_polarity': 'mean', 'AF_polarity': 'mean',
              'TB_subjectivity': 'mean', 'id': 'size'})
        .sort_values(['State', sentimentInd])
        .reset_index()
        .rename(columns={'id': 'Count'})
    )

    # Total sample size per state (number of non-null ids)
    stateSampleSize = periodTweets.groupby('State')['id'].count()
    tweetStats['State_SampleSize'] = tweetStats['State'].map(stateSampleSize)

    # Proportion of each sentiment within its state
    samplePct = tweetStats['Count'] / tweetStats['State_SampleSize']
    tweetStats['Sample_sentiment_Pct'] = samplePct

    # Normal-approximation margin of error for a proportion. A two-sided
    # interval at level c uses the (1 + c) / 2 quantile: 1.96 for c = 0.95.
    # ppf(c) itself is the one-sided quantile and would give a narrower,
    # 90% interval.
    zScore = st.norm.ppf(1 - (1 - confLevel) / 2)
    tweetStats['Sample_Error'] = zScore * np.sqrt(
        samplePct * (1 - samplePct) / tweetStats['State_SampleSize']
    )

    # Lower and upper confidence limits for the population proportion
    tweetStats['LCL'] = samplePct - tweetStats['Sample_Error']
    tweetStats['UCL'] = samplePct + tweetStats['Sample_Error']
    return tweetStats

In [None]:
# Confidence level read by fncFindLimits
confInt = 0.95
# Retain only the required columns
forStats = finalTweets[['id','createDate','State','TB_polarity','TB_subjectivity','AF_polarity','TB_sentiment']]
# We are interested in only the data from Sep 01, 2020 onwards. createDate is
# a YYYY-MM-DD string, so string comparison orders dates correctly. The bound
# is '>= 2020-09-01' because '> 2020-08-30' would also keep Aug 31.
# .copy() makes an independent frame so the column added below does not
# trigger SettingWithCopyWarning.
forStats = forStats[forStats['createDate'] >= '2020-09-01'].copy()
# Add a column to identify whether the data is pre or post election
# (election day itself, 2020-11-03, counts as pre-election)
forStats['preElection'] = np.where(forStats['createDate'] <= '2020-11-03', 'Y', 'N')

preElectionStats = fncFindLimits(forStats, "Y", 'TB_sentiment')
postElectionStats = fncFindLimits(forStats, "N", 'TB_sentiment')

In [None]:
# Build the pre-election ("Y") and post-election ("N") summaries, then preview the first
preElectionStats, postElectionStats = (
    fncFindLimits(forStats, periodFlag, 'TB_sentiment') for periodFlag in ("Y", "N")
)
preElectionStats.head(5)

In [None]:
# Preview the first five rows of the post-election summary
postElectionStats.head(5)

In [None]:
# First five AF_polarity values in ascending order
UStweets['AF_polarity'].sort_values().iloc[:5]

In [None]:
# Last five AF_polarity values in ascending order
UStweets['AF_polarity'].sort_values().iloc[-5:]

In [None]:
# Write both summaries without the row index. Paths are raw strings because
# "\F" is not a valid escape sequence; index=False is the documented boolean
# form of the falsy index=None used before.
preElectionStats.to_csv(r"Data Dump\FP1_2020_Tweet_StatsModel_PreElection-1.csv", index=False)
postElectionStats.to_csv(r"Data Dump\FP1_2020_Tweet_StatsModel_PostElection-1.csv", index=False)

In [None]:
# Summary statistics for the tweets labelled Neutral
UStweets.loc[UStweets['TB_sentiment'].eq('Neutral')].describe()

In [None]:
def fncFindSentiment(x):
    """Map a polarity score to a sentiment label.

    Upper bounds are inclusive: <= -0.60 "Strongly Negative", <= -0.10
    "Negative", <= 0.10 "Neutral", <= 0.60 "Positive", above that
    "Strongly Positive".

    Parameters:
        x: polarity score; anything accepted by float().

    Returns:
        The label string, or NaN when the score is NaN. A NaN score fails
        every comparison and would otherwise be labelled "Strongly Positive".
    """
    polarity = float(x)
    if math.isnan(polarity):
        return float("nan")
    if polarity <= -0.60:
        return "Strongly Negative"
    if polarity <= -0.10:
        return "Negative"
    if polarity <= 0.10:
        return "Neutral"
    if polarity <= 0.60:
        return "Positive"
    return "Strongly Positive"
# Recompute the sentiment label of every row from its TB_polarity score
forStats['TB_sentiment'] = forStats['TB_polarity'].map(fncFindSentiment)

In [None]:
# Mean TB and AF polarity per state over the pre-election rows.
# String aggregation names are used because passing np.mean to groupby.agg
# is deprecated in recent pandas.
preElec = forStats[forStats['preElection'] == 'Y']
temp = preElec.groupby(['State']).agg({'TB_polarity': 'mean', 'AF_polarity': 'mean'})
temp