### Imports

In [1]:
import nltk
from nltk.corpus import stopwords

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [5]:
import statistics as stat

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

### Functions

In [7]:
# Function to get mean, median, min, max, and standard deviation of input
def get_stats(info):
    """Summarise a sequence of numbers.

    Returns a 5-tuple ``(mean, median, minimum, maximum, standard deviation)``.
    Mean, median and the sample standard deviation come from the
    ``statistics`` module (imported as ``stat``); minimum and maximum come
    from the built-ins. ``stat.stdev`` needs at least two values, so shorter
    input raises ``statistics.StatisticsError``.
    """
    return (
        stat.mean(info),
        stat.median(info),
        min(info),
        max(info),
        stat.stdev(info),
    )

In [8]:
# Function to get sentiment polarities
def get_sentiment(tweets):
    """Score every tweet with the module-level VADER ``analyzer``.

    Returns three parallel lists ``(neg, pos, neu)`` that hold, in input
    order, the 'neg', 'pos' and 'neu' entries of each tweet's polarity scores.
    """
    # Run the sentiment intensity analysis once per tweet
    scores = [analyzer.polarity_scores(tweet) for tweet in tweets]

    # Split the per-tweet score dicts into one list per polarity
    neg = [score['neg'] for score in scores] # negative
    pos = [score['pos'] for score in scores] # positive
    neu = [score['neu'] for score in scores] # neutral

    return neg, pos, neu

In [9]:
''' Function to take in tweets from each date, get
the sentiment scores, and get summary statistics 
from those sentiment scores.'''

def namedate(namedate):
    """Return the sentiment summary statistics for one batch of tweets.

    The result is a numpy array with one row per polarity, in the order
    negative, positive, neutral, and one column per value returned by
    get_stats (mean, median, min, max, standard deviation).
    """
    # One list of scores per polarity, in (neg, pos, neu) order
    polarity_scores = get_sentiment(namedate)
    # Summary stats for each polarity, stacked into a numpy array
    return np.array([get_stats(scores) for scores in polarity_scores])

I gathered data using [GetOldTweets3](https://pypi.org/project/GetOldTweets3/).

### Read in data

In [10]:
# Tweet text for Biden, one CSV (and one Series) per poll date
(biden0808, biden0815, biden0827,
 biden0907, biden0911, biden0912,
 biden0917, biden0921, biden0924) = (
    pd.read_csv(csv_path)['text']
    for csv_path in (
        'csv/biden0808.csv', 'csv/biden0815.csv', 'csv/biden0827.csv',
        'csv/biden0907.csv', 'csv/biden0911.csv', 'csv/biden0912.csv',
        'csv/biden0917.csv', 'csv/biden0921.csv', 'csv/biden0924.csv',
    )
)

In [11]:
# Biden tweet Series in chronological order
bidens = [
    biden0808, biden0815, biden0827,
    biden0907, biden0911, biden0912,
    biden0917, biden0921, biden0924,
]

In [12]:
# Tweet text for Warren, one CSV (and one Series) per poll date
(warren0808, warren0815, warren0827,
 warren0907, warren0911, warren0912,
 warren0917, warren0921, warren0924) = (
    pd.read_csv(csv_path)['text']
    for csv_path in (
        'csv/warren0808.csv', 'csv/warren0815.csv', 'csv/warren0827.csv',
        'csv/warren0907.csv', 'csv/warren0911.csv', 'csv/warren0912.csv',
        'csv/warren0917.csv', 'csv/warren0921.csv', 'csv/warren0924.csv',
    )
)

In [13]:
# Warren tweet Series in chronological order
warrens = [
    warren0808, warren0815, warren0827,
    warren0907, warren0911, warren0912,
    warren0917, warren0921, warren0924,
]

In [14]:
# Tweet text for Sanders, one CSV (and one Series) per poll date
(sanders0808, sanders0815, sanders0827,
 sanders0907, sanders0911, sanders0912,
 sanders0917, sanders0921, sanders0924) = (
    pd.read_csv(csv_path)['text']
    for csv_path in (
        'csv/sanders0808.csv', 'csv/sanders0815.csv', 'csv/sanders0827.csv',
        'csv/sanders0907.csv', 'csv/sanders0911.csv', 'csv/sanders0912.csv',
        'csv/sanders0917.csv', 'csv/sanders0921.csv', 'csv/sanders0924.csv',
    )
)

In [15]:
# Sanders tweet Series in chronological order
sanderss = [
    sanders0808, sanders0815, sanders0827,
    sanders0907, sanders0911, sanders0912,
    sanders0917, sanders0921, sanders0924,
]

### Manipulate data

#### Biden, Warren, and Sanders

In [16]:
# One (3, 5) summary array per poll date: rows are negative/positive/neutral,
# columns are mean, median, min, max, stdev.
biden_stats = [namedate(b) for b in bidens]
# Flatten each date's summary into a single feature row. The row count is
# taken from the data and the width is inferred (-1), so adding or removing
# a poll date does not require editing a hard-coded shape.
biden_np = np.array(biden_stats).reshape(len(biden_stats), -1)

In [18]:
# One (3, 5) summary array per poll date: rows are negative/positive/neutral,
# columns are mean, median, min, max, stdev.
warren_stats = [namedate(w) for w in warrens]
# Flatten each date's summary into a single feature row. The row count is
# taken from the data and the width is inferred (-1), so adding or removing
# a poll date does not require editing a hard-coded shape.
warren_np = np.array(warren_stats).reshape(len(warren_stats), -1)

In [19]:
# One (3, 5) summary array per poll date: rows are negative/positive/neutral,
# columns are mean, median, min, max, stdev.
sanders_stats = [namedate(s) for s in sanderss]
# Flatten each date's summary into a single feature row. The row count is
# taken from the data and the width is inferred (-1), so adding or removing
# a poll date does not require editing a hard-coded shape.
sanders_np = np.array(sanders_stats).reshape(len(sanders_stats), -1)

In [72]:
# For Tableau visualization
# Each row of a candidate's *_np array is one poll date. Its 15 columns are
# [negative, positive, neutral] x (mean, median, min, max, stdev), so the mean
# negative score is column 0 and the mean positive score is column 5.
NEG_MEAN_COL = 0
POS_MEAN_COL = 5
POLL_DATE_LABELS = ['Aug8','Aug15',
                    'Aug27','Sept7',
                    'Sept11','Sept12',
                    'Sept17','Sept21','Sept24']

# Column slices pick up every poll date without a hard-coded row count
b_neg_means = list(biden_np[:, NEG_MEAN_COL])
b_pos_means = list(biden_np[:, POS_MEAN_COL])
w_neg_means = list(warren_np[:, NEG_MEAN_COL])
w_pos_means = list(warren_np[:, POS_MEAN_COL])
s_neg_means = list(sanders_np[:, NEG_MEAN_COL])
s_pos_means = list(sanders_np[:, POS_MEAN_COL])


def _means_frame(neg_means, pos_means):
    """Build a two-row (Negative/Positive) frame with one column per poll date."""
    return pd.DataFrame([neg_means, pos_means],
                        columns=POLL_DATE_LABELS,
                        index=['Negative','Positive'])


b_means_df = _means_frame(b_neg_means, b_pos_means)
w_means_df = _means_frame(w_neg_means, w_pos_means)
s_means_df = _means_frame(s_neg_means, s_pos_means)

In [71]:
# Writing pandas DataFrames to csv files for Tableau
for means_df, csv_name in ((b_means_df, 'b_means.csv'),
                           (w_means_df, 'w_means.csv'),
                           (s_means_df, 's_means.csv')):
    means_df.to_csv(csv_name)

### Rankings
* August 8 via SurveyUSA: 
    Biden, Sanders, Warren
    
* August 15 likely voters (LV) via Fox News: 
    Biden, Warren, Sanders
    
* August 27 LV via Emerson College: 
    Biden, Sanders, Warren
    
* September 7 LV via Suffolk University: 
    Biden, Sanders, Warren 
    
* September 11 via RKM Research and Communications Inc.: 
    Sanders, Biden, Warren
    
* September 12 LV via YouGov: 
    Biden, Warren, Sanders 
    
* September 17 LV via NBC News/Wall Street Journal: 
    Biden, Warren, Sanders
    
* September 21 LV via Selzer and Co: 
    Warren, Biden, Sanders
    
* September 24 LV via Monmouth University: 
    Warren, Biden, Sanders

In [20]:
# One array per candidate in chronological order
# 0 = 1st place; 1 = 2nd place; 2 = 3rd place

# Finishing order (1st, 2nd, 3rd) in each poll, oldest first
poll_rankings = [
    ('Biden', 'Sanders', 'Warren'),   # Aug 8
    ('Biden', 'Warren', 'Sanders'),   # Aug 15
    ('Biden', 'Sanders', 'Warren'),   # Aug 27
    ('Biden', 'Sanders', 'Warren'),   # Sept 7
    ('Sanders', 'Biden', 'Warren'),   # Sept 11
    ('Biden', 'Warren', 'Sanders'),   # Sept 12
    ('Biden', 'Warren', 'Sanders'),   # Sept 17
    ('Warren', 'Biden', 'Sanders'),   # Sept 21
    ('Warren', 'Biden', 'Sanders'),   # Sept 24
]

# A candidate's target is their position within each poll's ordering
b_target = np.array([order.index('Biden') for order in poll_rankings])
s_target = np.array([order.index('Sanders') for order in poll_rankings])
w_target = np.array([order.index('Warren') for order in poll_rankings])

In [21]:
# Hold out 33% of Biden's poll dates for evaluation; the seed fixes the split
BX_train, BX_test, by_train, by_test = train_test_split(
    biden_np,
    b_target,
    test_size=0.33,
    random_state=42,
)

In [22]:
# Hold out 33% of Warren's poll dates for evaluation; the seed fixes the split
WX_train, WX_test, wy_train, wy_test = train_test_split(
    warren_np,
    w_target,
    test_size=0.33,
    random_state=42,
)

In [23]:
# Hold out 33% of Sanders's poll dates for evaluation; the seed fixes the split
SX_train, SX_test, sy_train, sy_test = train_test_split(
    sanders_np,
    s_target,
    test_size=0.33,
    random_state=42,
)

### Classifiers

In [24]:
# Candidate models, shared by the cross-validation and evaluation cells below.
# LinearSVC and DecisionTreeClassifier use random numbers internally, so they
# are seeded, like the MLP and the train/test splits, to make their scores
# repeatable from run to run.
clf_nb = GaussianNB()
clf_linsvc = LinearSVC(random_state=42)
clf_dt = tree.DecisionTreeClassifier(random_state=42)
clf_knn =  KNeighborsClassifier(n_neighbors=3)
clf_nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

### Model Selection

In [25]:
# Mean 3-fold cross-validation score on Biden's data, one entry per classifier
b_crossval = [
    stat.mean(cross_val_score(clf, biden_np, b_target, cv=3))
    for clf in (clf_nb, clf_linsvc, clf_dt, clf_knn, clf_nn)
]

In [26]:
# Tabulate Biden's scores; the row order matches the classifier order in b_crossval
pd.DataFrame(
    b_crossval,
    index=['Naive Bayes','Linear SVC','Decision Tree','K-Nearest Neighbors','Neural Network'],
    columns=['Avg 3fold cross-val score'],
)

Unnamed: 0,Avg 3fold cross-val score
Naive Bayes,0.666667
Linear SVC,0.666667
Decision Tree,0.666667
K-Nearest Neighbors,0.666667
Neural Network,0.666667


In [27]:
# Mean 3-fold cross-validation score on Warren's data, one entry per classifier
w_crossval = [
    stat.mean(cross_val_score(clf, warren_np, w_target, cv=3))
    for clf in (clf_nb, clf_linsvc, clf_dt, clf_knn, clf_nn)
]



In [28]:
# Tabulate Warren's scores; the row order matches the classifier order in w_crossval
pd.DataFrame(
    w_crossval,
    index=['Naive Bayes','Linear SVC','Decision Tree','K-Nearest Neighbors','Neural Network'],
    columns=['Avg 3fold cross-val score'],
)

Unnamed: 0,Avg 3fold cross-val score
Naive Bayes,0.611111
Linear SVC,0.361111
Decision Tree,0.25
K-Nearest Neighbors,0.361111
Neural Network,0.444444


In [29]:
# Mean 3-fold cross-validation score on Sanders's data, one entry per classifier
s_crossval = [
    stat.mean(cross_val_score(clf, sanders_np, s_target, cv=3))
    for clf in (clf_nb, clf_linsvc, clf_dt, clf_knn, clf_nn)
]



In [30]:
# Tabulate Sanders's scores; the row order matches the classifier order in s_crossval
pd.DataFrame(
    s_crossval,
    index=['Naive Bayes','Linear SVC','Decision Tree','K-Nearest Neighbors','Neural Network'],
    columns=['Avg 3fold cross-val score'],
)

Unnamed: 0,Avg 3fold cross-val score
Naive Bayes,0.222222
Linear SVC,0.555556
Decision Tree,0.0
K-Nearest Neighbors,0.166667
Neural Network,0.388889


Overall, the best-performing classifiers are Naive Bayes, Linear SVC, and the Neural Network, while the worst-performing classifier overall is the Decision Tree model.

* Biden: all five models, including the Decision Tree, score identically (0.667). This is the score that always predicting 1st place would achieve, since Biden leads 6 of the 9 polls.
* Warren: Naive Bayes outperforms all other models, followed by the Neural Network. KNN and Linear SVC are tied for third-best model.
* Sanders: Linear SVC outperforms all other models, followed by the Neural Network. Naive Bayes did not perform well here.

### Model Evaluation

#### Biden

In [31]:
# Refit the shared Linear SVC on Biden's training dates, then predict the held-out dates
predict_lin_b = clf_linsvc.fit(BX_train, by_train).predict(BX_test)
# Actual vs. predicted ranks on the held-out dates
confusion_matrix(by_test, predict_lin_b)

array([[2, 0],
       [1, 0]])

In [32]:
# Linear SVC predictions for Biden's held-out dates (0 = 1st place)
predict_lin_b

array([0, 0, 0])

In [33]:
# Refit the shared Naive Bayes model on Biden's training dates, then predict the held-out dates
predict_nb_b = clf_nb.fit(BX_train, by_train).predict(BX_test)
# Actual vs. predicted ranks on the held-out dates
confusion_matrix(by_test, predict_nb_b)

array([[2, 0],
       [1, 0]])

In [34]:
# Naive Bayes predictions for Biden's held-out dates (0 = 1st place)
predict_nb_b

array([0, 0, 0])

#### Warren

In [35]:
# Refit the shared Linear SVC on Warren's training dates, then predict the held-out dates
predict_lin_w = clf_linsvc.fit(WX_train, wy_train).predict(WX_test)
# Actual vs. predicted ranks on the held-out dates
confusion_matrix(wy_test, predict_lin_w)

array([[0, 0, 1],
       [0, 0, 2],
       [0, 0, 0]])

In [36]:
# Linear SVC predictions for Warren's held-out dates (2 = 3rd place)
predict_lin_w

array([2, 2, 2])

In [37]:
# Refit the shared Naive Bayes model on Warren's training dates, then predict the held-out dates
predict_nb_w = clf_nb.fit(WX_train, wy_train).predict(WX_test)
# Actual vs. predicted ranks on the held-out dates
confusion_matrix(wy_test, predict_nb_w)

array([[0, 0, 1],
       [0, 0, 2],
       [0, 0, 0]])

In [38]:
# Naive Bayes predictions for Warren's held-out dates (2 = 3rd place)
predict_nb_w

array([2, 2, 2])

#### Sanders

In [39]:
# Refit the shared Linear SVC on Sanders's training dates, then predict the held-out dates
predict_lin_s = clf_linsvc.fit(SX_train, sy_train).predict(SX_test)
# Actual vs. predicted ranks on the held-out dates
confusion_matrix(sy_test, predict_lin_s)

array([[0, 0],
       [3, 0]])

In [40]:
# Linear SVC predictions for Sanders's held-out dates (1 = 2nd place)
predict_lin_s

array([1, 1, 1])

In [41]:
# Refit the shared Naive Bayes model on Sanders's training dates, then predict the held-out dates
predict_nb_s = clf_nb.fit(SX_train, sy_train).predict(SX_test)
# Actual vs. predicted ranks on the held-out dates
confusion_matrix(sy_test, predict_nb_s)

array([[0, 0],
       [3, 0]])

In [42]:
# Naive Bayes predictions for Sanders's held-out dates (1 = 2nd place)
predict_nb_s

array([1, 1, 1])

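None of the learning algorithms performs particularly well, which suggests that there is not a strong link between Twitter sentiment and opinion polls; the two populations appear to hold mostly different opinions. Note, however, that with only nine poll dates per candidate (six for training and three for testing), the evaluation is too small to be conclusive.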

The consensus of the predictions is Biden 1st, Sanders 2nd, and Warren 3rd. That said, these predictions are not particularly reliable.