In [1]:
## This notebook is for reproducing the results from my Galvanize DSI capstone.
## Let GeoStream.py run in a terminal to populate the stream_tweets directory until you have around 8gb
## of tweets (this is assuming you haven't filtered out any metadata)


In [2]:
import DistrictDict as dd
import tweets_to_df as t2df
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
import re
import preprocess_twitter as pre
from collections import deque

In [6]:
#Loads tweets from stream_tweets file into df format, then saves it to a pikl file.
t2df.tweets2df()

In [7]:
tweets_df = t2df.get_tweets()

In [8]:
#1 = Republican, 0 = Democrats
y = tweets_df.party
#Treating all tweets from each district as one "document" in a binary classification task. 
x = [" ".join(i) for i in tweets_df['text']]
#Stanford pre-processing script
x = [pre.tokenize(i) for i in x]
x_train, x_test, y_train, y_test = train_test_split(x,y)

In [15]:
# Can we predict the party of representatives from a district solely based off of a Bag of Words or Tf-idf
# and Naive Bayes approach?
print(len(x_train))
print(len(x_test))
#Feature Extraction
bag_of_words = CountVectorizer(stop_words="english",max_df=.95,min_df=20).fit_transform(x_train)
tfidf = TfidfVectorizer(stop_words='english',max_df=.95,min_df=20).fit_transform(x_train)

216
73


In [16]:
## Performing 3-fold cross validation for all methods
## if you have too few tweets these will throw up NaN/infinity input errors. 
scores = cross_val_score(MultinomialNB(),bag_of_words,y_train)
print(np.mean(scores))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [17]:
scores = cross_val_score(MultinomialNB(),tfidf,y_train)
print(np.mean(scores))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
scores = cross_val_score(SGDClassifier(loss='hinge', penalty='l2',
                                       alpha=1e-3, random_state=42,
                                       max_iter=1000, tol=None),bag_of_words,y_train)
print(np.mean(scores))

In [None]:
scores = cross_val_score(SGDClassifier(loss='hinge', penalty='l2',
                                       alpha=1e-3, random_state=42,
                                       max_iter=1000, tol=None),tfidf,y_train)
print(np.mean(scores))

In [None]:
#Getting percentage of Republican districts to make sure 
#we have balanced enough classes that accuracy is a good measure
sum(y_train==1)/len(y_train)

In [None]:
#Try some dimensionality reduction of tfidf matrix. 
lsa_tfidf = TruncatedSVD(n_components=50).fit_transform(tfidf)
scores = cross_val_score(SGDClassifier(loss='hinge', penalty='l2',
                                       alpha=1e-3,random_state=42,
                                       max_iter=20, tol=None),lsa_tfidf,y_train)
print(np.mean(scores))

In [None]:
#
scores = cross_val_score(CatBoostClassifier(),lsa_tfidf,y_train)