### Load packages

In [281]:
import pandas as pd
import numpy as np

import nltk
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.model_selection import KFold
from sklearn import metrics

import re

# English stop-word list; preprocessing() drops any token found in it.
# NOTE(review): requires the NLTK "stopwords" corpus to be available locally
# (e.g. via nltk.download("stopwords")) — confirm on a fresh environment.
stopwords = nltk.corpus.stopwords.words("english") 
# Shared English Snowball stemmer; preprocessing() applies it token by token.
stemmer = SnowballStemmer("english")

### function to read data

In [282]:
def ReadData(address='C:/Users/dfontaine/OneDrive - Gogo LLC/TwitterProject/all_tweets.csv'):
    """Load the tweet corpus and keep original tweets from the two major parties.

    Parameters
    ----------
    address : str, optional
        Path of the back-tick separated, UTF-8 encoded CSV file. Defaults to
        the location the notebook was originally run against, so existing
        ``ReadData()`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``full_text`` and ``Party``, with a fresh 0..n-1 index.
        Retweets and rows whose ``Party`` is neither ``'Republican'`` nor
        ``'Democratic'`` are dropped.
    """
    df = pd.read_csv(address, sep='`', encoding="utf-8")

    # remove retweets
    # pandas parses a clean True/False column as bool, in which case comparing
    # against the string 'False' would silently match nothing. Normalising to
    # lower-case text handles both the bool and the string representation.
    retweet_flag = df['retweeted'].astype(str).str.strip().str.lower()
    df = df[retweet_flag == 'false']

    # remove independents
    df = df[df['Party'].isin(['Republican', 'Democratic'])]

    df = df[['full_text', 'Party']].reset_index(drop=True)
    return(df)

### function to pre-process text (remove stop-words and stemming)

In [284]:
def preprocessing(doc, lower = True, stop_word = True, punctuation = True, word = True, stem = True, removeDigs = True, removeURLs = True): 
    '''
    Turn one raw document into a list of cleaned tokens.

    doc: one single document (string)
    lower: convert the text to lowercase before anything else
    stop_word: drop tokens found in the module-level `stopwords` list
               (the comparison is case-sensitive)
    punctuation: when word-tokenizing, keep only runs of word characters
                 (i.e. strip punctuation); if False, use nltk.word_tokenize,
                 which keeps punctuation as separate tokens
    word: True -> word tokens, False -> sentence tokens (nltk.sent_tokenize)
    stem: apply the module-level Snowball `stemmer` to every token
    removeDigs: delete every digit character from the text
    removeURLs: delete http/https/ftp URLs from the text

    steps: lowercase -> strip URLs -> strip "&amp;" -> strip digits
           -> tokenize -> remove stop words -> stem

    returns: list of strings (stemmed when stem=True, plain tokens otherwise)
    '''
    if lower:
        doc = doc.lower()
    if removeURLs:
        # Raw string: same pattern as before, without invalid-escape warnings.
        doc = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', doc)
    # remove HTML-escaped ampersands
    doc = re.sub(r'\&amp;', '', doc)
    if removeDigs:
        doc = ''.join(ch for ch in doc if not ch.isdigit())

    if word:
        if punctuation:
            from nltk.tokenize import RegexpTokenizer
            tokens = RegexpTokenizer(r'\w+').tokenize(doc)
        else:
            # Previously this path left `tokens` undefined and raised NameError.
            tokens = nltk.word_tokenize(doc)
    else:
        tokens = nltk.sent_tokenize(doc)

    if stop_word:
        tokens = [t for t in tokens if t not in stopwords]
    if stem:
        tokens = [stemmer.stem(t) for t in tokens]
    # Previously returned `stems`, which did not exist when stem=False.
    return tokens

### function to get word count matrix

In [285]:
def getCountMatrixfromdocs(df, maxDF=0.8, minDF=0.01, maxFeat=10000, maxGram=1):
    """Build a document-term count matrix from the tweets in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``full_text`` column (documents) and a ``Party``
        column (labels).
    maxDF, minDF : float or int
        Forwarded to ``CountVectorizer`` as ``max_df`` / ``min_df``.
    maxFeat : int
        Forwarded as ``max_features``.
    maxGram : int
        Upper bound of the n-gram range; the lower bound is always 1.

    Returns
    -------
    count_matrix : pandas.DataFrame
        One row per document, one column per term, columns named by term.
    labels : numpy.ndarray
        The ``Party`` values, in row order.
    count_vectorizer : CountVectorizer
        The fitted vectorizer.
    Terms : list of str
        Vocabulary terms in column order.
    """
    labels = np.array(df['Party'].values.tolist())
    docs = df['full_text'].values.tolist()     # tweets column as a plain list of strings

    count_vectorizer = CountVectorizer(
        max_df=maxDF,
        max_features=maxFeat,
        min_df=minDF,
        stop_words=None,            # stop words are handled by `preprocessing`
        tokenizer=preprocessing,
        ngram_range=(1, maxGram),
    )
    counts = count_vectorizer.fit_transform(docs)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old installations.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        Terms = count_vectorizer.get_feature_names_out().tolist()
    else:
        Terms = list(count_vectorizer.get_feature_names())

    # toarray() gives a plain ndarray rather than the legacy np.matrix.
    count_matrix = pd.DataFrame(counts.toarray(), columns=Terms)

    return(count_matrix, labels, count_vectorizer, Terms)


#####################################################
#CREATING COUNT MATRIX FROM A NEW TWEET
#####################################################
#a,b,c,d = getCountMatrixfromdocs(df)

#abc = c
#c.vocabulary = d

#count_matrix = c.fit_transform(['Dustin has access to the aca in america as of yesterday'])
#count_matrix = pd.DataFrame(count_matrix.todense())
#Terms = [pair[1] for pair in enumerate(c.get_feature_names())]
#count_matrix.columns = Terms
#print(count_matrix)

### function to fit and score Naive Bayes classifier

In [286]:
def fitNBevaluate(X, y, cv_folds, shuffle=False, random_state=None):
    """Cross-validate a multinomial Naive Bayes classifier.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix, one row per sample. A DataFrame is converted to its
        underlying array; anything else is used as given.
    y : array-like
        Labels aligned with the rows of ``X``.
    cv_folds : int
        Number of K-fold splits.
    shuffle : bool, optional
        Shuffle the rows before splitting. The default (False) keeps the
        original contiguous folds. If the rows are ordered by class, some
        contiguous folds can end up without one of the classes, so consider
        ``shuffle=True`` in that case.
    random_state : int or None, optional
        Seed for the shuffle; ignored when ``shuffle`` is False.

    Returns
    -------
    (acc, fOne, confMat) : tuple of lists
        Per-fold accuracy, weighted F1 score and confusion matrix. The
        confusion matrices use the label order ['Republican', 'Democratic'].
    """
    # Positional indexing below needs plain arrays, not label-indexed pandas objects.
    X = X.values if isinstance(X, pd.DataFrame) else X
    y = np.asarray(y)

    # KFold rejects a random_state when shuffle is off, so only pass it along with shuffle.
    kf = KFold(n_splits=cv_folds, shuffle=shuffle,
               random_state=random_state if shuffle else None)

    acc, fOne, confMat = [], [], []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = MultinomialNB().fit(X_train, y_train)
        predicted = clf.predict(X_test)

        acc.append(metrics.accuracy_score(y_test, predicted))
        fOne.append(metrics.f1_score(y_test, predicted, average='weighted'))
        confMat.append(metrics.confusion_matrix(y_test, predicted, labels=['Republican','Democratic']))
    return(acc, fOne, confMat)

### function to fit and score Logistic classifier

In [287]:
def fitLevaluate(X, y, cv_folds, CW=None, shuffle=False, random_state=None):
    """Cross-validate a logistic regression classifier.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix, one row per sample. A DataFrame is converted to its
        underlying array; anything else is used as given.
    y : array-like
        Labels aligned with the rows of ``X``.
    cv_folds : int
        Number of K-fold splits.
    CW : None, 'balanced' or dict, optional
        Forwarded to ``LogisticRegression`` as ``class_weight``.
    shuffle : bool, optional
        Shuffle the rows before splitting. The default (False) keeps the
        original contiguous folds. If the rows are ordered by class, some
        contiguous folds can end up without one of the classes, so consider
        ``shuffle=True`` in that case.
    random_state : int or None, optional
        Seed for the shuffle; ignored when ``shuffle`` is False.

    Returns
    -------
    (acc, fOne, confMat) : tuple of lists
        Per-fold accuracy, weighted F1 score and confusion matrix. The
        confusion matrices use the label order ['Republican', 'Democratic'].
    """
    # Positional indexing below needs plain arrays, not label-indexed pandas objects.
    X = X.values if isinstance(X, pd.DataFrame) else X
    y = np.asarray(y)

    # KFold rejects a random_state when shuffle is off, so only pass it along with shuffle.
    kf = KFold(n_splits=cv_folds, shuffle=shuffle,
               random_state=random_state if shuffle else None)

    acc, fOne, confMat = [], [], []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = LogisticRegression(class_weight=CW).fit(X_train, y_train)
        predicted = clf.predict(X_test)

        acc.append(metrics.accuracy_score(y_test, predicted))
        fOne.append(metrics.f1_score(y_test, predicted, average='weighted'))
        confMat.append(metrics.confusion_matrix(y_test, predicted, labels=['Republican','Democratic']))
    return(acc, fOne, confMat)

### function to fit and score KNN classifier

In [288]:
def fitKNNevaluate(X, y, cv_folds, K=1, shuffle=False, random_state=None):
    """Cross-validate a k-nearest-neighbours classifier.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix, one row per sample. A DataFrame is converted to its
        underlying array; anything else is used as given.
    y : array-like
        Labels aligned with the rows of ``X``.
    cv_folds : int
        Number of K-fold splits.
    K : int, optional
        Forwarded to ``KNeighborsClassifier`` as ``n_neighbors``.
    shuffle : bool, optional
        Shuffle the rows before splitting. The default (False) keeps the
        original contiguous folds. If the rows are ordered by class, some
        contiguous folds can end up without one of the classes, so consider
        ``shuffle=True`` in that case.
    random_state : int or None, optional
        Seed for the shuffle; ignored when ``shuffle`` is False.

    Returns
    -------
    (acc, fOne, confMat) : tuple of lists
        Per-fold accuracy, weighted F1 score and confusion matrix. The
        confusion matrices use the label order ['Republican', 'Democratic'].
    """
    # Positional indexing below needs plain arrays, not label-indexed pandas objects.
    X = X.values if isinstance(X, pd.DataFrame) else X
    y = np.asarray(y)

    # KFold rejects a random_state when shuffle is off, so only pass it along with shuffle.
    kf = KFold(n_splits=cv_folds, shuffle=shuffle,
               random_state=random_state if shuffle else None)

    acc, fOne, confMat = [], [], []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = KNeighborsClassifier(n_neighbors=K).fit(X_train, y_train)
        predicted = clf.predict(X_test)

        acc.append(metrics.accuracy_score(y_test, predicted))
        fOne.append(metrics.f1_score(y_test, predicted, average='weighted'))
        confMat.append(metrics.confusion_matrix(y_test, predicted, labels=['Republican','Democratic']))
    return(acc, fOne, confMat)

### function to fit and score TREE classifier

In [289]:
def fitTREEevaluate(X, y, cv_folds, shuffle=False, random_state=None):
    """Cross-validate an unpruned decision tree classifier.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix, one row per sample. A DataFrame is converted to its
        underlying array; anything else is used as given.
    y : array-like
        Labels aligned with the rows of ``X``.
    cv_folds : int
        Number of K-fold splits.
    shuffle : bool, optional
        Shuffle the rows before splitting. The default (False) keeps the
        original contiguous folds. If the rows are ordered by class, some
        contiguous folds can end up without one of the classes, so consider
        ``shuffle=True`` in that case.
    random_state : int or None, optional
        Seed passed to the decision tree, and to the shuffle when ``shuffle``
        is True. None (the default) leaves the tree unseeded as before.

    Returns
    -------
    (acc, fOne, confMat) : tuple of lists
        Per-fold accuracy, weighted F1 score and confusion matrix. The
        confusion matrices use the label order ['Republican', 'Democratic'].
    """
    # Positional indexing below needs plain arrays, not label-indexed pandas objects.
    X = X.values if isinstance(X, pd.DataFrame) else X
    y = np.asarray(y)

    # KFold rejects a random_state when shuffle is off, so only pass it along with shuffle.
    kf = KFold(n_splits=cv_folds, shuffle=shuffle,
               random_state=random_state if shuffle else None)

    acc, fOne, confMat = [], [], []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # No depth limit (an earlier revision used max_depth=20).
        clf = tree.DecisionTreeClassifier(random_state=random_state).fit(X_train, y_train)
        predicted = clf.predict(X_test)

        # To inspect a fitted tree, export it with
        #   tree.export_graphviz(clf, out_file=..., feature_names=<term columns>,
        #                        class_names=list(clf.classes_), filled=True,
        #                        rounded=True, special_characters=True)
        # and render it from a shell: dot -Tpdf <file>.dot -o outfile.pdf

        acc.append(metrics.accuracy_score(y_test, predicted))
        fOne.append(metrics.f1_score(y_test, predicted, average='weighted'))
        confMat.append(metrics.confusion_matrix(y_test, predicted, labels=['Republican','Democratic']))
    return(acc, fOne, confMat)


## Main

In [None]:
#read data
print('reading data...')
df = ReadData()
print('done')

# Experiment grid. The five model stanzas that used to be copy-pasted are now
# one table, so adding or removing a classifier is a one-line change.
CV_FOLDS = 10
MAX_GRAMS = range(1, 3)                        # upper n-gram bound: 1, 2
MIN_DOC_FREQS = np.arange(0.001, 0.026, 0.005) # min document frequency for a term
KNN_NEIGHBOURS = range(5, 6)                   # only k=5 for now

# (label, evaluator) pairs in the order they are run and recorded.
evaluators = [
    ('NB',      lambda X, y: fitNBevaluate(X, y, cv_folds=CV_FOLDS)),
    ('LOG-un',  lambda X, y: fitLevaluate(X, y, cv_folds=CV_FOLDS)),
    ('LOG-bal', lambda X, y: fitLevaluate(X, y, cv_folds=CV_FOLDS, CW='balanced')),
    ('TREE',    lambda X, y: fitTREEevaluate(X, y, cv_folds=CV_FOLDS)),
]
# k=k binds the current value; a bare closure would see only the last k.
evaluators += [
    ('KNN' + str(k), lambda X, y, k=k: fitKNNevaluate(X, y, cv_folds=CV_FOLDS, K=k))
    for k in KNN_NEIGHBOURS
]

#test different models
modList = []      # human-readable description of each fitted configuration
acc_all = []      # per-configuration list of fold accuracies
fOne_all = []     # per-configuration list of fold weighted-F1 scores
conMat_all = []   # per-configuration list of fold confusion matrices
for i in MAX_GRAMS: #maxGram
    for j in MIN_DOC_FREQS: #Min Doc Freq for words
        # The count matrix depends only on (i, j), so build it once per grid point.
        X, y, CountMatrix_vectorizer, terms = getCountMatrixfromdocs(df, maxDF=0.8, minDF=j, maxFeat=10000, maxGram=i)

        for label, evaluate in evaluators:
            curMODstr = '\n nGram= ' + str(i) + '\n minDF= ' + str(j) + '\n ' + label
            print('fitting model '+curMODstr)
            modList.append(curMODstr)

            acc, fOne, conMat = evaluate(X, y)
            acc_all.append(acc)
            fOne_all.append(fOne)
            conMat_all.append(conMat)

print('DONE')


reading data...
done
fitting model 
 nGram= 1
 minDF= 0.001
 NB


  'recall', 'true', average, warn_for)


fitting model 
 nGram= 1
 minDF= 0.001
 LOG-un
fitting model 
 nGram= 1
 minDF= 0.001
 LOG-bal
fitting model 
 nGram= 1
 minDF= 0.001
 TREE
fitting model 
 nGram= 1
 minDF= 0.001
 KNN5
fitting model 
 nGram= 1
 minDF= 0.006
 NB
fitting model 
 nGram= 1
 minDF= 0.006
 LOG-un
fitting model 
 nGram= 1
 minDF= 0.006
 LOG-bal
fitting model 
 nGram= 1
 minDF= 0.006
 TREE
fitting model 
 nGram= 1
 minDF= 0.006
 KNN5
fitting model 
 nGram= 1
 minDF= 0.011
 NB
fitting model 
 nGram= 1
 minDF= 0.011
 LOG-un
fitting model 
 nGram= 1
 minDF= 0.011
 LOG-bal
fitting model 
 nGram= 1
 minDF= 0.011
 TREE
fitting model 
 nGram= 1
 minDF= 0.011
 KNN5
fitting model 
 nGram= 1
 minDF= 0.016
 NB
fitting model 
 nGram= 1
 minDF= 0.016
 LOG-un
fitting model 
 nGram= 1
 minDF= 0.016
 LOG-bal
fitting model 
 nGram= 1
 minDF= 0.016
 TREE
fitting model 
 nGram= 1
 minDF= 0.016
 KNN5
fitting model 
 nGram= 1
 minDF= 0.021
 NB
fitting model 
 nGram= 1
 minDF= 0.021
 LOG-un
fitting model 
 nGram= 1
 minDF= 0.021
 L

In [278]:
# Single-configuration check: unweighted logistic regression on unigram counts
# with a 0.001 minimum document frequency, scored with 10-fold CV.
curMODstr = 'L'
X, y, count_vectorizer, terms = getCountMatrixfromdocs(
    df, maxDF=0.8, minDF=.001, maxFeat=10000, maxGram=1
)

print('fitting model '+curMODstr)
acc2, fOne2, conMat2 = fitLevaluate(X, y, cv_folds=10)

# Restart the result collectors so they hold only this one run.
modList = [curMODstr]
acc_all = [acc2]
fOne_all = [fOne2]
conMat_all = [conMat2]

# Mean accuracy across the folds.
print(np.mean(acc2))

#makePlot(data=acc_all, labelz=modList)


fitting model L


  'recall', 'true', average, warn_for)


0.711012575877


In [280]:
# Confusion matrix of the fourth CV fold (index 3) from the logistic run above.
# fitLevaluate builds it with labels=['Republican','Democratic'], so both rows
# (true labels) and columns (predicted labels) follow that order.
conMat2[3]

array([[ 492,  270],
       [ 548, 2451]])

In [294]:
# Scratch check: print 1 through 5, skipping 3.
for i in range(1, 6):
    if i != 3:
        print(i)

1
2
4
5
