In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import matplotlib as mlp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

Reading original file

In [2]:
df2 = pd.read_excel('training-Obama-Romney-tweets.xlsx', sheetname = 'Romney')
inputFrame = df2
inputFrame.dropna(inplace = True)
print('Total number of tweets:'+str(len(inputFrame)))
print('top few tweets:')
inputFrame.head()

Total number of tweets:6571
top few tweets:


Unnamed: 0,date,time,Anootated tweet,Class
0,10/14/2012,PM 11:11:49,Id rather <a>vote</a> for <e>Romney</e> than ...,1
1,10/15/2012,PM 3:19:15,Gallup shows<e> Romney</e><a> pulling ahead <...,1
2,10/14/2012,AM 7:29:37,"By record-high margin, <a>debate</a> watchers...",1
3,2012-10-16 00:00:00,0.420463,<e>Romney</e> will make 'great' <a>president</...,1
4,10/15/2012,PM 2:34:27,"@GOTV2012 ""Mitt <e>Romney</e> might just <a>s...",1


In [3]:
inputFrame_test = inputFrame.copy()
inputFrame_test['Class'] = inputFrame_test['Class'].astype(str) 
inputFrame1 = inputFrame_test[inputFrame_test.Class == '1']
inputFrame2 = inputFrame_test[inputFrame_test.Class == '-1']
inputFrame3 = inputFrame_test[inputFrame_test.Class == '0']
inputFrame = pd.concat([inputFrame1, inputFrame2, inputFrame3])
tweetProcessFrame = inputFrame
print(len(inputFrame1))
print(len(inputFrame2))
print(len(inputFrame3))

2000
2892
1679


Preprocessing (cleaning the tweets)

In [4]:
# Cleaning tweets
pwords,words=None, None

with open("positive-words.txt") as f:
    pwords = [el.strip() for el in f.readlines()]
with open("negative-words.txt") as f:
    nwords = [el.strip() for el in f.readlines()]
emoticons_dict = {}
def populateEmoticonsDict():
    fileHandler = open('EmoticonsWithPolarity.txt', 'r')
    for line in fileHandler:
        emoticon_list = line[:-1].split(' ')
        sentiment = emoticon_list[-1]
        emoticon_list = emoticon_list[:-1]
        for emoticon in emoticon_list:
            emoticons_dict[emoticon] = sentiment
populateEmoticonsDict()

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

def replace(tweet,words,replacement):
    li = []
    for word in tweet:
        if word in words:
            li.append(replacement)
        else:
            li.append(word)
    return li

def processTweet(tweet):
    emotionFreeTweet = mapEmoticons(tweet)
    tagFreeTweet = removeTags(emotionFreeTweet)
    lowerCaseTweet = tagFreeTweet.lower()
    ptweet = replace(lowerCaseTweet.split(' '),pwords,'positive')
    ntweet = replace(ptweet,nwords,'negative')
    t = ' '.join(ntweet)
    puncFreeTweet = removePunctuations(t)
    stopFreeTweet = removeStopWords(puncFreeTweet, stop_words)
    stemmedTweet = stemWords(stopFreeTweet)
    return stemmedTweet

def stemWords(tweet):
    return [stemmer.stem(t) for t in tweet]

def mapEmoticons(tweet):
    list_words = tweet.split(' ')
    newtweet = ""
    for word in list_words:
        if word in emoticons_dict:
            newtweet = newtweet + ' ' + emoticons_dict.get(word) 
        else:
            newtweet = newtweet + ' ' + word
    return newtweet
    

def removeTags(tweet):
    cleanr = re.compile('(</?[a-zA-Z]+>|https?:\/\/[^\s]*|(^|\s)RT(\s|$)|@[^\s]+|\d+)')
    cleantext = re.sub(cleanr, ' weblink ', tweet)
    cleantext = re.sub('(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)',' usermention ',cleantext)
    cleantext = re.sub('[^\sa-zA-Z]+','',cleantext)
    cleantext = re.sub('\s+',' ',cleantext)
    return cleantext

def removePunctuations(tweet):
    exclude = set(string.punctuation) 
    t = ''
    for ch in tweet:
        if ch not in exclude:
            t+=ch
        else:
            t+=' '
    return tweet

def removeStopWords(tweet, stop_words):
    words_tokenize = word_tokenize(tweet)
    filtered_sentence = words_tokenize #[w for w in words_tokenize if w not in stop_words]
    return filtered_sentence #' '.join(filtered_sentence)

In [5]:
tweetProcessFrame.rename(columns={'date':'date','time':'time','Anootated tweet' : 'tweet','Class':'Class'},inplace = True)
tweetProcessFrame['tweet'] = tweetProcessFrame['tweet'].apply(processTweet)
del tweetProcessFrame['date']
del tweetProcessFrame['time']

def joinList1(tweetList):
    return " ".join(tweetList)

tweetProcessFrame['tweet'] = tweetProcessFrame['tweet'].apply(joinList1)
tweetProcessFrame1 = tweetProcessFrame
print('tweets after preprocessing')
tweetProcessFrame.head()

tweets after preprocessing


Unnamed: 0,tweet,Class
0,negat id rather weblink vote weblink for webli...,1
1,negat gallup show weblink romney weblink webli...,1
2,negat by recordhigh margin weblink debat webli...,1
3,weblink romney weblink will make posit weblink...,1
4,negat weblink mitt weblink romney weblink migh...,1


spliting training and testing for 80-20 testing

In [6]:
#train, test = train_test_split(tweetProcessFrame, test_size = 0.2)
def splitTrainingData(df, train_data_prcnt=80):
    msk = np.random.rand(len(df)) < train_data_prcnt/100
    train = df[msk]
    test = df[~msk]
    return train, test
tweet_random_df = tweetProcessFrame1.copy()
for i in range(0, 50):
    split1_df, split2_df = splitTrainingData(tweet_random_df)
    tweet_random_df = pd.concat([split1_df, split2_df])
train, test = splitTrainingData(tweet_random_df)

In [7]:
train_data = train['tweet']
train_label = train['Class']
train_label = pd.to_numeric(train_label)

test_data = test['tweet']
test_class = test['Class']
test_class = pd.to_numeric(test_class)

In [8]:
count_vect = CountVectorizer(max_features = 1500)
X_train_counts = count_vect.fit_transform(train_data)
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

Training Multinomial Naive Bayes

In [9]:
#Multinimoial NB
# mnbd = {   
#             'alpha':[(float)(el/1000) for el in range(0,100,1)], 
#             'fit_prior':(True,False)
#         }

# clf = GridSearchCV(MultinomialNB(),mnbd).fit(X_train_tf, train_label)

# {'alpha': 0.014, 'fit_prior': True}
clf = MultinomialNB(alpha= 0.014, fit_prior= True).fit(X_train_tf, train_label)

X_test_counts = count_vect.transform(test_data)
X_test_tfidf = tf_transformer.transform(X_test_counts)
predicted = clf.predict(X_test_tfidf)

print('Test Accuracy:'+str(np.mean(predicted == test_class)))

print(metrics.classification_report(test_class, predicted))

print('Confusion Matrix:')
nb_confusion_matrix = metrics.confusion_matrix(test_class, predicted)
print(nb_confusion_matrix)

predicted_train = clf.predict(X_train_tf)
print('Train Accuracy:'+str(np.mean(predicted_train == train_label)))

Test Accuracy:0.602201257862
             precision    recall  f1-score   support

         -1       0.60      0.80      0.69       594
          0       0.48      0.22      0.30       324
          1       0.66      0.62      0.64       354

avg / total       0.59      0.60      0.58      1272

Confusion Matrix:
[[475  62  57]
 [196  71  57]
 [118  16 220]]
Train Accuracy:0.722589167768


Support Vector Machine (Linear SVC)

In [10]:
# from sklearn.svm import LinearSVC
# text_clf_svm = Pipeline([('vect', CountVectorizer(max_features=500)),
#                     ('tfidf', TfidfTransformer()),
#                     ('clf', LinearSVC(C = 0.5, dual=False, random_state= 42, max_iter=10000))])
# text_clf_svm = text_clf_svm.fit(train_data, train_label)

# parameterssvm = {'vect__ngram_range': [(1, 1), (1, 2)],
#               'vect__max_features': list(range(500,5500,100)),
#               'tfidf__use_idf': (True, False),
#               'clf__loss' : ['hinge', 'squared_hinge'],
#               'clf__penalty' : ['l2'],
#               'clf__C' : [i/10 for i in range(5,9)],
#               'clf__multi_class' : ['crammer_singer','ovr']
#              }
text_clf_svm = Pipeline([('vect', CountVectorizer(max_features = 1500, ngram_range= (1, 2))),
                    ('tfidf', TfidfTransformer(use_idf= True)),
                    ('clf', LinearSVC(C = 0.5, loss = 'hinge', penalty = 'l2', random_state= 42, max_iter=10000))]).fit(train_data, train_label)
# text_clf_svm = text_clf_svm.fit(train_data,train_label)
# text_clf_svm = GridSearchCV(text_clf_svm,parameterssvm,n_jobs=-1).fit(train_data, train_label)
# text_clf_svm.best_params_

predicted_svm = text_clf_svm.predict(test_data)
print('Test Accuracy:'+str(np.mean(predicted_svm == test_class)))
print(metrics.classification_report(test_class, predicted_svm))

print('Confusion Matrix:')
linearSVM_confusion_matrix = metrics.confusion_matrix(test_class, predicted_svm)
print(linearSVM_confusion_matrix)

predicted_svm_train = text_clf_svm.predict(train_data)
print('Train Accuracy:'+str(np.mean(predicted_svm_train == train_label)))

Test Accuracy:0.613993710692
             precision    recall  f1-score   support

         -1       0.64      0.77      0.70       594
          0       0.50      0.17      0.25       324
          1       0.60      0.76      0.67       354

avg / total       0.59      0.61      0.58      1272

Confusion Matrix:
[[459  43  92]
 [180  54  90]
 [ 74  12 268]]
Train Accuracy:0.728628043027


Voting classifier

In [11]:
clf1 = MultinomialNB(alpha= 0.094, fit_prior= True)
clf2 = SGDClassifier(alpha=0.001,learning_rate='optimal',loss= 'epsilon_insensitive', penalty= 'l2',n_iter = 100, random_state=42)
clf3 = LinearSVC(C = 0.5, loss = 'hinge', random_state= 42)
clf4 = RandomForestClassifier(n_estimators = 22, class_weight = 'balanced_subsample', random_state = 42,criterion="gini")

eclf = Pipeline([('vect', CountVectorizer(max_features=1500,ngram_range=(1,2))),
                    ('tfidf', TfidfTransformer(use_idf= True)),
                    ('clf', VotingClassifier(estimators=[('mnb', clf1), ('sgd', clf2), ('svm', clf3), ('rf',clf4)], voting='hard'))])

eclf = eclf.fit(train_data,train_label)

p = eclf.predict(test_data)
print('Test Accuracy:'+str(np.mean(p==test_class)))
print(metrics.classification_report(test_class, p))

print('Confusion Matrix:')
voting_confusion_matrix = metrics.confusion_matrix(test_class,p)
print(voting_confusion_matrix)

predicted_eclf_train = eclf.predict(train_data)
print('Train Accuracy:'+str(np.mean(predicted_eclf_train == train_label)))

Test Accuracy:0.629716981132
             precision    recall  f1-score   support

         -1       0.63      0.85      0.72       594
          0       0.51      0.13      0.20       324
          1       0.66      0.71      0.69       354

avg / total       0.61      0.63      0.58      1272

Confusion Matrix:
[[507  29  58]
 [212  41  71]
 [ 91  10 253]]
Train Accuracy:0.742215512361


Preparing Data for Cross Validation

In [12]:
combineFrameData = [train_data, test_data]
combineFrameLabel = [train_label, test_class]
combineTrainDataDF = pd.concat(combineFrameData)
combineTrainLabelDF = pd.concat(combineFrameLabel)
count_vect_kfold = CountVectorizer(max_features = 1500)
X_train_counts_kfold = count_vect.fit_transform(combineTrainDataDF)
tf_transformer_kfold = TfidfTransformer(use_idf=True).fit(X_train_counts_kfold)
X_train_tf_kfold = tf_transformer_kfold.transform(X_train_counts_kfold)

Cross Validation accuracy Naive Bayes

In [13]:
clf_nb_kfold = MultinomialNB(alpha = 0.014, fit_prior = True)
scores = cross_val_score(clf_nb_kfold, X_train_tf_kfold, combineTrainLabelDF, cv=10)
clf_nb_kfold = clf_nb_kfold.fit(X_train_tf_kfold, combineTrainLabelDF)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print(scores)

Accuracy: 0.62 (+/- 0.02)
[ 0.6231003   0.6231003   0.60578387  0.61339422  0.62557078  0.61796043
  0.63318113  0.61187215  0.6194825   0.58841463]


 Cross Validation Accuracy Linear SVM

In [14]:
clf_lsvm_kfold = LinearSVC(C = 0.5, loss = 'hinge', penalty='l2', random_state= 42, max_iter=10000)
scores_lsvm = cross_val_score(clf_lsvm_kfold, X_train_tf_kfold, combineTrainLabelDF, cv=10)
clf_lsvm_kfold = clf_lsvm_kfold.fit(X_train_tf_kfold, combineTrainLabelDF)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_lsvm.mean(), scores_lsvm.std() * 2))
print(scores_lsvm)

Accuracy: 0.63 (+/- 0.04)
[ 0.62006079  0.63069909  0.63165906  0.65144597  0.63622527  0.65144597
  0.65144597  0.63774734  0.59208524  0.61890244]


In [15]:
df_test = pd.read_excel('testing-Obama-Romney-tweets.xlsx', sheetname = 'Romney', header=None)
df_test.columns = ['tweet','Class']
df_test['Class'] = df_test['Class'].astype(str) 
inputFrame1 = df_test[df_test.Class == '1']
inputFrame2 = df_test[df_test.Class == '-1']
inputFrame3 = df_test[df_test.Class == '0']
df_test = pd.concat([inputFrame1, inputFrame2, inputFrame3])
df_test['tweet'] = df_test['tweet'].apply(processTweet)
df_test['tweet'] = df_test['tweet'].apply(joinList1)

testcounts = count_vect.transform(df_test['tweet'])
test_tfidf = tf_transformer_kfold.transform(testcounts)

df_test.head()

Unnamed: 0,tweet,Class
1,weblink romney weblink got weblink less minut ...,1
18,weblink mitt negat weblink is beat him up nega...,1
31,i actual posit negat weblink romney weblink s ...,1
32,just for that weblink immigr statement weblink...,1
60,this man negat weblink romney negat weblink is...,1


In [16]:
p = clf_nb_kfold.predict(test_tfidf)
xy = pd.to_numeric(df_test['Class'])
print('Test Accuracy:'+str(np.mean(p == xy)))

print(metrics.classification_report(xy, p))

print('Confusion Matrix:')
nb_confusion_matrix = metrics.confusion_matrix(xy, p)
print(nb_confusion_matrix)

Test Accuracy:0.575789473684
             precision    recall  f1-score   support

         -1       0.60      0.83      0.70       960
          0       0.53      0.23      0.32       555
          1       0.50      0.43      0.46       385

avg / total       0.56      0.58      0.54      1900

Confusion Matrix:
[[801  79  80]
 [343 128  84]
 [187  33 165]]


In [17]:
p = clf_lsvm_kfold.predict(test_tfidf)
xy = pd.to_numeric(df_test['Class'])
print('Test Accuracy:'+str(np.mean(p == xy)))

print(metrics.classification_report(xy, p))

print('Confusion Matrix:')
nb_confusion_matrix = metrics.confusion_matrix(xy, p)
print(nb_confusion_matrix)

Test Accuracy:0.597894736842
             precision    recall  f1-score   support

         -1       0.63      0.83      0.72       960
          0       0.64      0.19      0.30       555
          1       0.48      0.60      0.54       385

avg / total       0.61      0.60      0.56      1900

Confusion Matrix:
[[798  40 122]
 [324 107 124]
 [135  19 231]]


In [18]:
p = eclf.predict(df_test['tweet'])
xy = pd.to_numeric(df_test['Class'])
print('Test Accuracy:'+str(np.mean(p == xy)))

print(metrics.classification_report(xy, p))

print('Confusion Matrix:')
nb_confusion_matrix = metrics.confusion_matrix(xy, p)
print(nb_confusion_matrix)

Test Accuracy:0.609473684211
             precision    recall  f1-score   support

         -1       0.62      0.87      0.73       960
          0       0.68      0.20      0.30       555
          1       0.54      0.55      0.54       385

avg / total       0.62      0.61      0.57      1900

Confusion Matrix:
[[836  38  86]
 [348 109  98]
 [159  13 213]]
