In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import matplotlib as mlp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

Reading original file

In [2]:
# Load the 'Obama' sheet of the training workbook. The sheet is passed
# positionally because the keyword was renamed from `sheetname` to `sheet_name`
# across pandas releases; the positional form is accepted by both spellings.
df1 = pd.read_excel('training-Obama-Romney-tweets.xlsx', 'Obama')
inputFrame = df1
# Drop every row that has a missing value. inputFrame is another name for df1,
# so df1 is modified too.
inputFrame.dropna(inplace = True)
print('Total number of tweets:'+str(len(inputFrame)))
print('top few tweets:')
inputFrame.head()

Total number of tweets:7182
top few tweets:


Unnamed: 0,date,time,Anootated tweet,Class
0,2012-10-16 00:00:00,10:28:53-05:00,"Kirkpatrick, who wore a baseball cap embroider...",0
1,2016-12-10 00:00:00,10:09:00-05:00,Question: If <e>Romney</e> and <e>Obama</e> ha...,2
2,2012-10-16 00:00:00,10:04:30-05:00,#<e>obama</e> debates that Cracker Ass Cracker...,1
3,2012-10-16 00:00:00,10:00:36-05:00,RT @davewiner Slate: Blame <e>Obama</e> for fo...,2
4,2012-10-16 00:00:00,09:50:08-05:00,@Hollivan @hereistheanswer Youre missing the ...,0


Preprocessing (cleaning the tweets)

In [3]:
# Compare labels as strings so the three class filters below are uniform.
inputFrame_test = inputFrame.copy()
inputFrame_test['Class'] = inputFrame_test['Class'].astype(str)

# Keep only rows labelled 1, -1 or 0; any other label is dropped.
inputFrame1 = inputFrame_test.loc[inputFrame_test['Class'] == '1']
inputFrame2 = inputFrame_test.loc[inputFrame_test['Class'] == '-1']
inputFrame3 = inputFrame_test.loc[inputFrame_test['Class'] == '0']

# Rows are stacked in the order 1, -1, 0.
inputFrame = pd.concat([inputFrame1, inputFrame2, inputFrame3])
tweetProcessFrame = inputFrame.copy()

In [4]:
# Cleaning tweets

# Opinion lexicons: one entry per line of each file, surrounding whitespace stripped.
# They are kept as sets because replace() only ever tests `word in words`, and a
# set answers that in constant time instead of scanning a list for every token.
with open("positive-words.txt") as f:
    pwords = set(el.strip() for el in f)
with open("negative-words.txt") as f:
    nwords = set(el.strip() for el in f)

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

# Emoticon -> polarity label; filled in by populateEmoticonsDict().
emoticons_dict = {}
def populateEmoticonsDict(path='EmoticonsWithPolarity.txt', target=None):
    """Load the emoticon -> polarity mapping from a text file.

    Each line is split on single spaces. The last field is the polarity label
    and every preceding field is an emoticon that is mapped to that label.

    Parameters
    ----------
    path : str
        File to read. Defaults to the file name this notebook has always used.
    target : dict or None
        Dictionary to fill. When None, the module-level ``emoticons_dict`` is
        updated, which is what a bare ``populateEmoticonsDict()`` call does.

    Returns
    -------
    None
    """
    if target is None:
        target = emoticons_dict
    # `with` closes the file even if a line fails to parse.
    with open(path, 'r') as fileHandler:
        for line in fileHandler:
            # Strip only the newline: slicing off the last character would eat
            # part of the label when the final line has no trailing newline.
            fields = line.rstrip('\n').split(' ')
            sentiment = fields[-1]
            for emoticon in fields[:-1]:
                # Consecutive spaces yield empty fields; an '' key would make
                # mapEmoticons rewrite empty tokens, so skip them.
                if emoticon:
                    target[emoticon] = sentiment
populateEmoticonsDict()

def replace(tweet,words,replacement):
    """Return a new list where every item of ``tweet`` found in ``words`` is swapped for ``replacement``.

    ``tweet`` is iterated item by item (the notebook passes a list of tokens);
    items not contained in ``words`` are copied through unchanged.
    """
    return [replacement if token in words else token for token in tweet]

def processTweet(tweet):
    """Run one raw tweet through the cleaning pipeline and return a list of stemmed tokens.

    Steps, in order:
      1. mapEmoticons   - swap known emoticons for their polarity label
      2. removeTags     - rewrite markup/links/mentions/digits, drop non-letters
      3. lower-case the text
      4. replace        - tokens in ``pwords`` become 'positive', then tokens
                          in ``nwords`` become 'negative'
      5. removePunctuations, removeStopWords (tokenizes), stemWords
    """
    cleaned = removeTags(mapEmoticons(tweet)).lower()

    # Lexicon substitution works on space-separated tokens.
    tokens = replace(cleaned.split(' '), pwords, 'positive')
    tokens = replace(tokens, nwords, 'negative')

    rejoined = removePunctuations(' '.join(tokens))
    return stemWords(removeStopWords(rejoined, stop_words))

def stemWords(tweet):
    """Stem each token of ``tweet`` with the module-level ``stemmer`` and return the stems as a list."""
    stems = []
    for token in tweet:
        stems.append(stemmer.stem(token))
    return stems

def mapEmoticons(tweet):
    """Replace every space-separated token found in ``emoticons_dict`` with its mapped value.

    The result is rebuilt by prefixing each token with a single space, so the
    returned string always starts with a space.
    """
    pieces = (emoticons_dict.get(word, word) for word in tweet.split(' '))
    return ''.join(' ' + piece for piece in pieces)
    

def removeTags(tweet):
    """Rewrite markup-like spans to placeholder words and strip everything that is not a letter.

    In order:
      1. HTML-style tags (``<e>``, ``</a>`` ...), http(s) URLs, a standalone
         ``RT``, ``@...`` mentions and digit runs are each replaced by
         ``' weblink '``.
      2. Remaining ``@name`` mentions would become ``' usermention '``.
      3. Every character that is neither whitespace nor an ASCII letter is removed.
      4. Runs of whitespace collapse to one space.

    The patterns are raw strings: the original non-raw literals relied on
    invalid escape sequences (``\\/``, ``\\s``, ``\\d``), which newer Python
    versions warn about. The pattern text itself is unchanged.
    """
    cleanr = re.compile(r'(</?[a-zA-Z]+>|https?:\/\/[^\s]*|(^|\s)RT(\s|$)|@[^\s]+|\d+)')
    cleantext = cleanr.sub(' weblink ', tweet)
    # Step 1 already consumed every '@' that is followed by a non-space
    # character, so this substitution has nothing left to match; kept so the
    # function's behaviour stays exactly as before.
    cleantext = re.sub(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)', ' usermention ', cleantext)
    cleantext = re.sub(r'[^\sa-zA-Z]+', '', cleantext)
    cleantext = re.sub(r'\s+', ' ', cleantext)
    return cleantext

def removePunctuations(tweet):
    """Return ``tweet`` with every ``string.punctuation`` character replaced by a space.

    The previous version built the cleaned string but returned the untouched
    input, so it never removed anything. Inside processTweet the input has
    already had its non-letter characters removed by removeTags.
    """
    exclude = set(string.punctuation)
    return ''.join(' ' if ch in exclude else ch for ch in tweet)

def removeStopWords(tweet, stop_words):
    """Tokenize ``tweet`` with ``word_tokenize`` and return the token list.

    Despite the name, no stop words are removed: ``stop_words`` is accepted
    but unused, because the filter is disabled. The disabled form was
    ``[w for w in tokens if w not in stop_words]``.
    """
    return word_tokenize(tweet)

In [5]:
# The source column is spelled 'Anootated tweet' in the workbook; give it a short name.
tweetProcessFrame.rename(columns={'Anootated tweet': 'tweet'}, inplace=True)
# Each tweet becomes a list of cleaned, stemmed tokens.
tweetProcessFrame['tweet'] = tweetProcessFrame['tweet'].apply(processTweet)
# Timestamps are not used as features.
tweetProcessFrame.drop(['time', 'date'], axis=1, inplace=True)
def joinList1(tweetList):
    """Join the tokens in ``tweetList`` into one space-separated string."""
    separator = " "
    return separator.join(tweetList)

# Turn each token list back into a single string for the vectorizers.
tweetProcessFrame['tweet'] = tweetProcessFrame['tweet'].apply(joinList1)
# Rows that are identical after cleaning are dropped for modelling. Note that
# the preview below still shows the frame before de-duplication.
tweetProcessFrame1 = tweetProcessFrame.drop_duplicates()
print('tweets after preprocessing')
tweetProcessFrame.head()

tweets after preprocessing


Unnamed: 0,tweet,Class
2,weblink obama weblink debat that cracker ass c...,1
25,still my posit mr presid weblink obama weblink,1
29,weblink you said so much posit thing about web...,1
32,weblink im a south african and i say weblink o...,1
44,okay weblink obama weblink it is time to put y...,1


Splitting the data into training (80%) and testing (20%) sets

In [6]:
#train, test = train_test_split(tweetProcessFrame, test_size = 0.2)
def splitTrainingData(df, train_data_prcnt=80, random_state=None):
    """Randomly split ``df`` row-wise into a train part and a test part.

    Each row independently lands in the train part with probability
    ``train_data_prcnt / 100``, so the split sizes are approximate.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    train_data_prcnt : number
        Expected percentage of rows sent to the train part.
    random_state : int or None
        Seed for a private generator, giving a reproducible split. When None,
        NumPy's global generator is used, exactly as before.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Float division keeps the threshold correct even under Python 2 semantics.
    msk = rng.rand(len(df)) < train_data_prcnt / 100.0
    return df[msk], df[~msk]
# Seed NumPy's global generator so the reordering and the final split below are
# the same on every Restart & Run All (splitTrainingData draws from np.random).
np.random.seed(42)
tweet_random_df = tweetProcessFrame1.copy()
# Reorder the rows: each pass moves a random ~80% of the rows ahead of the rest.
for i in range(0, 50):
    split1_df, split2_df = splitTrainingData(tweet_random_df)
    tweet_random_df = pd.concat([split1_df, split2_df])
# Final ~80/20 train/test split used by the model cells.
train, test = splitTrainingData(tweet_random_df)

In [7]:
# Features are the cleaned tweet strings; labels are converted to numbers
# because 'Class' was cast to str during filtering.
train_data = train['tweet']
train_label = pd.to_numeric(train['Class'])

test_data = test['tweet']
test_class = pd.to_numeric(test['Class'])

In [8]:
# Unigram + bigram counts, capped at 4800 features, learned from the training tweets only.
count_vect = CountVectorizer(ngram_range=(1, 2), max_features=4800)
X_train_counts = count_vect.fit_transform(train_data)

# TF-IDF weighting fitted on the training counts, then applied to them.
tf_transformer = TfidfTransformer(use_idf=True)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

Training Multinomial Naive Bayes

In [9]:
# Multinomial Naive Bayes on the TF-IDF training matrix.
# The commented-out dict is the alpha / fit_prior grid that was explored.
# mnbd = {   
#             'alpha':[(float)(el/1000) for el in range(0,100,1)], 
#             'fit_prior':(True,False)
#         }

# NOTE(review): alpha=0.99 lies outside the 0.000-0.099 range of the grid above,
# and the voting and cross-validation cells use 0.094 and 0.099 -- confirm that
# 0.99 is intended and not a typo.
clf = MultinomialNB(alpha= 0.99, fit_prior= True).fit(X_train_tf, train_label)
#GridSearchCV(MultinomialNB(),mnbd).fit(X_train_tf, train_label)
#clf.best_params_
# clf = MultinomialNB(alpha= 0.99, fit_prior= True).fit(X_train_tf, train_label)
# Transform the test tweets with the vectorizer and tf-idf transformer that
# were fitted on the training data.
X_test_counts = count_vect.transform(test_data)
X_test_tfidf = tf_transformer.transform(X_test_counts)
predicted = clf.predict(X_test_tfidf)

# Accuracy = fraction of test rows whose prediction equals the label.
print('Test Accuracy:'+str(np.mean(predicted == test_class)))

print(metrics.classification_report(test_class, predicted))

print('Confusion Matrix:')
nb_confusion_matrix = metrics.confusion_matrix(test_class, predicted)
print(nb_confusion_matrix)

# Accuracy on the rows the model was trained on, for comparison with the test figure.
predicted_train = clf.predict(X_train_tf)
print('Train Accuracy:'+str(np.mean(predicted_train == train_label)))

Test Accuracy:0.616246498599
             precision    recall  f1-score   support

         -1       0.63      0.70      0.67       402
          0       0.58      0.57      0.57       381
          1       0.65      0.56      0.60       288

avg / total       0.62      0.62      0.61      1071

Confusion Matrix:
[[282  88  32]
 [108 217  56]
 [ 55  72 161]]
Train Accuracy:0.762893503014


Support Vector Machine (Linear SVC)

In [10]:
# Linear SVC inside a CountVectorizer -> TfidfTransformer -> classifier pipeline.
# The commented-out block records the grid search that was run over the
# vectorizer, tf-idf and classifier settings.
# parameterssvm = {'vect__ngram_range': [(1, 1), (1, 2)],
#               'vect__max_features': list(range(500,5500,100)),
#               'tfidf__use_idf': (True, False),
#               'clf__loss' : ['hinge', 'squared_hinge'],
#               'clf__penalty' : ['l2'],
#               'clf__C' : [i/10 for i in range(5,9)],
#               'clf__multi_class' : ['crammer_singer','ovr']
#              }

# text_clf_svm = Pipeline([('vect', CountVectorizer()),
#                     ('tfidf', TfidfTransformer()),
#                     ('clf', LinearSVC(random_state= 42, max_iter=10000))])
# text_clf_svm = text_clf_svm.fit(train_data,train_label)
# text_clf_svm = GridSearchCV(text_clf_svm,parameterssvm,n_jobs=-1).fit(train_data, train_label)
# text_clf_svm.best_params_

# Presumably the best_params_ reported by the search above; the pipeline below
# hard-codes the same values.
# {'clf__C': 0.5,
#  'clf__loss': 'hinge',
#  'clf__multi_class': 'ovr',
#  'clf__penalty': 'l2',
#  'tfidf__use_idf': True,
#  'vect__max_features': 4800,
#  'vect__ngram_range': (1, 2)}

# The pipeline takes raw (cleaned) tweet strings, so it is fitted on train_data
# directly rather than on the precomputed TF-IDF matrix.
text_clf_svm = Pipeline([('vect', CountVectorizer(max_features=4800,ngram_range=(1,2))),
                    ('tfidf', TfidfTransformer(use_idf=True)),
                    ('clf', LinearSVC(C=0.5,loss='hinge',multi_class='ovr',penalty='l2',random_state= 42, max_iter=10000))]).fit(train_data, train_label)

predicted_svm = text_clf_svm.predict(test_data)
print('Test Accuracy:'+str(np.mean(predicted_svm == test_class)))
print(metrics.classification_report(test_class, predicted_svm))

print('Confusion Matrix:')
linearSVM_confusion_matrix = metrics.confusion_matrix(test_class, predicted_svm)
print(linearSVM_confusion_matrix)

# Accuracy on the training rows, for comparison with the test figure.
predicted_svm_train = text_clf_svm.predict(train_data)
print('Train Accuracy:'+str(np.mean(predicted_svm_train == train_label)))

Test Accuracy:0.629318394024
             precision    recall  f1-score   support

         -1       0.63      0.73      0.68       402
          0       0.61      0.54      0.58       381
          1       0.64      0.60      0.62       288

avg / total       0.63      0.63      0.63      1071

Confusion Matrix:
[[295  74  33]
 [111 206  64]
 [ 60  55 173]]
Train Accuracy:0.789461933467


Voting classifier

In [11]:
# Hard-voting ensemble of four classifiers that share one
# CountVectorizer -> TfidfTransformer front end.
clf1 = MultinomialNB(alpha= 0.094, fit_prior= True)
# NOTE(review): `n_iter` was deprecated in later scikit-learn releases in favour
# of `max_iter` -- confirm against the installed version.
clf2 = SGDClassifier(alpha=0.001,learning_rate='optimal',loss= 'epsilon_insensitive', penalty= 'l2',n_iter = 100, random_state=42)
# NOTE(review): unlike the standalone Linear SVC cell, no max_iter is passed
# here -- confirm the difference is intended.
clf3 = LinearSVC(C = 0.5, loss = 'hinge', random_state= 42)
clf4 = RandomForestClassifier(n_estimators = 22, class_weight = 'balanced_subsample', random_state = 42,criterion="gini")

# voting='hard': the ensemble predicts the class chosen by most of the four members.
eclf = Pipeline([('vect', CountVectorizer(max_features=4800,ngram_range=(1,2))),
                    ('tfidf', TfidfTransformer(use_idf= True)),
                    ('clf', VotingClassifier(estimators=[('mnb', clf1), ('sgd', clf2), ('svm', clf3), ('rf',clf4)], voting='hard'))])

eclf = eclf.fit(train_data,train_label)

p = eclf.predict(test_data)
print('Test Accuracy:'+str(np.mean(p==test_class)))
print(metrics.classification_report(test_class, p))

print('Confusion Matrix:')
voting_confusion_matrix = metrics.confusion_matrix(test_class,p)
print(voting_confusion_matrix)

# Accuracy on the training rows, for comparison with the test figure.
predicted_eclf_train = eclf.predict(train_data)
print('Train Accuracy:'+str(np.mean(predicted_eclf_train == train_label)))

Test Accuracy:0.621848739496
             precision    recall  f1-score   support

         -1       0.61      0.76      0.68       402
          0       0.62      0.52      0.57       381
          1       0.64      0.56      0.60       288

avg / total       0.62      0.62      0.62      1071

Confusion Matrix:
[[305  65  32]
 [124 199  58]
 [ 69  57 162]]
Train Accuracy:0.818932797499


Other approaches tried

SGD Classifier

In [12]:
# SGD-trained linear classifier in the same vectorizer -> tf-idf -> classifier pipeline.
# The commented-out block records the grid search that was tried.
# parameterssgd = {'vect__ngram_range': [(1, 1), (1, 2)],
#               'vect__max_features': list(range(100,5000,500)),
#               'tfidf__use_idf': (True, False),
#               'clf__loss' : ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 
#                              'epsilon_insensitive', 'squared_epsilon_insensitive'],
#               'clf__penalty' : ['l2', 'l1', 'elasticnet'],
#               'clf__learning_rate' : ['optimal','invscaling'],
#               'clf__alpha': [i/1000 for i in range(1,10)]
#              }
# text_clf = Pipeline([('vect', CountVectorizer()),
#                     ('tfidf', TfidfTransformer()),
#                     ('clf', SGDClassifier(n_iter = 100, eta0 = 0.0001, random_state=42))])
# text_clf = GridSearchCV(text_clf,parameterssgd,n_jobs=-1).fit(train_data, train_label)
# text_clf.best_params_

# This pipeline caps the vocabulary at 5000 features, whereas the voting
# ensemble uses 4800 with the same SGD settings.
# NOTE(review): `n_iter` was deprecated in later scikit-learn releases in favour
# of `max_iter` -- confirm against the installed version.
text_clf_sgd = Pipeline([('vect', CountVectorizer(max_features=5000,ngram_range=(1,2))),
                    ('tfidf', TfidfTransformer(use_idf= True)),
                    ('clf', SGDClassifier(alpha=0.001,learning_rate='optimal',loss= 'epsilon_insensitive'
                                          ,penalty= 'l2',n_iter = 100, random_state=42))]).fit(train_data, train_label)

predicted_sgd = text_clf_sgd.predict(test_data)
print('Test Accuracy:'+str(np.mean(predicted_sgd == test_class)))
print(metrics.classification_report(test_class, predicted_sgd))

print('Confusion Matrix:')
linearSGD_confusion_matrix = metrics.confusion_matrix(test_class, predicted_sgd)
print(linearSGD_confusion_matrix)

# Accuracy on the training rows, for comparison with the test figure.
predicted_sgd_train = text_clf_sgd.predict(train_data)
print('Train Accuracy:'+str(np.mean(predicted_sgd_train == train_label)))

Test Accuracy:0.615312791783
             precision    recall  f1-score   support

         -1       0.60      0.74      0.66       402
          0       0.63      0.50      0.56       381
          1       0.62      0.59      0.61       288

avg / total       0.62      0.62      0.61      1071

Confusion Matrix:
[[296  65  41]
 [126 192  63]
 [ 69  48 171]]
Train Accuracy:0.753962938156


Random Forest

In [13]:
# Random forest in the same vectorizer -> tf-idf -> classifier pipeline.
# The commented-out block records the grid search that was tried.
# parametersrf = {'vect__ngram_range': [(1, 1), (1, 2)],
#               'vect__max_features': list(range(500,5000,100)),
#               'tfidf__use_idf': (True, False),
#               'clf__criterion' : ['gini','entropy'],
#                 'clf__n_estimators' : list(range(2,25,10)),
#                 'clf__class_weight' : ['balanced_subsample']
#                 }

# text_clf_RandomForest = Pipeline([('vect', CountVectorizer()),
#                     ('tfidf', TfidfTransformer()),
#                     ('clf', RandomForestClassifier(random_state = 42))])
# text_clf_RandomForest = GridSearchCV(text_clf_RandomForest,parametersrf,n_jobs=-1).fit(train_data, train_label)
# text_clf_RandomForest.best_params_

# Settings below match the dict recorded in the comment after the fit call.
text_clf_RandomForest = Pipeline([('vect', CountVectorizer(max_features = 4700,ngram_range=(1,2))),
                    ('tfidf', TfidfTransformer(use_idf=True)),
                    ('clf', RandomForestClassifier(random_state = 42,criterion="gini",class_weight='balanced_subsample'
                                                   ,n_estimators=22))])
text_clf_RandomForest = text_clf_RandomForest.fit(train_data, train_label)
# Presumably the best_params_ reported by the grid search above.
# {'clf__class_weight': 'balanced_subsample',
#  'clf__criterion': 'gini',
#  'clf__n_estimators': 22,
#  'tfidf__use_idf': True,
#  'vect__max_features': 4700,
#  'vect__ngram_range': (1, 2)}
predicted_rf = text_clf_RandomForest.predict(test_data)
print('Test Accuracy:'+str(np.mean(predicted_rf == test_class)))
print(metrics.classification_report(test_class, predicted_rf))

print('Confusion Matrix:')
linearRF_confusion_matrix = metrics.confusion_matrix(test_class, predicted_rf)
print(linearRF_confusion_matrix)

# Accuracy on the training rows, for comparison with the test figure.
predicted_rf_train = text_clf_RandomForest.predict(train_data)
print('Train Accuracy:'+str(np.mean(predicted_rf_train == train_label)))

Test Accuracy:0.567693744164
             precision    recall  f1-score   support

         -1       0.57      0.63      0.60       402
          0       0.56      0.56      0.56       381
          1       0.57      0.49      0.53       288

avg / total       0.57      0.57      0.57      1071

Confusion Matrix:
[[253  97  52]
 [113 214  54]
 [ 77  70 141]]
Train Accuracy:0.995088189328


Preparing Data for Cross Validation

In [14]:
# Pool the earlier train/test split back together: cross-validation makes its
# own folds, so it gets every labelled tweet.
combineFrameData = [train_data, test_data]
combineFrameLabel = [train_label, test_class]
combineTrainDataDF = pd.concat(combineFrameData)
combineTrainLabelDF = pd.concat(combineFrameLabel)

# Fit a dedicated vectorizer on the pooled tweets. Previously
# `count_vect_kfold` was created (unigram only) but never used, and the earlier
# `count_vect` (unigram + bigram, 4800 features) was refit here instead. The
# settings below are the ones that actually produced the features.
count_vect_kfold = CountVectorizer(max_features=4800, ngram_range=(1, 2))
X_train_counts_kfold = count_vect_kfold.fit_transform(combineTrainDataDF)
# The held-out evaluation cell transforms its tweets through `count_vect` and
# then `tf_transformer_kfold`, so `count_vect` must name the pooled-data
# vectorizer from here on. TODO: make that cell use `count_vect_kfold`
# directly and drop this alias.
count_vect = count_vect_kfold

tf_transformer_kfold = TfidfTransformer(use_idf=True).fit(X_train_counts_kfold)
X_train_tf_kfold = tf_transformer_kfold.transform(X_train_counts_kfold)

Cross Validation accuracy Naive Bayes

In [15]:
# 10-fold cross-validated accuracy of Multinomial Naive Bayes on the pooled TF-IDF matrix.
clf_nb_kfold = MultinomialNB(fit_prior=True, alpha=0.099)
scores = cross_val_score(
    clf_nb_kfold,
    X_train_tf_kfold,
    combineTrainLabelDF,
    cv=10,
)
# Also fit this instance on all pooled rows; the held-out evaluation further
# down calls predict() on it.
clf_nb_kfold.fit(X_train_tf_kfold, combineTrainLabelDF)
# Report the mean fold accuracy with a +/- 2 standard deviation band.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))
print(scores)

Accuracy: 0.58 (+/- 0.03)
[ 0.58992806  0.5647482   0.58992806  0.59459459  0.5963964   0.58378378
  0.56936937  0.55495495  0.59386282  0.59312839]


Cross Validation Accuracy Linear SVM

In [16]:
# 10-fold cross-validated accuracy of the Linear SVC on the pooled TF-IDF matrix.
clf_lsvm_kfold = LinearSVC(
    penalty='l2',
    loss='hinge',
    C=0.5,
    max_iter=10000,
    random_state=42,
)
scores_lsvm = cross_val_score(
    clf_lsvm_kfold,
    X_train_tf_kfold,
    combineTrainLabelDF,
    cv=10,
)
# Also fit this instance on all pooled rows; the held-out evaluation further
# down calls predict() on it.
clf_lsvm_kfold.fit(X_train_tf_kfold, combineTrainLabelDF)
# Report the mean fold accuracy with a +/- 2 standard deviation band.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_lsvm.mean(), 2 * scores_lsvm.std()))
print(scores_lsvm)

Accuracy: 0.61 (+/- 0.05)
[ 0.63489209  0.55215827  0.63669065  0.61441441  0.58018018  0.61621622
  0.58558559  0.59099099  0.62274368  0.6238698 ]


Cross Validation SGD

In [17]:
# 10-fold cross-validated accuracy of the SGD classifier on the pooled TF-IDF matrix.
clf_sgd_kfold = SGDClassifier(
    loss='epsilon_insensitive',
    penalty='l2',
    alpha=0.001,
    learning_rate='optimal',
    n_iter=100,
    random_state=42,
)
scores_sgd = cross_val_score(
    clf_sgd_kfold,
    X_train_tf_kfold,
    combineTrainLabelDF,
    cv=10,
)
# Report the mean fold accuracy with a +/- 2 standard deviation band.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_sgd.mean(), 2 * scores_sgd.std()))
print(scores_sgd)

Accuracy: 0.60 (+/- 0.05)
[ 0.63309353  0.55755396  0.64388489  0.60540541  0.58918919  0.61441441
  0.6036036   0.56756757  0.61552347  0.60759494]


Cross Validation accuracy Random Forest

In [18]:
# 10-fold cross-validated accuracy of the random forest on the pooled TF-IDF matrix.
clf_randomForest_kfold = RandomForestClassifier(
    n_estimators=22,
    criterion="gini",
    class_weight='balanced_subsample',
    random_state=42,
)
scores_randomForest = cross_val_score(
    clf_randomForest_kfold,
    X_train_tf_kfold,
    combineTrainLabelDF,
    cv=10,
)
# Report the mean fold accuracy with a +/- 2 standard deviation band.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_randomForest.mean(), 2 * scores_randomForest.std()))
print(scores_randomForest)

Accuracy: 0.57 (+/- 0.05)
[ 0.59352518  0.52877698  0.59532374  0.56756757  0.53333333  0.58378378
  0.57837838  0.54234234  0.56498195  0.56238698]


In [19]:
# import pickle as pickle
# with open('nb_dumped_classifier.pkl', 'wb') as fid:
#     pickle.dump(clf, fid)  
# with open('svm_dumped_classifier.pkl', 'wb') as fid:
#     pickle.dump(text_clf_svm, fid) 
# with open('voting_dumped_classifier.pkl', 'wb') as fid:
#     pickle.dump(eclf, fid) 
# # load a classifier
# with open('svm_dumped_classifier_romney.pkl', 'rb') as fid:
#     gnb_loaded = pickle.load(fid)

In [20]:
# Load the 'Obama' sheet of the held-out test workbook. The sheet is passed
# positionally because the keyword was renamed from `sheetname` to `sheet_name`
# across pandas releases; the positional form is accepted by both spellings.
df_test = pd.read_excel('testing-Obama-Romney-tweets.xlsx', 'Obama')
# Assumes the sheet has exactly two columns (tweet text, label) -- this
# assignment raises otherwise.
df_test.columns = ['tweet','Class']
df_test['Class'] = df_test['Class'].astype(str)
# Keep only rows labelled 1, -1 or 0, stacked in that order (same filter as
# for the training data).
inputFrame1 = df_test[df_test.Class == '1']
inputFrame2 = df_test[df_test.Class == '-1']
inputFrame3 = df_test[df_test.Class == '0']
df_test = pd.concat([inputFrame1, inputFrame2, inputFrame3])
# Same cleaning as the training tweets: token list, then back to one string.
df_test['tweet'] = df_test['tweet'].apply(processTweet)
df_test['tweet'] = df_test['tweet'].apply(joinList1)

# `count_vect` here must be the vectorizer fitted on the pooled train+test
# tweets in the cross-validation preparation cell, so that its columns line up
# with `tf_transformer_kfold` and the *_kfold classifiers.
testcounts = count_vect.transform(df_test['tweet'])
test_tfidf = tf_transformer_kfold.transform(testcounts)

df_test.head()

Unnamed: 0,tweet,Class
2,weblink obama weblink has to maintain his prof...,1
11,weblink obama weblink went into the debat swin...,1
21,ditto i start weblink weblink year ago weblink...,1
36,i absolut posit weblink obama weblink s view i...,1
43,im agre complet with weblink obama weblink s s...,1


In [21]:
# Score the pooled-data Naive Bayes model on the held-out test sheet.
xy = pd.to_numeric(df_test['Class'])
p = clf_nb_kfold.predict(test_tfidf)
print('Test Accuracy:' + str(np.mean(p == xy)))

print(metrics.classification_report(xy, p))

print('Confusion Matrix:')
nb_confusion_matrix = metrics.confusion_matrix(xy, p)
print(nb_confusion_matrix)

Test Accuracy:0.556637621732
             precision    recall  f1-score   support

         -1       0.55      0.60      0.58       688
          0       0.53      0.54      0.53       681
          1       0.60      0.53      0.56       582

avg / total       0.56      0.56      0.56      1951

Confusion Matrix:
[[411 187  90]
 [199 368 114]
 [131 144 307]]


In [22]:
# Score the pooled-data Linear SVC on the held-out test sheet.
xy = pd.to_numeric(df_test['Class'])
p = clf_lsvm_kfold.predict(test_tfidf)
print('Test Accuracy:' + str(np.mean(p == xy)))

print(metrics.classification_report(xy, p))

print('Confusion Matrix:')
# The variable keeps the `nb_` name from the Naive Bayes cell, but the matrix
# stored here is the Linear SVC's.
nb_confusion_matrix = metrics.confusion_matrix(xy, p)
print(nb_confusion_matrix)

Test Accuracy:0.59354177345
             precision    recall  f1-score   support

         -1       0.58      0.67      0.62       688
          0       0.58      0.56      0.57       681
          1       0.62      0.55      0.58       582

avg / total       0.59      0.59      0.59      1951

Confusion Matrix:
[[460 139  89]
 [196 380 105]
 [133 131 318]]


In [23]:
# Score the voting pipeline on the held-out test sheet. `eclf` vectorizes
# internally, so it receives the cleaned tweet strings rather than test_tfidf.
xy = pd.to_numeric(df_test['Class'])
p = eclf.predict(df_test['tweet'])
print('Test Accuracy:' + str(np.mean(p == xy)))

print(metrics.classification_report(xy, p))

print('Confusion Matrix:')
# The variable keeps the `nb_` name from the Naive Bayes cell, but the matrix
# stored here is the voting ensemble's.
nb_confusion_matrix = metrics.confusion_matrix(xy, p)
print(nb_confusion_matrix)

Test Accuracy:0.593029215787
             precision    recall  f1-score   support

         -1       0.57      0.71      0.63       688
          0       0.58      0.53      0.55       681
          1       0.64      0.53      0.58       582

avg / total       0.60      0.59      0.59      1951

Confusion Matrix:
[[486 126  76]
 [224 360  97]
 [138 133 311]]
