In [25]:
# import all the packages
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [26]:
# load in the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [27]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

In [28]:
# split out feature and target
y = train.sentiment
X = train.message

In [29]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

In [30]:
# split data
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=.3,shuffle=True, stratify=y, random_state=11)

In [31]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_val)



In [32]:
# getting our test set ready
testx = test.message
test_vect = vectorizer.transform(testx)

In [33]:
y_pred = rfc.predict(test_vect)

In [34]:
test['sentiment'] = y_pred

In [35]:
test.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,2
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


In [24]:
f1_score(y_test, rfc_pred, average="macro")

0.5494048472156572

In [36]:
test[['tweetid','sentiment']].to_csv('testsubmission.csv', index=False)

In [37]:
train

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954
...,...,...,...
15814,1,RT @ezlusztig: They took down the material on ...,22001
15815,2,RT @washingtonpost: How climate change could b...,17856
15816,0,notiven: RT: nytimesworld :What does Trump act...,384248
15817,-1,RT @sara8smiles: Hey liberals the climate chan...,819732


In [40]:
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [48]:
test.sentiment.value_counts()

 1    7259
 2    1646
 0    1345
-1     296
Name: sentiment, dtype: int64

In [50]:
names = ['Logistic Regression', 'Multinomial NB',
         'Random Forest Classifier', 'Ada Boost Classifier',
         'LinearSVC']

classifiers = [
    LogisticRegression(),
    MultinomialNB(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    LinearSVC()
]

In [52]:
from sklearn import metrics

In [58]:
results = []

models = {}
confusion = {}
class_report = {}


for name, clf in zip(names, classifiers):
    print ('Fitting {:s} model...'.format(name))
    run_time = %timeit -q -o clf.fit(X_train, y_train)

    print ('... predicting')
    y_pred = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)

    print ('... scoring')
    accuracy  = metrics.accuracy_score(y_train, y_pred)
    precision = metrics.precision_score(y_train, y_pred,average='macro')
    recall    = metrics.recall_score(y_train, y_pred,average='macro')

    f1        = metrics.f1_score(y_train, y_pred,average='macro')
    f1_test   = metrics.f1_score(y_test, y_pred_test,average='macro')

    # Save the results to dictionaries
    models[name] = clf
    confusion[name] = metrics.confusion_matrix(y_train, y_pred)
    class_report[name] = metrics.classification_report(y_train, y_pred)

    results.append([name, accuracy, precision, recall, f1, f1_test, run_time.best])


results = pd.DataFrame(results, columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Train', 'F1 Test', 'Train Time'])
results.set_index('Classifier', inplace= True)

Fitting Logistic Regression model...
... predicting
... scoring
Fitting Multinomial NB model...
... predicting
... scoring
Fitting Random Forest Classifier model...




... predicting
... scoring
Fitting Ada Boost Classifier model...
... predicting
... scoring
Fitting LinearSVC model...
... predicting
... scoring


In [59]:
results.sort_values('F1 Train', ascending=False)

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Train,F1 Test,Train Time
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LinearSVC,0.987718,0.990208,0.982592,0.986338,0.643597,0.283139
Random Forest Classifier,0.981035,0.987766,0.972393,0.979868,0.546662,3.327471
Logistic Regression,0.811524,0.881756,0.651366,0.709054,0.552788,0.308354
Multinomial NB,0.7465,0.895333,0.528136,0.572624,0.456802,0.006848
Ada Boost Classifier,0.6384,0.645844,0.47295,0.51115,0.472957,9.434471


In [84]:
# Linear SVC
svc = LinearSVC()
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_val)


In [63]:
y_pred = svc.predict(test_vect)
test['sentiment'] = y_pred

In [64]:
f1_score(y_test, svc_pred, average="macro")

0.6435966458730891

In [65]:
test.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


In [66]:
from sklearn.model_selection import GridSearchCV

In [74]:
nfolds = [1,2,4,6,8,10]
Cs = [0.001,0.01,0.1,1,10]

param_grid = {'C':Cs}

In [77]:
grid_svm = GridSearchCV(estimator=svc,param_grid = param_grid,cv=2)
grid_svm.fit(X_train,y_train)

GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [78]:
grid_svm.best_params_

{'C': 1}

In [79]:
test.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


In [80]:
test[['tweetid','sentiment']].to_csv('testsubmission.csv', index=False)

In [85]:
from sklearn.model_selection import cross_val_score

In [88]:
model = models['LinearSVC']
print(cross_val_score(model,X_train,y_train))



[0.73192526 0.72818428 0.74254743]
