# Import Data and Libraries

In [1]:
import re # for regular expressions
import pandas as pd 
pd.set_option("display.max_colwidth", 200)
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk # for text manipulation
import warnings 
import gensim

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score
from sklearn import metrics

import random
random.seed(10) #choose seed to make results consistent and allow for tweaking

In [2]:
#import data
train  = pd.read_csv('train_E6oV3lV.csv')
test = pd.read_csv('test_tweets_anuFYb8.csv')

In [3]:
train.shape, test.shape

((31962, 3), (17197, 2))

In [4]:
train.head(10)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo
6,7,0,@user camping tomorrow @user @user @user @user @user @user @user dannyâ¦
7,8,0,the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams #hate #imagine #actorslife #revolutionschool #girl
8,9,0,we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers â¦
9,10,0,@user @user welcome here ! i'm it's so #gr8 !


# Split Data, Clean, and Vectorize with Multiple Methods

In [5]:
test.head(10)

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedication #willpower to find #newmaterialsâ¦
1,31964,@user #white #supremacists want everyone to see the new â #birdsâ #movie â and hereâs why
2,31965,safe ways to heal your #acne!! #altwaystoheal #healthy #healing!!
3,31966,"is the hp and the cursed child book up for reservations already? if yes, where? if no, when? ððð #harrypotter #pottermore #favorite"
4,31967,"3rd #bihday to my amazing, hilarious #nephew eli ahmir! uncle dave loves you and missesâ¦"
5,31968,choose to be :) #momtips
6,31969,something inside me dies ð¦ð¿â¨ eyes ness #smokeyeyes #tired #lonely #sof #grungeâ¦
7,31970,#finished#tattoo#inked#ink#loveitâ¤ï¸ #â¤ï¸â¤ï¸â¤ï¸â¤ï¸ #thanks#aleeee !!!
8,31971,@user @user @user i will never understand why my dad left me when i was so young.... :/ #deep #inthefeels
9,31972,#delicious #food #lovelife #capetown mannaepicure #resturantâ¦


In [6]:
X = train.tweet
y = train.label
X_pred = test.tweet

# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .01)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(X_pred.shape)

(31642,)
(320,)
(31642,)
(320,)
(17197,)


First, create Bag of Words data

In [7]:
# instantiate the vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer() #tweaking settings to be more restrictive made for worse results

In [8]:
# combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm   # examine the document-term matrix

<31642x41139 sparse matrix of type '<class 'numpy.int64'>'
	with 363220 stored elements in Compressed Sparse Row format>

In [9]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_pred_dtm = vect.transform(X_pred) #prepare prediction data
X_test_dtm

<320x41139 sparse matrix of type '<class 'numpy.int64'>'
	with 3404 stored elements in Compressed Sparse Row format>

Next, create TD-IDF files

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer() #max_df=0.90, min_df=2, max_features=1000, stop_words='english'
xtrain_tfidf = tfidf_vectorizer.fit_transform(X_train)
xtest_tfidf = tfidf_vectorizer.transform(X_test)
xpred_tfidf = tfidf_vectorizer.transform(X_pred) #prepare prediction data
#tfidf.shape

# Build Naive Bayes Model

Start with Bag of Words

In [11]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [12]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

Wall time: 21 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
# Test accuracy and get confusion matrix

# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)
# calculate accuracy of class predictions
print(metrics.accuracy_score(y_test, y_pred_class))
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

0.95625


array([[301,   1],
       [ 13,   5]], dtype=int64)

In [14]:
#create test data set prediction
test_pred = nb.predict(X_pred_dtm)
test['label'] = test_pred
submission = test[['id','label']]
submission.to_csv('sub_nb_bow.csv', index=False) # writing data to a CSV file

Now do TD-IDF

In [15]:
#train
%time nb.fit(xtrain_tfidf, y_train)

Wall time: 17.7 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
# Test accuracy and get confusion matrix

# make class predictions for xtest_tfidf
y_pred_tdidf = nb.predict(xtest_tfidf)
# calculate accuracy of class predictions
print(metrics.accuracy_score(y_test, y_pred_tdidf))
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_tdidf)

0.946875


array([[302,   0],
       [ 17,   1]], dtype=int64)

In [17]:
#create test data set prediction
test_pred_tdidf = nb.predict(xpred_tfidf)
test['label'] = test_pred_tdidf
submission = test[['id','label']]
submission.to_csv('sub_nb_tdidf.csv', index=False) # writing data to a CSV file

# Build Logistic-Regression Model

Start with Bag of Words

In [18]:
from sklearn.linear_model import LogisticRegression


In [19]:
lreg = LogisticRegression(solver = 'liblinear')
%time lreg.fit(X_train_dtm, y_train) # training the model

prediction = lreg.predict_proba(X_test_dtm) # predicting on the validation set
prediction_int = prediction[:,1] >= 0.3 # if prediction is greater than or equal to 0.3 than 1 else 0
prediction_int = prediction_int.astype(np.int)
print(metrics.accuracy_score(y_test, prediction_int))
# print the confusion matrix
metrics.confusion_matrix(y_test, prediction_int)

Wall time: 1.61 s
0.9625


array([[299,   3],
       [  9,   9]], dtype=int64)

In [20]:
test_pred = lreg.predict_proba(X_pred_dtm)
test_pred_int = test_pred[:,1] >= 0.3
test_pred_int = test_pred_int.astype(np.int)
test['label'] = test_pred_int
submission = test[['id','label']]
submission.to_csv('sub_lreg_bow.csv', index=False) # writing data to a CSV file

Next do TD-IDF

In [21]:
%time lreg.fit(xtrain_tfidf, y_train)

prediction_tdidf = lreg.predict_proba(xtest_tfidf)
prediction_int_tdidf = prediction_tdidf[:,1] >= 0.3
prediction_int_tdidf = prediction_int_tdidf.astype(np.int)
print(metrics.accuracy_score(y_test, prediction_int_tdidf))
# print the confusion matrix
metrics.confusion_matrix(y_test, prediction_int_tdidf)

Wall time: 342 ms
0.965625


array([[301,   1],
       [ 10,   8]], dtype=int64)

In [22]:
test_pred_tdidf = lreg.predict_proba(xpred_tfidf)
test_pred_int_tdidf = test_pred_tdidf[:,1] >= 0.3
test_pred_int_tdidf = test_pred_int_tdidf.astype(np.int)
test['label'] = test_pred_int_tdidf
submission = test[['id','label']]
submission.to_csv('sub_lreg_tdidf.csv', index=False) # writing data to a CSV file

# Build SVM Model

Bag of Words First

In [23]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(kernel='linear', gamma='auto') #
%time SVM.fit(X_train_dtm, y_train)

Wall time: 1min 56s


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [24]:
# Test accuracy and get confusion matrix

# make class predictions for xtest_tfidf
y_pred_SVM = SVM.predict(X_test_dtm)
# calculate accuracy of class predictions
print(metrics.accuracy_score(y_test, y_pred_SVM))
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_SVM)

0.98125


array([[302,   0],
       [  6,  12]], dtype=int64)

In [25]:
#create test data set prediction
test_pred_svm = SVM.predict(X_pred_dtm)
test['label'] = test_pred_svm
submission = test[['id','label']]
submission.to_csv('sub_svm_bow.csv', index=False) # writing data to a CSV file

Next TD-IDF

In [26]:
#fit training data
%time SVM.fit(xtrain_tfidf, y_train)

Wall time: 1min 24s


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [27]:
# Test accuracy and get confusion matrix

# make class predictions for xtest_tfidf
y_pred_tdidf = SVM.predict(xtest_tfidf)
# calculate accuracy of class predictions
print(metrics.accuracy_score(y_test, y_pred_tdidf))
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_tdidf)

0.96875


array([[302,   0],
       [ 10,   8]], dtype=int64)

In [28]:
#create test data set prediction
test_pred_svm = SVM.predict(xpred_tfidf)
test['label'] = test_pred_svm
submission = test[['id','label']]
submission.to_csv('sub_svm_tdidf.csv', index=False) # writing data to a CSV file