In [140]:
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

sns.set()

In [141]:
data = pd.read_csv("./data/cleaned.csv")

In [142]:
data = data.dropna()

# Preprocess Data

In [162]:
vectorizer = CountVectorizer()

In [163]:
X = vectorizer.fit_transform(data.no_stopwords)
y = data.rating > data.rating.median()

In [164]:
trainX, valX, trainy, valy = train_test_split(X, y, train_size=0.8, random_state=486)
valX, testX, valy, testy = train_test_split(valX, valy, train_size=0.5, random_state=486)

# Train SVC

In [165]:
clf = MultinomialNB()
clf.fit(trainX, trainy)

MultinomialNB()

## Evaluate NaiveBayes

In [166]:
f1_score(valy, clf.predict(valX))

0.8107606679035251

In [167]:
accuracy_score(valy, clf.predict(valX))

0.8278481012658228

In [168]:
roc_auc_score(valy, clf.predict_proba(valX)[:, 1])

0.9052555890407818

## Features with highest and lowest coefs

It looks like most of the features with extreme coefficients are actually usernames. Let's try to filter those out.

In [169]:
np.array(vectorizer.get_feature_names())[np.argsort(clf.coef_)[0, -50:]]

array(['happy', 'so', 'not', 'like', 'blue', 'win', 'we', 'composition',
       'day', 'angle', 'perfect', 'game', 'back', 'lovely', 'chair',
       'camera', 'it', 'pillows', 'view', 'more', 'strong', 'pineapple',
       'color', 'new', 'one', 'work', 'lit', 'points', 'book', 'add',
       'update', 'wall', 'skype', 'rooms', 'set', 'pillow', 'lamp',
       'books', 'flowers', 'nice', 'up', 'light', 'lighting', 'good',
       'depth', 'well', 'plant', 'great', 'love', 'art'], dtype='<U28')

In [170]:
np.array(vectorizer.get_feature_names())[np.argsort(clf.coef_)[0, :50]]

array(['kidnappers', 'relight', 'gwaynemiller', 'relieved', 'reliably',
       'gwlichtenstein', 'gwtvcrossfire', 'religion', 'relevant',
       'gymjordan', 'relevance', 'relegated', 'hackingdave', 'releases',
       'hadasthier', 'relevancy', 'hades', 'guypratt', 'religious',
       'gunnut', 'remembers', 'remedial', 'guru', 'gussy',
       'remaxadamcontos', 'guylafleur', 'guston', 'remaining', 'guts',
       'remainders', 'remainder', 'relocation', 'guyfieri', 'remake',
       'hadinili', 'release', 'hagarchemali', 'halted', 'reign', 'ham',
       'reidepstein', 'reichlinmelnick', 'hamilton_lane', 'reigns',
       'hammer', 'rehang', 'hamper', 'hampshire', 'rehabilitation',
       'hamster'], dtype='<U28')

In [171]:
at_re = re.compile(r"@[\w_]+")
def strip_ats(tweet):
    return at_re.sub("", tweet)

hash_re = re.compile(r"#\w+")
def strip_hashtags(tweet):
    return hash_re.sub("", tweet)

In [153]:
X = vectorizer.fit_transform(data.cleaned.apply(strip_ats).apply(strip_hashtags))
y = data.rating > data.rating.median()

trainX, valX, trainy, valy = train_test_split(X, y, train_size=0.8, random_state=486)
valX, testX, valy, testy = train_test_split(valX, valy, train_size=0.5, random_state=486)

In [172]:
clf.fit(trainX, trainy)
f1_score(valy, clf.predict(valX)), accuracy_score(valy, clf.predict(valX)), roc_auc_score(valy, clf.predict_proba(valX)[:, 1])

(0.8107606679035251, 0.8278481012658228, 0.9052555890407818)

In [173]:
np.array(vectorizer.get_feature_names())[np.argsort(clf.coef_)[0, -50:]]

array(['happy', 'so', 'not', 'like', 'blue', 'win', 'we', 'composition',
       'day', 'angle', 'perfect', 'game', 'back', 'lovely', 'chair',
       'camera', 'it', 'pillows', 'view', 'more', 'strong', 'pineapple',
       'color', 'new', 'one', 'work', 'lit', 'points', 'book', 'add',
       'update', 'wall', 'skype', 'rooms', 'set', 'pillow', 'lamp',
       'books', 'flowers', 'nice', 'up', 'light', 'lighting', 'good',
       'depth', 'well', 'plant', 'great', 'love', 'art'], dtype='<U28')

In [174]:
np.array(vectorizer.get_feature_names())[np.argsort(clf.coef_)[0, :50]]

array(['kidnappers', 'relight', 'gwaynemiller', 'relieved', 'reliably',
       'gwlichtenstein', 'gwtvcrossfire', 'religion', 'relevant',
       'gymjordan', 'relevance', 'relegated', 'hackingdave', 'releases',
       'hadasthier', 'relevancy', 'hades', 'guypratt', 'religious',
       'gunnut', 'remembers', 'remedial', 'guru', 'gussy',
       'remaxadamcontos', 'guylafleur', 'guston', 'remaining', 'guts',
       'remainders', 'remainder', 'relocation', 'guyfieri', 'remake',
       'hadinili', 'release', 'hagarchemali', 'halted', 'reign', 'ham',
       'reidepstein', 'reichlinmelnick', 'hamilton_lane', 'reigns',
       'hammer', 'rehang', 'hamper', 'hampshire', 'rehabilitation',
       'hamster'], dtype='<U28')

In [175]:
np.round(np.argsort(clf.coef_)[0, :50]/ np.argsort(clf.coef_)[0, 49] * 100)

array([129., 192.,  99., 192., 192.,  99.,  99., 192., 192.,  99., 192.,
       192.,  99., 192.,  99., 192.,  99.,  99., 192.,  99., 193., 193.,
        99.,  99., 192.,  99.,  99., 192.,  99., 192., 192., 192.,  99.,
       192.,  99., 192.,  99., 100., 192., 100., 192., 192., 100., 192.,
       100., 192., 100., 100., 192., 100.])