In [1]:
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

sns.set()

In [2]:
data = pd.read_csv("./data/cleaned.csv")

In [3]:
data = data.dropna()

# Preprocess Data

In [4]:
vectorizer = CountVectorizer()

In [5]:
X = vectorizer.fit_transform(data.cleaned)
y = data.rating > data.rating.median()

In [6]:
trainX, valX, trainy, valy = train_test_split(X, y, train_size=0.8, random_state=486)
valX, testX, valy, testy = train_test_split(valX, valy, train_size=0.5, random_state=486)

# Train SVC

In [7]:
clf = MultinomialNB()
clf.fit(trainX, trainy)

MultinomialNB()

## Evaluate NaiveBayes

In [8]:
f1_score(clf.predict(valX), valy)

0.8148148148148148

In [9]:
accuracy_score(clf.predict(valX), valy)

0.8229342327150084

In [27]:
roc_auc_score(clf.predict(valX), valy)

0.819677850036608

## Features with highest and lowest coefs

It looks like most of the features with extreme coefficients are actually usernames. Let's try to filter those out.

In [10]:
np.array(vectorizer.get_feature_names())[np.argsort(clf.coef_)[0, -50:]]

array(['more', 'all', 'strong', 'but', 'color', 'new', 'one', 'our',
       'work', 'lit', 'add', 'you', 'points', 'skype', 'book', 'has',
       'update', 'rooms', 'wall', 'set', 'with', 'pillow', 'this',
       'books', 'lamp', 'nice', 'up', 'flowers', 'light', 'on', 'it',
       'lighting', 'are', 'good', 'depth', 'in', 'well', 'to', 'we',
       'rater', 'plant', 'for', 'of', 'is', 'great', 'and', 'room',
       'love', 'art', 'the'], dtype='<U28')

In [12]:
np.array(vectorizer.get_feature_names())[np.argsort(clf.coef_)[0, :50]]

array(['senator_sam', 'thepointsguy', 'oz', 'oystercovefarms', 'gotten',
       'gouging', 'oy', 'gospel', 'theothermandela', 'govbillweld',
       'govchristie', 'govcox', 'theorist', 'govedrendell', 'theories',
       'oxycontin', 'governorpataki', 'gorilla', 'gore', 'goon', 'gop',
       'therachlindsay', 'gopbetrayedamerica', 'gopchairwoman',
       'gopleader', 'ozark', 'gord', 'ozzie4pdxmayor', 'gordjohns',
       'ozymandias', 'gordongchang', 'gordonhumphrey', 'ozolslawfirm',
       'pa', 'googly', 'governors', 'govevers', 'gracecomchurch',
       'gracepotter', 'graciestyle', 'grad', 'owes', 'soniafurstenau',
       'grace_segers', 'grades', 'owe', 'owasow', 'graham', 'grahamdodds',
       'overwhelming'], dtype='<U28')

In [13]:
at_re = re.compile(r"@[\w_]+")
def strip_ats(tweet):
    return at_re.sub("", tweet)

hash_re = re.compile(r"#\w+")
def strip_hashtags(tweet):
    return hash_re.sub("", tweet)

In [14]:
X = vectorizer.fit_transform(data.cleaned.apply(strip_ats).apply(strip_hashtags))
y = data.rating > data.rating.median()

trainX, valX, trainy, valy = train_test_split(X, y, train_size=0.8, random_state=486)
valX, testX, valy, testy = train_test_split(valX, valy, train_size=0.5, random_state=486)

In [28]:
clf.fit(trainX, trainy)
f1_score(clf.predict(valX), valy), accuracy_score(clf.predict(valX), valy), roc_auc_score(clf.predict(valX), valy)

(0.8067675868210151, 0.8170320404721754, 0.819677850036608)

In [22]:
np.array(vectorizer.get_feature_names())[np.argsort(clf.coef_)[0, -50:]]

array(['video', 'work', 'depth', 'right', 'too', 'one', 'points',
       'reframe', 'left', 'that', 'book', 'hostage', 'back', 'needs',
       'like', 'great', 'wall', 'lighting', 'not', 'set', 'well', 'lamp',
       'this', 'books', 'you', 'raise', 'with', 'but', 'are', 'more',
       'in', 'it', 'up', 'light', 'nice', 'room', 'on', 'camera', 'for',
       'good', 'plant', 'we', 'of', 'is', 'add', 'love', 'and', 'to',
       'art', 'the'], dtype='<U19')

In [24]:
np.array(vectorizer.get_feature_names())[np.argsort(clf.coef_)[0, :50]]

array(['hapless', 'interest', 'intercoms', 'intentionally', 'intent',
       'intense', 'intended', 'intellectuals', 'intellectual',
       'throwback', 'insurgents', 'instument', 'throwing', 'instalment',
       'inspiration', 'thru', 'thst', 'insert', 'insensitive',
       'insufficient', 'insects', 'interplay', 'interpreting', 'involved',
       'invites', 'invisible', 'investor', 'investigating', 'invest',
       'invention', 'invent', 'interpreters', 'invade', 'thrillers',
       'intrepid', 'intimidated', 'interviewing', 'interviewed',
       'intervention', 'interstellar', 'interrogation', 'inukshuk', 'ip',
       'inscribed', 'inquiry', 'increase', 'incorrectly', 'incorporated',
       'incongruously', 'incomplete'], dtype='<U19')

In [38]:
np.round(np.argsort(clf.coef_)[0, :50]/ np.argsort(clf.coef_)[0, 49] * 100)

array([ 90., 102., 102., 102., 102., 102., 102., 102., 102., 198., 102.,
       102., 198., 101., 101., 198., 198., 101., 101., 102., 101., 102.,
       102., 103., 103., 103., 103., 103., 103., 103., 103., 102., 103.,
       198., 102., 102., 102., 102., 102., 102., 102., 103., 103., 101.,
       101., 100., 100., 100., 100., 100.])

In [40]:
df = pd.DataFrame({'value': np.round(np.argsort(clf.coef_)[0, :50]/ np.argsort(clf.coef_)[0, 49] * 100) ,
                  'feature': np.array(vectorizer.get_feature_names())[np.argsort(clf.coef_)[0, :50]]})
df.to_csv("lowwordcloud.csv", index=False)