In [32]:
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn import preprocessing

# Apply seaborn's default theme to any matplotlib figures drawn later.
sns.set()

In [33]:
# Load the pre-cleaned dataset; later cells read its `cleaned` and `rating` columns.
data = pd.read_csv(filepath_or_buffer="./data/cleaned.csv")

In [34]:
# Discard every row that has a missing value in any column, so the
# vectorizer and the label computation below only see complete records.
data = data.dropna(axis=0, how="any")

# Preprocess Data

In [35]:
# Bag-of-words vectorizer using scikit-learn's default settings.
vectorizer = CountVectorizer()

In [36]:
# Learn the vocabulary from the cleaned text and build the sparse
# document-term count matrix in one step.
X = vectorizer.fit_transform(data["cleaned"])
# Binary target: True when a rating is strictly above the median rating.
y = data["rating"].gt(data["rating"].median())

In [37]:
# First split: 80% of the rows for training, the remaining 20% held out.
trainX, valX, trainy, valy = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=486,
)
# Second split: divide the held-out 20% evenly into validation and test
# sets (roughly 10% of the data each). `valX`/`valy` are rebound here.
valX, testX, valy, testy = train_test_split(
    valX,
    valy,
    train_size=0.5,
    random_state=486,
)

In [38]:
# Scale each feature to unit variance using statistics from the training
# split only. `with_mean=False` skips centering, which keeps the sparse
# count matrix sparse.
scaler = preprocessing.StandardScaler(with_mean=False)
scaler.fit(trainX)
# NOTE(review): `scaledTrainX` is not referenced by any later cell visible
# here -- the classifier below is fitted on the unscaled `trainX`. Confirm
# whether scaling was meant to be used or this cell can be dropped.
scaledTrainX = scaler.transform(trainX)

# Train logistic regression classifier

In [39]:
# Logistic regression on the raw (unscaled) token counts. The iteration cap
# is raised to 1000 and the seed is fixed for reproducibility.
clf = LogisticRegression(max_iter=1000, random_state=486)
clf.fit(X=trainX, y=trainy)

LogisticRegression(max_iter=1000, random_state=486)

## Evaluate model

In [40]:
# Display the validation labels (a boolean Series) as a quick sanity check.
valy

12228     True
5566     False
4964     False
7351      True
3191      True
         ...  
922      False
5307     False
6399     False
4297     False
9488     False
Name: rating, Length: 1186, dtype: bool

In [41]:
# F1 on the validation split; a positive decision score means the model
# predicts the True class (rating above the median).
f1_score(y_true=valy, y_pred=clf.decision_function(valX) > 0)

0.814495254529767

In [42]:
# Validation accuracy, thresholding the decision score at zero.
accuracy_score(y_true=valy, y_pred=clf.decision_function(valX) > 0)

0.8187183811129848

In [43]:
# ROC AUC uses the continuous decision scores rather than hard predictions.
roc_auc_score(y_true=valy, y_score=clf.decision_function(valX))

0.9043381566673492

## Features with highest and lowest coefs

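It looks like many of the features with extreme coefficients are actually usernames (e.g. `joebiden`, `justintrudeau`, `donnydeutsch`). Let's try to filter those out by stripping @-mentions and hashtags before vectorizing.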

In [44]:
# The 50 vocabulary terms with the largest coefficients (pushing hardest
# toward the True class), listed in ascending order of coefficient.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on older releases.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else np.array(vectorizer.get_feature_names())
)
feature_names[np.argsort(clf.coef_)[0, -50:]]

array(['power', 'born', 'composition', 'wait', 'beautiful', 'hero',
       'clairecmc', 'perfection', 'sharp', 'philiprucker', 'statue',
       'curated', 'rbg', 'texas', 'previously', 'schittscreek',
       'aaronrodgers12', 'nailed', 'however', 'fresh', 'pooch',
       'royalfamily', 'excellent', 'boy', 'beautifully', 'music',
       'glorious', 'bobpisani', 'rater', 'perfectly', 'superior',
       'pineapple', 'wow', 'closer', 'flawless', 'support', 'timodc',
       'happy', 'amazing', 'done', 'nudge', 'ten', 'gorgeous', 'governor',
       'joebiden', 'aweissmann_', 'thank', 'historic', 'justintrudeau',
       'win'], dtype='<U28')

In [45]:
# The 50 vocabulary terms with the smallest (most negative) coefficients,
# i.e. those pushing hardest toward the False class, most negative first.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on older releases.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else np.array(vectorizer.get_feature_names())
)
feature_names[np.argsort(clf.coef_)[0, :50]]

array(['needs', 'hostage', 'add', 'issue', 'decent', 'potential', 'basic',
       'lot', 'reposition', 'costareports', 'bigger', 'change',
       'deduction', 'donnydeutsch', 'rotten', 'copier', 'need', 'poor',
       'crying', 'declutter', 'fail', 'side', 'wants', 'trump', 'put',
       'narrow', 'christrapper', 'elements', 'interesting', 'staged',
       'blank', 'stagey', 'acceptable', 'must', 'next', 'move', 'copies',
       'interplay', 'solid', 'pull', 'shame', 'pics', 'missing', 'bad',
       'trying', 'vanity', 'raise', 'lost', 'worse', 'evangelical'],
      dtype='<U28')

In [46]:
# Matches an "@" followed by one or more word characters (an @-mention).
at_re = re.compile(r"@[\w_]+")


def strip_ats(tweet: str) -> str:
    """Return ``tweet`` with every @-mention removed.

    Only the mention itself is deleted; surrounding whitespace is left
    untouched, so removing a mention mid-sentence leaves a double space.
    """
    without_mentions = re.sub(at_re, "", tweet)
    return without_mentions

# Matches a "#" followed by one or more word characters (a hashtag).
hash_re = re.compile(r"#\w+")


def strip_hashtags(tweet: str) -> str:
    """Return ``tweet`` with every hashtag removed.

    A bare "#" that is not followed by a word character is kept, and
    whitespace around a removed hashtag is left as-is.
    """
    without_hashtags = re.sub(hash_re, "", tweet)
    return without_hashtags

In [47]:
# Rebuild the count matrix from text with @-mentions and hashtags removed.
# This refits the same `vectorizer`, so its vocabulary is replaced.
X = vectorizer.fit_transform(
    data["cleaned"].apply(strip_ats).apply(strip_hashtags)
)
# Same binary target as before: rating strictly above the median.
y = data["rating"].gt(data["rating"].median())

# Repeat the 80/10/10 split with the same seed so that rows land in the
# same partitions as in the first experiment.
trainX, valX, trainy, valy = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=486,
)
valX, testX, valy, testy = train_test_split(
    valX,
    valy,
    train_size=0.5,
    random_state=486,
)

In [48]:
# Refit the same classifier on the filtered features, then report
# (F1, accuracy, ROC AUC) on the validation split.
clf.fit(trainX, trainy)
# Score the validation set once and reuse the result for all three metrics.
val_scores = clf.decision_function(valX)
(
    f1_score(valy, val_scores > 0),
    accuracy_score(valy, val_scores > 0),
    roc_auc_score(valy, val_scores),
)

(0.8141135972461273, 0.8178752107925801, 0.900002559552249)

In [49]:
# The 50 terms with the largest coefficients after removing @-mentions and
# hashtags, listed in ascending order of coefficient.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on older releases.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else np.array(vectorizer.get_feature_names())
)
feature_names[np.argsort(clf.coef_)[0, -50:]]

array(['congratulations', 'hydrangeas', 'favorite', 'coming', 'sweater',
       'hero', 'curated', 'grace', 'slight', 'elegance', 'texas',
       'justice', 'textures', 'pooch', 'pineapples', 'sharp', 'rater',
       'fauci', 'lighted', 'perfection', 'statue', 'staircases', 'music',
       'pineapple', 'boy', 'excellent', 'rbg', 'previously', 'wait',
       'closer', 'perfectly', 'however', 'glorious', 'beautifully',
       'support', 'nudge', 'wow', 'done', 'innis', 'superior', 'ten',
       'happy', 'flawless', 'amazing', 'fresh', 'governor', 'gorgeous',
       'historic', 'thank', 'win'], dtype='<U19')

In [50]:
# The 50 terms with the smallest (most negative) coefficients after
# removing @-mentions and hashtags, most negative first.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on older releases.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else np.array(vectorizer.get_feature_names())
)
feature_names[np.argsort(clf.coef_)[0, :50]]

array(['needs', 'hostage', 'rotten', 'add', 'decent', 'potential',
       'issue', 'lot', 'basic', 'reposition', 'wants', 'fail', 'poor',
       'change', 'crying', 'interplay', 'copier', 'bigger', 'declutter',
       'need', 'narrow', 'deduction', 'trump', 'blank', 'staged',
       'evangelical', 'sock', 'worse', 'side', 'put', 'acceptable',
       'stagey', 'next', 'must', 'vanity', 'interesting', 'copies',
       'elements', 'vaccinated', 'missing', 'sorry', 'trying', 'shame',
       'revisit', 'pics', 'repositon', 'clearly', 'lose', 'move',
       'consider'], dtype='<U19')