In [92]:
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

# Switch matplotlib over to seaborn's default theme for every figure drawn later on.
sns.set()

In [4]:
# Load the dataset from a path relative to the notebook's working directory.
# Later cells read its `cleaned` (text) and `rating` columns.
data = pd.read_csv("./data/cleaned.csv")

In [7]:
# Discard every row that has a missing value in *any* column (pandas' defaults,
# spelled out). NOTE(review): this also drops rows whose only gap is in a column
# the model never uses -- confirm that is intended.
data = data.dropna(axis=0, how="any")

# Preprocess Data

In [42]:
# Bag-of-words featurizer, constructed with scikit-learn's default settings.
vectorizer = CountVectorizer()

In [43]:
# Token-count matrix with one row per entry of the `cleaned` column.
# NOTE(review): the vocabulary is learned from all rows, i.e. before the
# train/validation/test split below -- confirm this is acceptable.
X = vectorizer.fit_transform(data["cleaned"])

# Boolean target: True where a row's rating is strictly above the median rating.
y = data["rating"].gt(data["rating"].median())

In [44]:
# Step 1: keep 80% of the rows for training. The remaining 20% is parked in
# valX / valy only until the next call splits it again.
trainX, valX, trainy, valy = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=486,
)

# Step 2: cut that hold-out in half, giving validation and test sets of roughly
# 10% of the original rows each.
valX, testX, valy, testy = train_test_split(
    valX,
    valy,
    train_size=0.5,
    random_state=486,
)

# Train SVC

In [45]:
# Linear support-vector classifier with scikit-learn's default hyper-parameters.
# NOTE(review): no random_state is passed, so repeated fits may not be
# bit-for-bit reproducible -- confirm whether that matters here.
svc = LinearSVC()

# Fit on the training split only; the fitted estimator is the cell's output.
svc.fit(X=trainX, y=trainy)

LinearSVC()

## Evaluate SVC

In [57]:
# Validation F1. scikit-learn metrics expect (y_true, y_pred) in that order;
# binary F1 happens to be symmetric, so the value is the same either way, but the
# conventional order keeps this safe if the metric is ever swapped for an
# asymmetric one such as precision or recall.
f1_score(valy, svc.predict(valX))

0.7948497854077253

In [58]:
# Validation accuracy, passed in scikit-learn's documented (y_true, y_pred) order.
# Accuracy is symmetric in its arguments, so the reported value does not change.
accuracy_score(valy, svc.predict(valX))

0.7984822934232715

## Features with highest and lowest coefs

It looks like most of the features with the most extreme coefficients (listed in the two cells below) are actually usernames rather than ordinary words. Let's try to filter those out.

In [82]:
# The 50 vocabulary terms with the largest coefficients, i.e. the ones that push
# the classifier hardest toward the positive class (rating above the median).
# np.argsort sorts ascending, so the strongest term comes last.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` replaces it and already returns an ndarray, so it can
# be fancy-indexed directly.
vectorizer.get_feature_names_out()[np.argsort(svc.coef_)[0, -50:]]

array(['hunterhayes', 'hamillhimself', 'jmpalmieri', 'schittscreek',
       'alxthomp', 'maragay', 'doriskgoodwin', 'agbecerra', 'juliahamm',
       'carole_king', 'lenses', 'marktakesphoto', 'advocate', 'telescope',
       'keishabottoms', 'lawyerbobbauer', 'bfinamore', 'jacqehoward',
       'remaining', 'adamwren', 'staircases', 'derricknaacp',
       'lynnoberlander', 'beautifully', 'merrillbro', 'rahmemanuel',
       'previously', 'ron_christie', 'lets', 'radhikajones', 'astro_cady',
       'objets', 'gdebenedetti', 'sweater', 'erickmsanchez', 'texas',
       'johndonvan', 'glorious', 'celinedion', 'bbheathertom',
       'katedicamillo', 'ericswalwell', 'stained', 'identified',
       'torontostar', 'superior', 'mrhollywoodmd', 'governor',
       'ricktelesz', 'meganpormer'], dtype='<U28')

In [83]:
# The 50 vocabulary terms with the smallest (most negative) coefficients, i.e.
# the ones that push the classifier hardest toward the negative class.
# np.argsort sorts ascending, so the most negative term comes first.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` replaces it and already returns an ndarray, so it can
# be fancy-indexed directly.
vectorizer.get_feature_names_out()[np.argsort(svc.coef_)[0, :50]]

array(['christrapper', 'loganplaster', 'sarahnferris', 'gregstohr',
       'chrishell7', 'andreayoungatl', 'nkechi_taifa', 'costareports',
       'markherringva', 'ctvqp', 'blacksnob', 'kate_manne', 'senatorgill',
       'crying', 'drmcclellan', 'henadoba', 'laurenzelt', 'repjoshg',
       'mpinoe', 'rinsana', 'jamesfortexas', 'niknanos', 'lot',
       'vladduthierscbs', 'mooch', 'rehang', 'docking', 'interplay',
       'rozweston', 'susanlejeuneuk', 'mryangorman', 'ambermcreynolds',
       'domingomorel', 'jordanwitzel', 'deanobeidallah', 'potential',
       'hankazaria', 'declutter', 'juliaioffe', 'chrisdaleoxford',
       'edyong209', 'secret', 'zekejmiller', 'bernadeansteptoe',
       'haleyjoelleott', 'milnerhrich', 'rachaelcobb', 'blairunderwood',
       'brought', 'spread'], dtype='<U28')

In [121]:
# An "@" followed by one or more word characters (`\w` already covers "_").
at_re = re.compile(r"@[\w_]+")


def strip_ats(tweet):
    """Return ``tweet`` with every @-mention deleted.

    Each match of ``at_re`` is replaced by the empty string. Surrounding
    whitespace is left untouched, so removing a mention from the middle of a
    sentence leaves two adjacent spaces behind.

    Args:
        tweet: The text to clean.

    Returns:
        The text with all ``@name`` tokens removed.
    """
    without_mentions = at_re.sub(repl="", string=tweet)
    return without_mentions

# A "#" followed by one or more word characters.
hash_re = re.compile(r"#\w+")


def strip_hashtags(tweet):
    """Return ``tweet`` with every #hashtag deleted.

    Each match of ``hash_re`` is replaced by the empty string. Surrounding
    whitespace is left untouched, and a bare "#" with no word character after
    it is kept.

    Args:
        tweet: The text to clean.

    Returns:
        The text with all ``#tag`` tokens removed.
    """
    without_hashtags = hash_re.sub(repl="", string=tweet)
    return without_hashtags

In [123]:
# Rebuild the count matrix from text with @-mentions removed first and hashtags
# second. This refits `vectorizer`, replacing the vocabulary learned earlier.
X = vectorizer.fit_transform(
    data["cleaned"].apply(lambda tweet: strip_hashtags(strip_ats(tweet)))
)

# Same boolean target as before: rating strictly above the median rating.
y = data["rating"].gt(data["rating"].median())

# Same 80 / 10 / 10 split and seed as the first experiment. The first call parks
# the 20% hold-out in valX / valy, and the second call halves it.
trainX, valX, trainy, valy = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=486,
)
valX, testX, valy, testy = train_test_split(
    valX,
    valy,
    train_size=0.5,
    random_state=486,
)

SyntaxError: invalid syntax (<ipython-input-123-773f2e913f9b>, line 2)

In [115]:
# Refit the existing classifier on the current training split; this discards the
# coefficients from its previous fit.
svc.fit(trainX, trainy)

# Predict once and reuse the result for both metrics instead of calling
# predict() twice. Metrics are passed in scikit-learn's documented
# (y_true, y_pred) order; both are symmetric for this binary task, so the
# reported values are unchanged.
val_pred = svc.predict(valX)
f1_score(valy, val_pred), accuracy_score(valy, val_pred)

(0.7846153846153847, 0.7875210792580101)

In [116]:
# The 50 vocabulary terms with the largest coefficients, i.e. the ones that push
# the classifier hardest toward the positive class (rating above the median).
# np.argsort sorts ascending, so the strongest term comes last.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` replaces it and already returns an ndarray, so it can
# be fancy-indexed directly.
vectorizer.get_feature_names_out()[np.argsort(svc.coef_)[0, -50:]]

array(['bacon', 'models', 'livable', 'brava', 'acceptible',
       'bipartisanship', 'telescope', 'murals', 'fiction', 'wishes',
       'scented', 'sailing', 'lover', 'filled', 'schmancy', 'ringer',
       'stained', 'chia', 'wow', 'sweater', 'advocate', 'clients',
       'forecast', 'china', 'labor', 'inappropriate', 'texas', 'bamboo',
       'daggers', 'innis', 'brooklyn', 'tapestry', 'staircases', 'felled',
       'ricktelesz', 'nicest', 'governor', 'beautifully', 'glorious',
       'flawless', 'previously', 'remaining', 'arched', 'chriswallace',
       'hitting', 'lenses', 'lets', 'superior', 'identified', 'objets'],
      dtype='<U28')

In [117]:
# The 50 vocabulary terms with the smallest (most negative) coefficients, i.e.
# the ones that push the classifier hardest toward the negative class.
# np.argsort sorts ascending, so the most negative term comes first.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` replaces it and already returns an ndarray, so it can
# be fancy-indexed directly.
vectorizer.get_feature_names_out()[np.argsort(svc.coef_)[0, :50]]

array(['sock', 'sort', 'interplay', 'crying', 'repositon', 'shame',
       'clothes', 'wants', 'paneled', 'lot', 'readjust', 'drmcclellan',
       'pussywillows', 'mooch', 'alcohol', 'kerry', 'vaccinated',
       'styling', 'stage', 'nora', 'ching', 'invisible', 'lowet', 'lose',
       'folliage', 'aces', '19th', 'aspen', 'docking', 'scientific',
       'reduce', 'fail', 'recompose', 'source', 'mybe', 'scoreboards',
       'proliferating', 'dome', 'closed', 'potential', 'tissues', 'thru',
       'candlesticks', 'narrow', 'indirect', 'format', 'sparse', 'flows',
       'uninspired', 'accompaniment'], dtype='<U28')