In [1]:
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

# Apply seaborn's default theme to all matplotlib figures drawn after this point.
sns.set()

In [2]:
# Load the dataset; later cells read its `cleaned` (text) and `rating` columns.
# NOTE(review): relative path — assumes the notebook is run from the directory that contains `data/`.
data = pd.read_csv("./data/cleaned.csv")

In [3]:
# Drop every row that has a missing value in any column, rebinding `data` to the filtered frame.
# NOTE(review): only `cleaned` and `rating` are used in the visible cells; if other columns can be
# null, `dropna(subset=[...])` would keep more rows — confirm intent.
data = data.dropna()

# Preprocess Data

In [214]:
# Token-count featurizer with scikit-learn's default settings.
vectorizer = CountVectorizer()

In [215]:
# Learn the vocabulary from the cleaned text and build the document-term matrix.
# NOTE(review): the vocabulary is fitted on all rows, i.e. before the train/val/test split.
X = vectorizer.fit_transform(data["cleaned"])

# Binary target: True when a row's rating is strictly above the median rating.
median_rating = data["rating"].median()
y = data["rating"] > median_rating

In [216]:
# 80/10/10 train/validation/test split: carve off 20% as a hold-out pool,
# then halve that pool into the validation and test sets (same seed for both splits).
trainX, holdX, trainy, holdy = train_test_split(X, y, train_size=0.8, random_state=486)
valX, testX, valy, testy = train_test_split(holdX, holdy, train_size=0.5, random_state=486)

# Train SVC

In [217]:
# Linear support-vector classifier with default hyperparameters, fitted on the training split.
# `random_state` pins the solver's internal shuffling so re-running the notebook reproduces
# the same coefficients and scores; 486 matches the seed used for the data splits.
svc = LinearSVC(random_state=486)
svc.fit(trainX, trainy)

LinearSVC()

## Evaluate SVC

In [218]:
# Validation metrics for the SVC as (F1, accuracy, ROC AUC).
# Hard labels feed F1/accuracy; the signed margin from decision_function feeds ROC AUC.
val_pred = svc.predict(valX)
val_margin = svc.decision_function(valX)
f1_score(valy, val_pred), accuracy_score(valy, val_pred), roc_auc_score(valy, val_margin)

(0.7948497854077253, 0.7984822934232715, 0.8789218028348464)

In [219]:
# scikit-learn metrics take (y_true, y_pred): ground truth first, predictions second.
# Binary F1 happens to give the same number with the arguments swapped, but keeping the
# documented order avoids silently wrong results if the metric is changed (e.g. precision/recall).
f1_score(valy, svc.predict(valX))

0.7948497854077253

In [220]:
# scikit-learn metrics take (y_true, y_pred): ground truth first, predictions second.
# Accuracy is symmetric in its arguments, so the value is unchanged; the order is fixed
# for consistency with the documented signature.
accuracy_score(valy, svc.predict(valX))

0.7984822934232715

## Features with highest and lowest coefs

It looks like most of the features with extreme coefficients are actually usernames. Let's try to filter those out.

In [225]:
# The 50 tokens with the largest SVC coefficients (pushing toward the "above median" class),
# listed in ascending coefficient order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is the replacement and already returns an ndarray.
np.asarray(vectorizer.get_feature_names_out())[np.argsort(svc.coef_)[0, -50:]]

array(['hunterhayes', 'hamillhimself', 'jmpalmieri', 'schittscreek',
       'alxthomp', 'maragay', 'doriskgoodwin', 'agbecerra', 'juliahamm',
       'carole_king', 'lenses', 'marktakesphoto', 'advocate', 'telescope',
       'keishabottoms', 'lawyerbobbauer', 'bfinamore', 'jacqehoward',
       'remaining', 'adamwren', 'staircases', 'derricknaacp',
       'lynnoberlander', 'beautifully', 'merrillbro', 'rahmemanuel',
       'previously', 'ron_christie', 'lets', 'radhikajones', 'astro_cady',
       'objets', 'gdebenedetti', 'sweater', 'erickmsanchez', 'texas',
       'johndonvan', 'glorious', 'celinedion', 'bbheathertom',
       'katedicamillo', 'ericswalwell', 'stained', 'identified',
       'torontostar', 'superior', 'mrhollywoodmd', 'governor',
       'ricktelesz', 'meganpormer'], dtype='<U28')

In [226]:
# The 50 tokens with the smallest (most negative) SVC coefficients, most negative first.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is the replacement and already returns an ndarray.
np.asarray(vectorizer.get_feature_names_out())[np.argsort(svc.coef_)[0, :50]]

array(['christrapper', 'loganplaster', 'sarahnferris', 'gregstohr',
       'chrishell7', 'andreayoungatl', 'nkechi_taifa', 'costareports',
       'markherringva', 'ctvqp', 'blacksnob', 'kate_manne', 'senatorgill',
       'crying', 'drmcclellan', 'henadoba', 'laurenzelt', 'repjoshg',
       'mpinoe', 'rinsana', 'jamesfortexas', 'niknanos', 'lot',
       'vladduthierscbs', 'mooch', 'rehang', 'docking', 'interplay',
       'rozweston', 'susanlejeuneuk', 'mryangorman', 'ambermcreynolds',
       'domingomorel', 'jordanwitzel', 'deanobeidallah', 'potential',
       'hankazaria', 'declutter', 'juliaioffe', 'chrisdaleoxford',
       'edyong209', 'secret', 'zekejmiller', 'bernadeansteptoe',
       'haleyjoelleott', 'milnerhrich', 'rachaelcobb', 'blairunderwood',
       'brought', 'spread'], dtype='<U28')

In [227]:
# Matches an @mention: "@" followed by one or more word characters
# (`\w` already covers "_", so the explicit underscore is redundant but harmless).
at_re = re.compile(r"@[\w_]+")


def strip_ats(tweet: str) -> str:
    """Return ``tweet`` with every ``@mention`` removed.

    Only the matched text is deleted; surrounding whitespace and punctuation
    are left exactly as they were. A bare ``@`` with no word character after
    it is not a mention and is kept.
    """
    without_mentions = at_re.sub("", tweet)
    return without_mentions

# Matches a hashtag: "#" followed by one or more word characters.
hash_re = re.compile(r"#\w+")


def strip_hashtags(tweet: str) -> str:
    """Return ``tweet`` with every ``#hashtag`` removed.

    Only the matched text is deleted; surrounding whitespace and punctuation
    are left exactly as they were. A bare ``#`` with no word character after
    it is kept.
    """
    without_hashtags = hash_re.sub("", tweet)
    return without_hashtags

In [228]:
# Rebuild the features from text with @mentions removed first, then hashtags.
# This refits the vocabulary and rebinds X / y and every split variable.
X = vectorizer.fit_transform(
    data.cleaned.apply(lambda tweet: strip_hashtags(strip_ats(tweet)))
)
y = data.rating > data.rating.median()

# Same 80/10/10 split and seed as before, so rows land in the same partitions.
trainX, holdX, trainy, holdy = train_test_split(X, y, train_size=0.8, random_state=486)
valX, testX, valy, testy = train_test_split(holdX, holdy, train_size=0.5, random_state=486)

In [230]:
# Refit the same classifier on the mention/hashtag-free features.
svc.fit(trainX, trainy)

# Validation metrics as (F1, accuracy, ROC AUC).
# scikit-learn metrics take (y_true, y_pred), so ground truth goes first; the labels are
# predicted once and reused rather than calling predict() per metric.
val_pred = svc.predict(valX)
f1_score(valy, val_pred), accuracy_score(valy, val_pred), roc_auc_score(valy, svc.decision_function(valX))

(0.7857142857142857, 0.7875210792580101, 0.8712616317429982)

In [231]:
# The 50 tokens with the largest SVC coefficients after the refit, in ascending coefficient order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is the replacement and already returns an ndarray.
np.asarray(vectorizer.get_feature_names_out())[np.argsort(svc.coef_)[0, -50:]]

array(['fiction', 'scented', 'congratulations', 'bipartisanship',
       'celebrating', 'livable', 'fauci', 'telescope', 'sailing',
       'filled', 'lover', 'bacon', 'schmancy', 'murals', 'chia', 'wutang',
       'chord', 'sweater', 'advocate', 'wow', 'hunger', 'stained',
       'forecast', 'inappropriate', 'china', 'clients', 'daggers',
       'bamboo', 'brooklyn', 'staircases', 'nicest', 'innis', 'texas',
       'labor', 'bartlett', 'felled', 'extremely', 'flawless',
       'previously', 'beautifully', 'remaining', 'governor', 'hitting',
       'glorious', 'arched', 'lets', 'lenses', 'superior', 'identified',
       'objets'], dtype='<U19')

In [232]:
# The 50 tokens with the smallest (most negative) SVC coefficients after the refit, most negative first.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is the replacement and already returns an ndarray.
np.asarray(vectorizer.get_feature_names_out())[np.argsort(svc.coef_)[0, :50]]

array(['sock', 'sort', 'scandal', 'interplay', 'crying', 'repositon',
       'wants', 'clothes', 'shame', 'paneled', 'stage', 'readjust',
       'alcohol', 'pussywillows', 'kerry', 'vaccinated', 'mooch',
       'styling', 'lot', '19th', 'ching', 'folliage', 'aces', 'austere',
       'invisible', 'nora', 'lose', 'lowet', 'docking', 'aspen',
       'scoreboards', 'reduce', 'recompose', 'fail', 'scientific',
       'narrow', 'dome', 'clearly', 'source', 'indirect', 'tissues',
       'proliferating', 'mybe', 'tan', 'potential', 'spread', 'barely',
       'candlesticks', 'recreation', 'happening'], dtype='<U19')

# Train Random Forest

In [222]:
# Random forest with default hyperparameters, fitted on the current training split.
# `random_state` makes the bootstrap sampling and feature selection reproducible across
# runs; 486 matches the seed used for the data splits.
# NOTE(review): the saved execution count of this cell (222) precedes the re-vectorization
# cell (228), so the recorded scores were likely computed on features that still contained
# @mentions and hashtags. A top-to-bottom run trains on the stripped features instead.
rfc = RandomForestClassifier(random_state=486)
rfc.fit(trainX, trainy)

RandomForestClassifier()

In [224]:
# Validation metrics for the random forest as (F1, accuracy, ROC AUC).
# Hard labels feed F1/accuracy; the probability of the second class (column 1) feeds ROC AUC.
rfc_val_pred = rfc.predict(valX)
rfc_val_proba = rfc.predict_proba(valX)[:, 1]
(
    f1_score(valy, rfc_val_pred),
    accuracy_score(valy, rfc_val_pred),
    roc_auc_score(valy, rfc_val_proba),
)

(0.7877145438121048, 0.8018549747048904, 0.887216174095056)