In [75]:
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn import preprocessing

# Switch matplotlib to seaborn's default theme for any figures drawn later.
sns.set()

In [76]:
# Load the cleaned dataset; later cells read its `cleaned` (text) and `rating` columns.
data = pd.read_csv("./data/cleaned.csv")

In [77]:
# Discard every row that has a missing value in any column, so the text and
# rating columns used below contain no NaN.
data = data.dropna()

# Preprocess Data

In [78]:
# Bag-of-words featuriser with scikit-learn's default settings; it is (re)fitted
# on the text column in the cells below.
vectorizer = CountVectorizer()

In [106]:
# Token-count design matrix built from the `cleaned` text column.
X = vectorizer.fit_transform(data["cleaned"])
# Binary target: True where a row's rating is strictly above the median rating.
y = data["rating"].gt(data["rating"].median())

In [107]:
# First split: 80 % of the rows go to training, the remaining 20 % are held out.
trainX, valX, trainy, valy = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=486,
)
# Second split: halve the held-out rows into validation and test sets.
valX, testX, valy, testy = train_test_split(
    valX,
    valy,
    train_size=0.5,
    random_state=486,
)

In [108]:
# Variance-only scaling (no centring) learned from the training rows alone.
# with_mean=False is presumably required because trainX is a sparse count
# matrix, which cannot be centred without densifying it -- confirm.
scaler = preprocessing.StandardScaler(with_mean=False)
scaler.fit(trainX)
# NOTE(review): scaledTrainX is not referenced by any later cell visible here;
# the classifier below is fit on the unscaled trainX. Either use it (and scale
# valX/testX with the same scaler) or drop this cell.
scaledTrainX = scaler.transform(trainX)

# Train model

In [109]:
# Logistic-regression classifier with a fixed seed; fit() returns the estimator
# itself, so it can be chained onto the constructor.
clf = LogisticRegression(max_iter=1000, random_state=486).fit(trainX, trainy)
# Display the fitted estimator as the cell output.
clf

LogisticRegression(max_iter=1000, random_state=486)

## Evaluate model

In [110]:
# F1 of the hard class predictions on the validation split.
f1_score(y_true=valy, y_pred=clf.predict(valX))

0.7985611510791368

In [111]:
# Fraction of validation rows whose predicted class matches the label.
accuracy_score(y_true=valy, y_pred=clf.predict(valX))

0.810970464135021

In [112]:
# ROC AUC uses the continuous decision scores rather than hard predictions.
roc_auc_score(y_true=valy, y_score=clf.decision_function(valX))

0.9033992597600522

## Features with highest and lowest coefs

It looks like many of the features with extreme coefficients are actually usernames (e.g. `sahilkapur`, `justintrudeau`), as the two cells below show. Let's try to filter those out.

In [91]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back only on old installs.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# argsort is ascending, so the last 50 indices are the 50 largest coefficients.
np.asarray(feature_names)[np.argsort(clf.coef_)[0, -50:]]

array(['sahilkapur', 'congratulations', 'kitty', 'glorious', 'improved',
       'previously', 'nudge', 'great', 'jamescarville', 'never',
       'morningshowca', 'textures', 'david', 'coming', 'trustfauci',
       'philiprucker', 'composition', 'memoriam', 'amazing', 'stained',
       'aaronrodgers12', 'royalfamily', 'rbg', 'cake', 'timodc', 'hero',
       'beschlossdc', 'happy', 'done', 'pooch', 'nailed', 'excellent',
       'flawless', 'bobpisani', 'however', 'governor', 'perfection',
       'rater', 'gorgeous', 'ten', 'perfect', 'music', 'pineapple',
       'joebiden', 'aweissmann_', 'thank', 'historic', 'support', 'win',
       'justintrudeau'], dtype='<U28')

In [92]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back only on old installs.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# argsort is ascending, so the first 50 indices are the 50 smallest (most negative) coefficients.
np.asarray(feature_names)[np.argsort(clf.coef_)[0, :50]]

array(['hostage', 'needs', 'donnydeutsch', 'add', 'issue', 'reposition',
       'lot', 'deduction', 'stagey', 'potential', 'rotten', 'decent',
       'basic', 'staged', 'costareports', 'declutter', 'sorry', 'empty',
       'bigger', 'poor', 'narrow', 'bad', 'raise', 'blank', 'missing',
       'move', 'fail', 'trump', 'elements', 'interplay', 'interesting',
       'check', 'need', 'poorly', 'doesn', 'rooster', 'wants', 'deal',
       'ear', 'blinds', 'show', 'used', 'pics', 'progress', 'pull', 'too',
       'worse', 'ctvqp', 'microwave', 'consider'], dtype='<U28')

In [93]:
# Matches an "@" immediately followed by one or more word characters.
at_re = re.compile(r"@[\w_]+")
def strip_ats(tweet):
    """Return ``tweet`` with every ``@mention`` token deleted.

    Only the matched token is removed; surrounding whitespace is left as is.
    """
    return re.sub(at_re, "", tweet)

# Matches a "#" immediately followed by one or more word characters.
hash_re = re.compile(r"#\w+")
def strip_hashtags(tweet):
    """Return ``tweet`` with every ``#hashtag`` token deleted.

    Only the matched token is removed; surrounding whitespace is left as is.
    """
    return re.sub(hash_re, "", tweet)

In [94]:
# Refit the vectoriser on text with @mentions and #hashtags removed, so that
# handles no longer become features.
X = vectorizer.fit_transform(
    data["cleaned"].apply(strip_ats).apply(strip_hashtags)
)
# Same binary target as before: rating strictly above the median.
y = data["rating"].gt(data["rating"].median())

# Same 80 / 10 / 10 split and seed as the first experiment.
trainX, valX, trainy, valy = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=486,
)
valX, testX, valy, testy = train_test_split(
    valX,
    valy,
    train_size=0.5,
    random_state=486,
)

In [96]:
# Refit the existing classifier on the current trainX/trainy, then report
# (F1, accuracy, ROC AUC) on the validation split as a single tuple.
clf.fit(trainX, trainy)
# Predict once and share the result between the two label-based metrics
# instead of running clf.predict(valX) twice.
val_pred = clf.predict(valX)
f1_score(valy, val_pred), accuracy_score(valy, val_pred), roc_auc_score(valy, clf.decision_function(valX))

(0.7888589398023361, 0.8016877637130801, 0.89466190743563)

In [97]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back only on old installs.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# argsort is ascending, so the last 50 indices are the 50 largest coefficients.
np.asarray(feature_names)[np.argsort(clf.coef_)[0, -50:]]

array(['sweater', 'nicest', 'revisiting', 'brilliant', 'closer', 'nudge',
       'texas', 'never', 'kitty', 'fluffy', 'composition', 'wow',
       'memoriam', 'legend', 'glorious', 'david', 'born', 'message',
       'pooch', 'nailed', 'grace', 'congratulations', 'stained', 'done',
       'rater', 'hero', 'coming', 'previously', 'perfect', 'textures',
       'amazing', 'happy', 'rbg', 'ten', 'excellent', 'governor', 'fresh',
       'pineapple', 'remains', 'innis', 'flawless', 'perfection',
       'gorgeous', 'however', 'music', 'cake', 'thank', 'historic',
       'support', 'win'], dtype='<U19')

In [98]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back only on old installs.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# argsort is ascending, so the first 50 indices are the 50 smallest (most negative) coefficients.
np.asarray(feature_names)[np.argsort(clf.coef_)[0, :50]]

array(['hostage', 'needs', 'rotten', 'add', 'issue', 'lot', 'reposition',
       'stagey', 'deduction', 'staged', 'sorry', 'potential', 'decent',
       'interplay', 'basic', 'narrow', 'poor', 'declutter', 'fail',
       'empty', 'blank', 'sock', 'rooster', 'bigger', 'wants', 'used',
       'trump', 'deal', 'missing', 'worse', 'need', 'poorly', 'fight',
       'doesn', 'blinds', 'raise', 'laundry', 'interesting', 'microwave',
       'recreation', 'move', 'bad', 'pics', 'dumb', 'consider',
       'repositon', 'barely', 'counter', 'dark', 'ear'], dtype='<U19')