# QUALITATIVE EVALUATION

In [2]:
import ast
import json
import os
import pickle
import re
import string
from glob import glob

import nltk
from nltk.corpus import stopwords

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

from stemmercleaner import StemmerCleaner

# algorithms
from sklearn.linear_model import LogisticRegression,Perceptron
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [3]:
# Tweets are indexed by tweet_id; only the `text` column is used in this cell.
tweets = pd.read_csv("data.csv", index_col="tweet_id")

# Fit the project's StemmerCleaner on the tweet text, apply it to that same
# text, and keep the result's underlying array via `.values`.
X_raw = StemmerCleaner().fit(tweets["text"]).transform(tweets["text"]).values

# y.csv is read without a header row, so its columns are labelled 0, 1, ...;
# the target vector is column 1 (presumably column 0 is the tweet id -- TODO confirm).
y = pd.read_csv("y.csv", header=None)[1].values

In [4]:
def show_results_current(models_dir='models'):
    """Collect every pickled result record in ``models_dir`` into one DataFrame.

    Parameters
    ----------
    models_dir : str, optional
        Directory searched (non-recursively) for ``*.model`` files.
        Defaults to ``'models'``, relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        One row per ``*.model`` file, indexed by ``name``, with the columns
        ``transformer_name``, ``model_name``, ``score``, ``f1``, ``recall``,
        ``precision``, ``model``, ``transformer`` and ``cleaner``. The frame
        is empty (but still has those columns) when no file matches.
    """
    # Build the pattern with os.path.join: a literal backslash separator is
    # only understood as a path separator on Windows.
    pattern = os.path.join(models_dir, '*.model')

    res = []
    # glob() returns matches in arbitrary order; sorting makes the row order
    # reproducible from run to run.
    for file in sorted(glob(pattern)):
        # NOTE: unpickling can execute arbitrary code -- only point this at
        # result files you produced yourself.
        with open(file, 'rb') as fh:  # close the handle even if load fails
            res.append(pickle.load(fh))

    df = pd.DataFrame(res, columns=['name', 'transformer_name', 'model_name', 'score', 'f1','recall','precision', 'model', 'transformer', 'cleaner'])
    return df.set_index('name')

In [5]:
# Load every stored result record and show the table with the highest `score` first.
# sort_values returns a new frame, so `df` itself keeps its original row order.
df = show_results_current()
df.sort_values('score', ascending=False)

Unnamed: 0_level_0,transformer_name,model_name,score,f1,recall,precision,model,transformer,cleaner
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
stemmer-countvectorizerdefault-logisticregression-l2,CountVectorizerDefault,LogisticRegression-l2,0.783607,,,,"{'C': 1, 'class_weight': None, 'dual': False, ...","{'analyzer': 'word', 'binary': False, 'decode_...",stemmer
stemmer-countvectorizerdefault-linearsvc,CountVectorizerDefault,LinearSVC,0.781694,,,,"{'C': 0.1, 'class_weight': None, 'dual': True,...","{'analyzer': 'word', 'binary': False, 'decode_...",stemmer
stemmer-tfidfvectorizerdefault-logisticregression-l2,TfidfVectorizerDefault,LogisticRegression-l2,0.771995,,,,"{'C': 10, 'class_weight': None, 'dual': False,...","{'analyzer': 'word', 'binary': False, 'decode_...",stemmer
stemmer-tfidfvectorizerdefault-linearsvc,TfidfVectorizerDefault,LinearSVC,0.769262,,,,"{'C': 1, 'class_weight': None, 'dual': True, '...","{'analyzer': 'word', 'binary': False, 'decode_...",stemmer
stemmer-tfidfvectorizerdefault-randomforestclassifier,TfidfVectorizerDefault,RandomForestClassifier,0.767623,,,,"{'bootstrap': True, 'class_weight': None, 'cri...","{'analyzer': 'word', 'binary': False, 'decode_...",stemmer
stemmer-countvectorizerdefault-randomforestclassifier,CountVectorizerDefault,RandomForestClassifier,0.767008,,,,"{'bootstrap': True, 'class_weight': None, 'cri...","{'analyzer': 'word', 'binary': False, 'decode_...",stemmer
stemmer-countvectorizerdefault-multinomialnb,CountVectorizerDefault,MultinomialNB,0.75321,,,,"{'alpha': 0.5, 'class_prior': None, 'fit_prior...","{'analyzer': 'word', 'binary': False, 'decode_...",stemmer
stemmer-countvectorizerdefault-perceptron,CountVectorizerDefault,Perceptron,0.745355,,,,"{'alpha': 0.0001, 'class_weight': None, 'eta0'...","{'analyzer': 'word', 'binary': False, 'decode_...",stemmer
stemmer-tfidfvectorizerdefault-perceptron,TfidfVectorizerDefault,Perceptron,0.726844,,,,"{'alpha': 0.0001, 'class_weight': None, 'eta0'...","{'analyzer': 'word', 'binary': False, 'decode_...",stemmer
stemmer-tfidfvectorizerdefault-multinomialnb,TfidfVectorizerDefault,MultinomialNB,0.726434,,,,"{'alpha': 0.1, 'class_prior': None, 'fit_prior...","{'analyzer': 'word', 'binary': False, 'decode_...",stemmer


We pick the top-performing combination (the one with the highest `score` above) for some qualitative exploration:

In [6]:
# Rebuild the selected estimator from its stored hyper-parameters.
# The `model` column holds the parameters as the text of a dict literal, so
# ast.literal_eval is enough to parse it and, unlike eval(), it cannot run
# arbitrary code that might be embedded in a results file.
best_name = 'stemmer-countvectorizerdefault-logisticregression-l2'
params = ast.literal_eval(df.loc[df.index == best_name, 'model'].iloc[0])
print(params)
model = LogisticRegression()
# Last expression of the cell: displays the configured (still unfitted) estimator.
model.set_params(**params)

{'C': 1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'max_iter': 2000, 'multi_class': 'ovr', 'n_jobs': 1, 'penalty': 'l2', 'random_state': None, 'solver': 'sag', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
# Bag-of-words vectoriser with library defaults.
# NOTE(review): presumably equivalent to the stored "CountVectorizerDefault"
# transformer of the selected run -- confirm against df's `transformer` column.
trans = CountVectorizer()

In [8]:
# Learn the vocabulary from the cleaned tweets and build the document-term
# matrix in a single call; `trans` is left fitted for the cells that follow.
X = trans.fit_transform(X_raw)

In [9]:
# Stratified 80/20 split with a fixed seed (random_state=42).
# NOTE(review): none of the visible cells use these splits -- the model is
# fit on the full X, y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [10]:
# Fit on the complete data set (X, y), not on X_train: predictions made with
# this model are a qualitative sanity check, not a held-out evaluation.
model.fit(X, y)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

## TEST NEW TWEETS

In [11]:
# Three hand-written probe sentences.
t = pd.Series(np.array([
    "I loved my flight",
    "I hated my flight",
    "I flew today"
]))
# Same two stages as the training data: StemmerCleaner first, then the
# already-fitted CountVectorizer (`trans.transform`, so the vocabulary is not refit).
# NOTE(review): a fresh StemmerCleaner is fit on the new text here -- fine if the
# cleaner learns nothing in fit(); confirm in stemmercleaner.
t_clean = StemmerCleaner().fit(t).transform(t)
t_trans = trans.transform(t_clean)
# Last expression of the cell: the predicted label for each probe sentence.
model.predict(t_trans)

array([ 1, -1,  0], dtype=int64)

Next, the same check on actual tweets:

In [12]:
# Real tweets mentioning airlines, used as an informal spot check of the model.
t = pd.Series(np.array([
    "When @VirginAmerica didn't have flights for Vegas. So you decide to give @united a try. #fail",
    "I didn't realize that if the smoke alarm goes off in an airplane bathroom,  @VirginAmerica plane staff protocol is to bang down the door mid pee-stream and berate you loudly so the entire rear cabin can hear, while you're still buttoning up your pants and asking what's going on.",
    "Please do not fly @AmericanAir. When their flights arrive over 1hr 40min late causing me to miss my connection they REFUSED to comp a hotel. Told me to pay myself or sleep in the airport.",
    "The @VirginAmerica lounge at London Heathrow beats all I’ve ever seen. Table service. Full spa. Amazed! ",
    "Thank you, @AlaskaAir, for adopting and integrating @VirginAmerica humour, referencing the “onboard spacious, luxurious lavatories.”"
]))

# Clean, vectorise with the already-fitted `trans`, then predict.
t_clean = StemmerCleaner().fit(t).transform(t)
t_trans = trans.transform(t_clean)
preds = model.predict(t_trans)

# One line per tweet: predicted label followed by the original, uncleaned text.
for pred, tweet in zip(preds, t):
    print(pred, tweet)

-1 When @VirginAmerica didn't have flights for Vegas. So you decide to give @united a try. #fail
-1 I didn't realize that if the smoke alarm goes off in an airplane bathroom,  @VirginAmerica plane staff protocol is to bang down the door mid pee-stream and berate you loudly so the entire rear cabin can hear, while you're still buttoning up your pants and asking what's going on.
-1 Please do not fly @AmericanAir. When their flights arrive over 1hr 40min late causing me to miss my connection they REFUSED to comp a hotel. Told me to pay myself or sleep in the airport.
1 The @VirginAmerica lounge at London Heathrow beats all I’ve ever seen. Table service. Full spa. Amazed! 
1 Thank you, @AlaskaAir, for adopting and integrating @VirginAmerica humour, referencing the “onboard spacious, luxurious lavatories.”
