# QUANTITATIVE RESULTS

In [1]:
from glob import glob
import re
import pickle
import os
import string
import json

import nltk
from nltk.corpus import stopwords

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

from stemmercleaner import StemmerCleaner

# algorithms
from sklearn.linear_model import LogisticRegression,Perceptron
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
import numpy as np

In [2]:
# Load the tweets, using the `tweet_id` column as the index.
tweets = pd.read_csv("data.csv", index_col="tweet_id")
# Fit the project-local StemmerCleaner on the tweet text and transform that
# same text; the vectorizers in the evaluation loop are fitted on `X_raw`.
X_raw = StemmerCleaner().fit(tweets.text).transform(tweets.text)

In [3]:
def show_results_current():
    """Collect every pickled model result under ``models/`` into one DataFrame.

    Each ``*.model`` file is unpickled and contributes one row. Files are
    visited in sorted order so the row order is deterministic.

    Returns:
        pandas.DataFrame: indexed by ``name``, with the columns
        ``transformer_name``, ``model_name``, ``score``, ``f1``, ``recall``,
        ``precision``, ``model``, ``transformer`` and ``cleaner``. Empty if no
        result files are found.
    """
    columns = ['name', 'transformer_name', 'model_name', 'score', 'f1', 'recall',
               'precision', 'model', 'transformer', 'cleaner']
    res = []
    # Build the pattern with os.path.join: a hard-coded backslash only matches
    # on Windows and silently finds nothing elsewhere.
    for file in sorted(glob(os.path.join('models', '*.model'))):
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(file, 'rb') as fh:
            res.append(pickle.load(fh))
    return pd.DataFrame(res, columns=columns).set_index('name')

In [4]:
# Load all pickled model results; later cells modify and rebind `df`.
df = show_results_current()
# Preview the first rows.
df.head()

Unnamed: 0_level_0,transformer_name,model_name,score,f1,recall,precision,model,transformer,cleaner
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
stemmer-countvectorizerdefault-decisiontreeclassifier,CountVectorizerDefault,DecisionTreeClassifier,0.692828,,,,"{'class_weight': None, 'criterion': 'gini', 'm...","{'analyzer': 'word', 'binary': False, 'decode_...",stemmer
stemmer-countvectorizerdefault-linearsvc,CountVectorizerDefault,LinearSVC,0.781694,,,,"{'C': 0.1, 'class_weight': None, 'dual': True,...","{'analyzer': 'word', 'binary': False, 'decode_...",stemmer
stemmer-countvectorizerdefault-logisticregression-l2,CountVectorizerDefault,LogisticRegression-l2,0.783607,,,,"{'C': 1, 'class_weight': None, 'dual': False, ...","{'analyzer': 'word', 'binary': False, 'decode_...",stemmer
stemmer-countvectorizerdefault-multinomialnb,CountVectorizerDefault,MultinomialNB,0.75321,,,,"{'alpha': 0.5, 'class_prior': None, 'fit_prior...","{'analyzer': 'word', 'binary': False, 'decode_...",stemmer
stemmer-countvectorizerdefault-perceptron,CountVectorizerDefault,Perceptron,0.745355,,,,"{'alpha': 0.0001, 'class_weight': None, 'eta0'...","{'analyzer': 'word', 'binary': False, 'decode_...",stemmer


In [5]:
# The label file has no header row: column 0 becomes the index and column 1
# holds the label values.
y_frame = pd.read_csv("y.csv", index_col=0, header=None)
# Keep only the label column, as a NumPy array, under the name the
# evaluation loop expects.
y = y_frame[1].values

In [6]:
# Label values of the three classes.
# NOTE(review): presumably -1 = negative, 0 = neutral, 1 = positive, matching
# the column order used for the confusion matrices - confirm. This list is not
# referenced elsewhere in the visible part of the notebook.
class_labels = [-1, 0, 1]

In [7]:
# Metric columns of the results frame that the evaluation loop writes.
df_storage_keys = ['f1', 'recall', 'precision']
# Blank them out on the loaded results frame before evaluation.
for k in df_storage_keys:
    df[k] = None

ONLY RUN THIS SECTION IF YOU DON'T ALREADY HAVE THE CACHED RESULTS (`part4.csv`, `class_reports/`, `conf_matrices/`)

In [8]:
# CSV file that caches the precision / recall / F1 table; `results_exist`
# reads it to decide whether a configuration still needs to be evaluated.
part4csv = "part4.csv"

In [9]:
def results_exist(i, row, storage_key, csv_path=None):
    """Report whether cached evaluation results exist for one model run.

    Results count as present only when all of the following hold: the results
    CSV exists, it has a row labelled ``i`` whose ``precision``, ``recall`` and
    ``f1`` are all non-null, and both the pickled classification report and the
    pickled confusion matrix for ``storage_key`` exist on disk.

    Args:
        i: Index label (``name``) of the run in the results CSV.
        row: The run's row; unused, kept for interface compatibility.
        storage_key: Base file name of the run's report / matrix pickles.
        csv_path: Results CSV to inspect. Defaults to the notebook-level
            ``part4csv``.

    Returns:
        bool: True if nothing needs to be recomputed for this run.
    """
    if csv_path is None:
        csv_path = part4csv

    if not os.path.exists(csv_path):
        return False

    cached = pd.read_csv(csv_path, index_col='name')

    # A run added after the CSV was written has no row yet; that means
    # "not computed", not an error (a bare .loc[i] would raise KeyError).
    if i not in cached.index:
        return False

    entry = cached.loc[i]
    if any(pd.isnull(entry[metric]) for metric in ('precision', 'recall', 'f1')):
        return False

    path1 = os.path.join('class_reports', '%s.class_report' %storage_key)
    path2 = os.path.join('conf_matrices', '%s.conf_matrix' %storage_key)
    return os.path.exists(path1) and os.path.exists(path2)

In [10]:
# Evaluate every stored (transformer, model) configuration on a fixed,
# stratified 80/20 split and cache the weighted precision / recall / F1, the
# classification report and the confusion matrix. Configurations whose results
# are already cached are not recomputed.

# model_name -> estimator class (instantiated with defaults, then configured
# with the stored parameters).
MODEL_CLASSES = {
    'DecisionTreeClassifier': DecisionTreeClassifier,
    'LogisticRegression-l2': LogisticRegression,
    'LinearSVC': LinearSVC,
    'Perceptron': Perceptron,
    'RandomForestClassifier': RandomForestClassifier,
    'MultinomialNB': MultinomialNB,
}

# transformer_name -> vectorizer class this notebook can rebuild. 'doc2vec' is
# deliberately absent: no doc2vec transformer is constructed in this notebook.
VECTORIZER_CLASSES = {
    'TfidfVectorizerDefault': TfidfVectorizer,
    'CountVectorizerDefault': CountVectorizer,
}

METRIC_COLUMNS = ('f1', 'recall', 'precision')

for i, row in df.iterrows():
    transformer_name = row['transformer_name']
    model_name = row['model_name']

    # obtain params
    if transformer_name == 'doc2vec' and model_name != 'RandomForestClassifier':
        try:
            m_params = row.model[model_name].get_params()
        except Exception:
            # Fallback for rows whose `model` entry is stored as a string.
            # NOTE(review): eval executes arbitrary code; only safe because the
            # value comes from result files this project wrote itself.
            m_params = eval(row.model)['steps'][0][1].get_params()
    elif transformer_name == 'doc2vec':
        m_params = row['model']
    else:
        # NOTE(review): eval on stored data - see the note above.
        m_params = eval(row['model'])

    # build model
    if model_name not in MODEL_CLASSES:
        raise ValueError('Unknown model_name %r for run %r' % (model_name, i))
    model = MODEL_CLASSES[model_name]()
    model.set_params(**m_params)

    storage_key = '%s-%s' %(transformer_name, model_name)

    if results_exist(i, row, storage_key):
        # Carry the cached metrics over into `df`; otherwise the next
        # df.to_csv below would overwrite them with the blanked-out values.
        cached_row = pd.read_csv(part4csv, index_col='name').loc[i]
        for metric in METRIC_COLUMNS:
            df.loc[i, metric] = cached_row[metric]
        print('%s exists' %storage_key)
        continue

    # trans. params
    vectorizer_class = VECTORIZER_CLASSES.get(transformer_name)
    if vectorizer_class is None:
        # Previously the vectorizer left over from the preceding iteration was
        # silently reused here, producing metrics for the wrong features.
        print('Skipping %s: transformer %r cannot be rebuilt in this notebook'
              % (storage_key, transformer_name))
        continue

    trans = vectorizer_class()
    X = trans.fit_transform(X_raw)

    # Fixed seed + stratification keep the split identical for every run.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print('Transformer = %s' %transformer_name)
    print('Model = %s' %model_name)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    classreport = classification_report(y_test, y_pred)
    confmatrix = confusion_matrix(y_test, y_pred)
    print("precision = %s" %precision)
    print("recall = %s" %recall)
    print("f1 = %s" %f1)
    print("confusion matrix = \n%s" %confmatrix)
    print("classification report = \n%s" %classreport)
    print("="*20)

    df.loc[i, 'f1'] = f1
    df.loc[i, 'recall'] = recall
    df.loc[i, 'precision'] = precision
    # Same file that results_exist reads.
    df.to_csv(part4csv, encoding="utf8")

    os.makedirs('class_reports', exist_ok=True)
    os.makedirs('conf_matrices', exist_ok=True)

    with open(os.path.join('class_reports', '%s.class_report' %storage_key), 'wb') as fh:
        pickle.dump(classreport, fh)

    # Stored as a plain nested list rather than a NumPy array.
    with open(os.path.join('conf_matrices', '%s.conf_matrix' %storage_key), 'wb') as fh:
        pickle.dump(confmatrix.tolist(), fh)

    print('%s exists' %storage_key)


CountVectorizerDefault-DecisionTreeClassifier exists
CountVectorizerDefault-LinearSVC exists
CountVectorizerDefault-LogisticRegression-l2 exists
CountVectorizerDefault-MultinomialNB exists
CountVectorizerDefault-Perceptron exists
CountVectorizerDefault-RandomForestClassifier exists
doc2vec-DecisionTreeClassifier exists
doc2vec-LinearSVC exists
doc2vec-LogisticRegression-l2 exists
doc2vec-Perceptron exists
doc2vec-RandomForestClassifier exists
TfidfVectorizerDefault-DecisionTreeClassifier exists
TfidfVectorizerDefault-LinearSVC exists
TfidfVectorizerDefault-LogisticRegression-l2 exists
TfidfVectorizerDefault-MultinomialNB exists
TfidfVectorizerDefault-Perceptron exists
TfidfVectorizerDefault-RandomForestClassifier exists


## PRECISION, RECALL AND F1 TABLE

In [11]:
# Reload the cached metrics table written by the evaluation loop, indexed by
# run name; this rebinds `df`.
df = pd.read_csv("part4.csv", index_col='name')

In [12]:
# Score and metric columns for every configuration, highest F1 first.
df[['transformer_name', 'model_name', 'score', 'f1', 'recall', 'precision']].sort_values('f1',ascending=False)

Unnamed: 0_level_0,transformer_name,model_name,score,f1,recall,precision
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
stemmer-countvectorizerdefault-logisticregression-l2,CountVectorizerDefault,LogisticRegression-l2,0.783607,0.790633,0.795082,0.78983
stemmer-countvectorizerdefault-linearsvc,CountVectorizerDefault,LinearSVC,0.781694,0.78843,0.792691,0.78716
stemmer-tfidfvectorizerdefault-logisticregression-l2,TfidfVectorizerDefault,LogisticRegression-l2,0.771995,0.782844,0.788934,0.782689
stemmer-tfidfvectorizerdefault-linearsvc,TfidfVectorizerDefault,LinearSVC,0.769262,0.777532,0.78347,0.776689
stemmer-tfidfvectorizerdefault-randomforestclassifier,TfidfVectorizerDefault,RandomForestClassifier,0.767623,0.765465,0.779372,0.771714
stemmer-countvectorizerdefault-randomforestclassifier,CountVectorizerDefault,RandomForestClassifier,0.767008,0.76228,0.771516,0.762299
stemmer-countvectorizerdefault-multinomialnb,CountVectorizerDefault,MultinomialNB,0.75321,0.762075,0.769809,0.759313
stemmer-tfidfvectorizerdefault-perceptron,TfidfVectorizerDefault,Perceptron,0.726844,0.75362,0.755464,0.752151
stemmer-doc2vec-logisticregression-l2,doc2vec,LogisticRegression-l2,0.627937,0.751548,0.751366,0.751914
stemmer-countvectorizerdefault-perceptron,CountVectorizerDefault,Perceptron,0.745355,0.75038,0.751366,0.750385


In [13]:
# The same table, highest F1 first, printed as LaTeX source.
print(df[['transformer_name', 'model_name', 'score', 'f1', 'recall', 'precision']].sort_values('f1',ascending=False).to_latex())

\begin{tabular}{lllrrrr}
\toprule
{} &        transformer\_name &              model\_name &     score &        f1 &    recall &  precision \\
name                                               &                         &                         &           &           &           &            \\
\midrule
stemmer-countvectorizerdefault-logisticregressi... &  CountVectorizerDefault &   LogisticRegression-l2 &  0.783607 &  0.790633 &  0.795082 &   0.789830 \\
stemmer-countvectorizerdefault-linearsvc           &  CountVectorizerDefault &               LinearSVC &  0.781694 &  0.788430 &  0.792691 &   0.787160 \\
stemmer-tfidfvectorizerdefault-logisticregressi... &  TfidfVectorizerDefault &   LogisticRegression-l2 &  0.771995 &  0.782844 &  0.788934 &   0.782689 \\
stemmer-tfidfvectorizerdefault-linearsvc           &  TfidfVectorizerDefault &               LinearSVC &  0.769262 &  0.777532 &  0.783470 &   0.776689 \\
stemmer-tfidfvectorizerdefault-randomforestclas... &  TfidfVectorizerDefa

## CONFUSION MATRICES APPENDIX

In [14]:
# Print every cached confusion matrix as a LaTeX table.
# NOTE(review): the label order assumes the matrix rows/columns correspond to
# the classes -1, 0, 1 in that order - confirm.
conf_matrix_labels = ['negative', 'neutral', 'positive']
# Sorted so the output order does not depend on directory listing order.
conf_matr_f = sorted(glob(os.path.join('conf_matrices', "*.conf_matrix")))
for f in conf_matr_f:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f, 'rb') as fh:
        data = pickle.load(fh)
    # Use the bare file name as the title: with the directory prefix the title
    # contains a path separator (a backslash on Windows) and an underscore,
    # neither of which is valid inside \textbf{...}.
    title = os.path.basename(f).split('.conf_matrix')[0]
    print("\\textbf{%s}" %title)
    print("")

    # Separate name so the results table bound to `df` is not overwritten.
    conf_df = pd.DataFrame(data, index=conf_matrix_labels, columns=conf_matrix_labels)
    print(conf_df.to_latex())

    print("")
    print("")

\textbf{conf_matrices\CountVectorizerDefault-DecisionTreeClassifier}

\begin{tabular}{lrrr}
\toprule
{} &  negative &  neutral &  positive \\
\midrule
negative &      1766 &       13 &        57 \\
neutral  &       549 &       29 &        42 \\
positive &       213 &        8 &       251 \\
\bottomrule
\end{tabular}



\textbf{conf_matrices\CountVectorizerDefault-LinearSVC}

\begin{tabular}{lrrr}
\toprule
{} &  negative &  neutral &  positive \\
\midrule
negative &      1652 &      135 &        49 \\
neutral  &       205 &      366 &        49 \\
positive &        98 &       71 &       303 \\
\bottomrule
\end{tabular}



\textbf{conf_matrices\CountVectorizerDefault-LogisticRegression-l2}

\begin{tabular}{lrrr}
\toprule
{} &  negative &  neutral &  positive \\
\midrule
negative &      1659 &      133 &        44 \\
neutral  &       205 &      369 &        46 \\
positive &        99 &       73 &       300 \\
\bottomrule
\end{tabular}



\textbf{conf_matrices\CountVectorizerDefault-Multin

## CLASSIFICATION REPORTS

In [15]:
# Print every cached classification report inside a fenced block, preceded by
# a LaTeX bold title.
# Sorted so the output order does not depend on directory listing order.
class_rep_f = sorted(glob(os.path.join('class_reports', "*.class_report")))
for f in class_rep_f:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f, 'rb') as fh:
        data = pickle.load(fh)
    # Use the bare file name as the title: with the directory prefix the title
    # contains a path separator (a backslash on Windows) and an underscore,
    # neither of which is valid inside \textbf{...}.
    title = os.path.basename(f).split('.class_report')[0]
    print("\\textbf{%s}" %title)
    print("")

    print("```")
    print(data)
    print("```")

    print("")
    print("")

\textbf{class_reports\CountVectorizerDefault-DecisionTreeClassifier}

```
             precision    recall  f1-score   support

         -1       0.70      0.96      0.81      1836
          0       0.58      0.05      0.09       620
          1       0.72      0.53      0.61       472

avg / total       0.68      0.70      0.62      2928

```


\textbf{class_reports\CountVectorizerDefault-LinearSVC}

```
             precision    recall  f1-score   support

         -1       0.85      0.90      0.87      1836
          0       0.64      0.59      0.61       620
          1       0.76      0.64      0.69       472

avg / total       0.79      0.79      0.79      2928

```


\textbf{class_reports\CountVectorizerDefault-LogisticRegression-l2}

```
             precision    recall  f1-score   support

         -1       0.85      0.90      0.87      1836
          0       0.64      0.60      0.62       620
          1       0.77      0.64      0.70       472

avg / total       0.79      0.