**NLP**

*CC6205-1 - Otoño 2022*

Autor: Felipe Urrutia Vargas

In [21]:
# !pip install --upgrade pandas

import pandas as pd
import numpy as np
# pd.set_option("max_rows", None)
import pickle
from string import punctuation
import re

from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt
import seaborn as sns
# from astropy.visualization import hist
# Global plotting defaults: apply the seaborn "whitegrid" theme first, then
# layer the matplotlib overrides on top of it in a single update.
sns.set_theme(style="whitegrid")
plt.rcParams.update({
    "axes.titlesize": 16,
    "legend.fontsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "font.size": 16,
    "figure.figsize": (10, 6),
    "lines.markeredgewidth": 1,
    "errorbar.capsize": 2,
})
import random

import plotly.express as px
import time
import datetime

from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_recall_fscore_support

import os
import shutil

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix, cohen_kappa_score, classification_report, accuracy_score, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
# Emotion categories; used below to filter rows of df_train by its "sen" column.
sentiments = ["anger", "fear", "joy", "sadness"]
# Intensity labels. The order matters: the list position is used below to
# decode the argmax of predicted probabilities (low=0, medium=1, high=2).
intensities = ["low", "medium", "high"]

In [65]:
# Hard-coded baseline scores, one row per sentiment (index named "sen"),
# kept as a reference table to compare the fitted models against.
summary_baseline = pd.DataFrame(
    {
        "auc": [0.62, 0.67, 0.65, 0.67],
        "kappa": [0.07, 0.15, 0.18, 0.19],
        "accuracy": [0.63, 0.57, 0.54, 0.55],
    },
    index=pd.Index(sentiments, name="sen"),
)

In [4]:
# Load the training frame (later cells read its "sen" and "int" columns).
# The context manager closes the file handle, which the bare
# pickle.load(open(...)) form leaked.
# NOTE(review): pickle can execute arbitrary code on load; only use trusted files.
with open("df_train.pickle", "rb") as fh:
    df_train = pickle.load(fh)

In [102]:
# Load the three versions of the feature representation. Each file is opened
# in a context manager so its handle is closed right after unpickling.
# NOTE(review): pickle can execute arbitrary code on load; only use trusted files.
with open("df_representation_v1.pickle", "rb") as fh:
    df_representation = pickle.load(fh)
with open("df_representation_v2.pickle", "rb") as fh:
    df_representation_v2 = pickle.load(fh)
with open("df_representation_v3.pickle", "rb") as fh:
    df_representation_v3 = pickle.load(fh)

In [6]:
# Group the representation's column names by attribute family: a column
# belongs to a family when its name contains "<family><&>".
partition_attrib = {
    kind: [column for column in df_representation.columns if f"{kind}<&>" in column]
    for kind in ("retro", "punct", "emoji", "linguistics")
}

In [8]:
# Number of columns in each attribute family.
("summary type attrib", {family: len(columns) for family, columns in partition_attrib.items()})

('summary type attrib',
 {'retro': 13, 'punct': 7, 'emoji': 190, 'linguistics': 11540})

In [42]:
def auc_score(test_set, predicted_set):
    """Support-weighted one-vs-rest ROC AUC over 'low', 'medium' and 'high'.

    Parameters
    ----------
    test_set : array-like of str
        True labels. Any array-like is accepted (list, ndarray, Series); it is
        coerced to an ndarray so the label comparison is element-wise.
    predicted_set : array-like with three columns
        Per-sample scores whose columns are ordered (low, medium, high).

    Returns
    -------
    float
        Sum of the per-class AUCs, each weighted by the number of true samples
        of that class, divided by the total number of such samples.

    Raises
    ------
    ValueError
        If no sample in ``test_set`` is labelled 'low', 'medium' or 'high'.
        ``roc_auc_score`` may also raise when a class that is present has no
        negative examples.
    """
    y_true = np.asarray(test_set)
    scores = np.asarray(predicted_set)

    weighted_sum = 0.0
    total_support = 0.0
    for column, label in enumerate(('low', 'medium', 'high')):
        is_label = np.where(y_true == label, 1.0, 0.0)
        support = is_label.sum()
        if support == 0:
            # A class missing from this split has weight 0 in the average,
            # and its one-vs-rest AUC is undefined, so it is skipped.
            continue
        weighted_sum += support * roc_auc_score(is_label, scores[:, column])
        total_support += support

    if total_support == 0:
        raise ValueError(
            "test_set contains none of the labels 'low', 'medium', 'high'")
    return weighted_sum / total_support


def evaulate(predicted_probabilities, y_test, labels, dataset_name):
    """Print a classification report and return [auc, kappa, accuracy].

    NOTE: the name is a misspelling of "evaluate"; it is kept unchanged so
    existing callers keep working.

    Parameters
    ----------
    predicted_probabilities : array-like, one row per sample
        Class probabilities, with columns in the order given by ``labels``.
    y_test : array-like of str
        True labels.
    labels : array-like of str
        Class order learned by the classifier. Important: pass the
        classifier's own class array, which is commonly NOT
        ['low', 'medium', 'high']. Lists and tuples are accepted as well as
        ndarrays.
    dataset_name : str
        Name shown in the printed confusion-matrix header.

    Returns
    -------
    numpy.ndarray
        ``[auc, kappa, accuracy]``, each rounded to 3 decimals.

    Raises
    ------
    ValueError
        If 'low', 'medium' or 'high' is missing from ``labels``.
    """
    # Coerce inputs so plain lists work too: the original called
    # labels.tolist() and used 2-D fancy indexing, both ndarray-only.
    class_order = np.asarray(labels).tolist()
    probabilities = np.asarray(predicted_probabilities)

    # Most probable class per row, decoded with the classifier's class order.
    predicted_labels = [class_order[i] for i in probabilities.argmax(axis=1)]

    print('Confusion Matrix for {}:\n'.format(dataset_name))
    print(
        confusion_matrix(y_test,
                         predicted_labels,
                         labels=['low', 'medium', 'high']))

    print('\nClassification Report:\n')
    print(
        classification_report(y_test,
                              predicted_labels,
                              labels=['low', 'medium', 'high']))

    # Reorder the probability columns to (low, medium, high), the order
    # auc_score expects.
    probabilities = probabilities[:, [
        class_order.index('low'),
        class_order.index('medium'),
        class_order.index('high')
    ]]
    auc = round(auc_score(y_test, probabilities), 3)
    print("Scores:\n\nAUC: ", auc, end='\t')
    kappa = round(cohen_kappa_score(y_test, predicted_labels), 3)
    print("Kappa:", kappa, end='\t')
    accuracy = round(accuracy_score(y_test, predicted_labels), 3)
    print("Accuracy:", accuracy)
    print('------------------------------------------------------\n')
    return np.array([auc, kappa, accuracy])

In [104]:
# Repeated hold-out evaluation: for every sentiment, fit a scaled SVM on
# N_fits random 80/20 splits and record AUC, kappa and accuracy per split.
metrics = "auc kappa accuracy".split()
summary = {sen: {l: [] for l in metrics} for sen in sentiments}
N_fits = 100

# Seeded generator so the sequence of split seeds (and therefore the whole
# experiment) is reproducible; the original drew them from the unseeded
# global RNG.
SPLIT_SEED = 6205
split_rng = np.random.default_rng(SPLIT_SEED)

# Encode labels with the same ordering used to decode argmax below, so the
# two can never drift apart.
intensity_to_code = {label: code for code, label in enumerate(intensities)}

for sen in sentiments:
    # Everything down to X / y depends only on the sentiment, so it is
    # built once here instead of once per fit.
    indexs = df_train[df_train["sen"] == sen].index
    # NOTE(review): pickle can execute arbitrary code on load; only use trusted files.
    with open(f"cols_selected_{sen}_v3.pickle", "rb") as fh:
        cols_selected_sen = pickle.load(fh)

    X = df_representation_v3.loc[indexs][cols_selected_sen]
    y = df_train.loc[X.index]["int"]

    for _ in range(N_fits):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2,
            random_state=int(split_rng.integers(0, 2**32 - 1)))
        y_train = y_train.map(intensity_to_code)

        clf = make_pipeline(StandardScaler(), SVC(gamma='auto', class_weight="balanced", probability=True))
        clf.fit(X_train, y_train)

        # predict_proba columns follow clf.classes_, i.e. codes 0, 1, 2 when
        # every intensity occurs in the training split; y_test stays as
        # strings because auc_score compares against the label names.
        y_pred = clf.predict_proba(X_test)
        predicted_labels = [
            intensities[np.argmax(item)] for item in y_pred
        ]

        auc = round(auc_score(y_test, y_pred), 3)
        print(f"[{sen}] Scores:\n AUC: ", auc, end='\t')
        kappa = round(cohen_kappa_score(y_test, predicted_labels), 3)
        print("Kappa:", kappa, end='\t')
        accuracy = round(accuracy_score(y_test, predicted_labels), 3)
        print("Accuracy:", accuracy, end="\n\n")

        summary[sen]["auc"].append(auc)
        summary[sen]["kappa"].append(kappa)
        summary[sen]["accuracy"].append(accuracy)

[anger] Scores:
 AUC:  0.744	Kappa: 0.278	Accuracy: 0.661

[anger] Scores:
 AUC:  0.773	Kappa: 0.349	Accuracy: 0.698

[anger] Scores:
 AUC:  0.732	Kappa: 0.222	Accuracy: 0.651

[anger] Scores:
 AUC:  0.663	Kappa: 0.211	Accuracy: 0.651

[anger] Scores:
 AUC:  0.814	Kappa: 0.348	Accuracy: 0.704

[anger] Scores:
 AUC:  0.751	Kappa: 0.334	Accuracy: 0.735

[anger] Scores:
 AUC:  0.761	Kappa: 0.281	Accuracy: 0.704

[anger] Scores:
 AUC:  0.758	Kappa: 0.321	Accuracy: 0.688

[anger] Scores:
 AUC:  0.787	Kappa: 0.332	Accuracy: 0.683

[anger] Scores:
 AUC:  0.785	Kappa: 0.324	Accuracy: 0.688

[anger] Scores:
 AUC:  0.79	Kappa: 0.318	Accuracy: 0.709

[anger] Scores:
 AUC:  0.705	Kappa: 0.25	Accuracy: 0.656

[anger] Scores:
 AUC:  0.751	Kappa: 0.294	Accuracy: 0.64

[anger] Scores:
 AUC:  0.789	Kappa: 0.429	Accuracy: 0.783

[anger] Scores:
 AUC:  0.694	Kappa: 0.258	Accuracy: 0.656

[anger] Scores:
 AUC:  0.778	Kappa: 0.384	Accuracy: 0.72

[anger] Scores:
 AUC:  0.719	Kappa: 0.263	Accuracy: 0.646

[

[fear] Scores:
 AUC:  0.753	Kappa: 0.363	Accuracy: 0.623

[fear] Scores:
 AUC:  0.788	Kappa: 0.385	Accuracy: 0.655

[fear] Scores:
 AUC:  0.738	Kappa: 0.384	Accuracy: 0.655

[fear] Scores:
 AUC:  0.772	Kappa: 0.405	Accuracy: 0.679

[fear] Scores:
 AUC:  0.724	Kappa: 0.356	Accuracy: 0.643

[fear] Scores:
 AUC:  0.73	Kappa: 0.392	Accuracy: 0.667

[fear] Scores:
 AUC:  0.733	Kappa: 0.368	Accuracy: 0.635

[fear] Scores:
 AUC:  0.755	Kappa: 0.371	Accuracy: 0.647

[fear] Scores:
 AUC:  0.764	Kappa: 0.388	Accuracy: 0.631

[fear] Scores:
 AUC:  0.752	Kappa: 0.369	Accuracy: 0.639

[fear] Scores:
 AUC:  0.795	Kappa: 0.355	Accuracy: 0.639

[fear] Scores:
 AUC:  0.788	Kappa: 0.445	Accuracy: 0.698

[fear] Scores:
 AUC:  0.79	Kappa: 0.434	Accuracy: 0.667

[fear] Scores:
 AUC:  0.773	Kappa: 0.442	Accuracy: 0.675

[fear] Scores:
 AUC:  0.772	Kappa: 0.422	Accuracy: 0.667

[fear] Scores:
 AUC:  0.792	Kappa: 0.384	Accuracy: 0.647

[fear] Scores:
 AUC:  0.774	Kappa: 0.381	Accuracy: 0.651

[fear] Scores:
 

[joy] Scores:
 AUC:  0.778	Kappa: 0.436	Accuracy: 0.669

[joy] Scores:
 AUC:  0.767	Kappa: 0.329	Accuracy: 0.635

[joy] Scores:
 AUC:  0.787	Kappa: 0.421	Accuracy: 0.657

[joy] Scores:
 AUC:  0.832	Kappa: 0.461	Accuracy: 0.685

[joy] Scores:
 AUC:  0.813	Kappa: 0.471	Accuracy: 0.68

[joy] Scores:
 AUC:  0.806	Kappa: 0.506	Accuracy: 0.696

[joy] Scores:
 AUC:  0.794	Kappa: 0.421	Accuracy: 0.657

[joy] Scores:
 AUC:  0.789	Kappa: 0.404	Accuracy: 0.646

[joy] Scores:
 AUC:  0.806	Kappa: 0.422	Accuracy: 0.674

[joy] Scores:
 AUC:  0.834	Kappa: 0.524	Accuracy: 0.729

[joy] Scores:
 AUC:  0.816	Kappa: 0.511	Accuracy: 0.729

[joy] Scores:
 AUC:  0.726	Kappa: 0.397	Accuracy: 0.641

[joy] Scores:
 AUC:  0.818	Kappa: 0.516	Accuracy: 0.718

[joy] Scores:
 AUC:  0.814	Kappa: 0.455	Accuracy: 0.685

[joy] Scores:
 AUC:  0.778	Kappa: 0.384	Accuracy: 0.657

[sadness] Scores:
 AUC:  0.686	Kappa: 0.215	Accuracy: 0.57

[sadness] Scores:
 AUC:  0.739	Kappa: 0.251	Accuracy: 0.558

[sadness] Scores:
 AUC:  

In [105]:
# Persist the per-fit metrics. The context manager guarantees the file is
# flushed and closed, which the bare pickle.dump(..., open(...)) form did not.
with open("summary_v3.pickle", "wb") as fh:
    pickle.dump(summary, fh)

In [100]:
# Reload the saved metrics from summary_v1.pickle and show, per sentiment,
# the mean of each metric over all recorded fits (rounded to 3 decimals).
# NOTE: this rebinds `summary`, replacing whatever the fitting loop produced.
with open("summary_v1.pickle", "rb") as fh:
    summary = pickle.load(fh)
pd.DataFrame({l: {sen: round(np.mean(summary[sen][l]), 3) for sen in sentiments} for l in metrics})

Unnamed: 0,auc,kappa,accuracy
anger,0.726,0.281,0.678
fear,0.751,0.365,0.648
joy,0.788,0.41,0.658
sadness,0.718,0.292,0.592


In [101]:
# Reload the saved metrics from summary_v2.pickle and show, per sentiment,
# the mean of each metric over all recorded fits (rounded to 3 decimals).
# NOTE: this rebinds `summary`, replacing the previously loaded results.
with open("summary_v2.pickle", "rb") as fh:
    summary = pickle.load(fh)
pd.DataFrame({l: {sen: round(np.mean(summary[sen][l]), 3) for sen in sentiments} for l in metrics})

Unnamed: 0,auc,kappa,accuracy
anger,0.767,0.312,0.7
fear,0.781,0.371,0.652
joy,0.802,0.432,0.674
sadness,0.77,0.357,0.631


In [106]:
# Reload the saved metrics from summary_v3.pickle and show, per sentiment,
# the mean of each metric over all recorded fits (rounded to 3 decimals).
# NOTE: this rebinds `summary`, replacing the previously loaded results.
with open("summary_v3.pickle", "rb") as fh:
    summary = pickle.load(fh)
pd.DataFrame({l: {sen: round(np.mean(summary[sen][l]), 3) for sen in sentiments} for l in metrics})

Unnamed: 0,auc,kappa,accuracy
anger,0.747,0.309,0.684
fear,0.764,0.378,0.652
joy,0.791,0.426,0.666
sadness,0.711,0.29,0.594


In [95]:
# Display the hard-coded baseline table for comparison with the mean scores shown above.
summary_baseline

Unnamed: 0_level_0,auc,kappa,accuracy
sen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
anger,0.62,0.07,0.63
fear,0.67,0.15,0.57
joy,0.65,0.18,0.54
sadness,0.67,0.19,0.55
