**NLP**

*CC6205-1 - Otoño 2022*

Autor: Felipe Urrutia Vargas

In [2]:
# !pip install --upgrade pandas

import pandas as pd
import numpy as np
# pd.set_option("max_rows", None)
import pickle
from string import punctuation
import re

from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt
import seaborn as sns
# from astropy.visualization import hist
# Global plotting defaults: apply seaborn's whitegrid theme first, then layer
# the matplotlib overrides on top of it in a single rcParams update.
sns.set_theme(style="whitegrid")
plt.rcParams.update({
    'axes.titlesize': 16,
    'legend.fontsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'font.size': 16,
    'figure.figsize': (10, 6),
    'lines.markeredgewidth': 1,
    'errorbar.capsize': 2,
})
import random

import plotly.express as px
import time
import datetime

from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_recall_fscore_support

import os
import shutil

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix, cohen_kappa_score, classification_report, accuracy_score, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
# Sentiment names iterated over by the experiments, and the intensity classes
# in index order (position 0 = low, 1 = medium, 2 = high).
sentiments = ["anger", "fear", "joy", "sadness"]
intensities = ["low", "medium", "high"]

In [4]:
# Baseline AUC / kappa / accuracy per sentiment, indexed by sentiment name
# (index is named "sen"). Values are listed in the order of `sentiments`.
summary_baseline = pd.DataFrame(
    {
        "auc": [0.62, 0.67, 0.65, 0.67],
        "kappa": [0.07, 0.15, 0.18, 0.19],
        "accuracy": [0.63, 0.57, 0.54, 0.55],
    },
    index=pd.Index(sentiments, name="sen"),
)

In [5]:
# Load the training data. The context manager guarantees the file handle is
# closed as soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("df_train.pickle", "rb") as fh:
    df_train = pickle.load(fh)

In [10]:
# Load the four versions of the feature representation. Each file is opened in
# a context manager so its handle is closed right after reading.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("df_representation_v1.pickle", "rb") as fh:
    df_representation = pickle.load(fh)
with open("df_representation_v2.pickle", "rb") as fh:
    df_representation_v2 = pickle.load(fh)
with open("df_representation_v3.pickle", "rb") as fh:
    df_representation_v3 = pickle.load(fh)
with open("df_representation_v4.pickle", "rb") as fh:
    df_representation_v4 = pickle.load(fh)

In [13]:
# Group the v4 representation's columns by attribute family: a column belongs
# to a family when its name contains "<family><&>".
partition_attrib = {
    family: [col for col in df_representation_v4.columns if f"{family}<&>" in col]
    for family in ("retro", "punct", "emoji", "linguistics")
}

In [14]:
# Number of columns collected for each attribute family.
("summary type attrib", {family: len(columns) for family, columns in partition_attrib.items()})

('summary type attrib',
 {'retro': 13, 'punct': 7, 'emoji': 190, 'linguistics': 11540})

In [15]:
def auc_score(test_set, predicted_set):
    """Support-weighted average of the one-vs-rest ROC AUC of each intensity class.

    Args:
        test_set: True labels given as the strings 'low', 'medium' and 'high';
            it is compared element-wise with ``==`` against each class name.
        predicted_set: Iterable of per-sample score rows in which positions
            0, 1 and 2 hold the scores for 'low', 'medium' and 'high'.

    Returns:
        The three per-class AUCs averaged with weights equal to the number of
        true samples of each class.
    """
    class_names = ('low', 'medium', 'high')

    # 1.0 / 0.0 indicator of membership in each class.
    is_class = {name: np.where(test_set == name, 1.0, 0.0) for name in class_names}
    # Position `column` of every row carries the score of the matching class.
    class_scores = {
        name: np.array([row[column] for row in predicted_set])
        for column, name in enumerate(class_names)
    }

    per_class_auc = {
        name: roc_auc_score(is_class[name], class_scores[name])
        for name in ('high', 'medium', 'low')
    }
    support = {name: is_class[name].sum() for name in class_names}

    weighted_total = (support['low'] * per_class_auc['low']
                      + support['medium'] * per_class_auc['medium']
                      + support['high'] * per_class_auc['high'])
    return weighted_total / (support['low'] + support['medium'] + support['high'])


def evaulate(predicted_probabilities, y_test, labels, dataset_name):
    """Print an evaluation report for one dataset and return its scores.

    The misspelled function name is kept because renaming it would change the
    public interface.

    Args:
        predicted_probabilities: 2-D array-like of per-class scores, one row
            per sample, with columns ordered like ``labels``.
        y_test: True labels, using the strings 'low', 'medium' and 'high'.
        labels: Class order learned by the classifier (commonly different
            from ['low', 'medium', 'high']). A NumPy array or a plain
            list/tuple is accepted; it must contain all three class names.
        dataset_name: Name shown in the confusion-matrix heading.

    Returns:
        np.ndarray ``[auc, kappa, accuracy]``, each rounded to 3 decimals.

    Raises:
        ValueError: if 'low', 'medium' or 'high' is missing from ``labels``.
    """
    # Normalise the inputs so plain lists work as well as NumPy arrays.
    probabilities = np.asarray(predicted_probabilities)
    class_order = np.asarray(labels)

    # Important: when turning probability rows into class names, index into
    # the class order learned by the classifier, not a hard-coded order.
    predicted_labels = class_order[np.argmax(probabilities, axis=1)].tolist()

    print('Confusion Matrix for {}:\n'.format(dataset_name))
    print(
        confusion_matrix(y_test,
                         predicted_labels,
                         labels=['low', 'medium', 'high']))

    print('\nClassification Report:\n')
    print(
        classification_report(y_test,
                              predicted_labels,
                              labels=['low', 'medium', 'high']))

    # Reorder the probability columns to low / medium / high, the column
    # order that auc_score expects.
    ordered_names = class_order.tolist()
    probabilities = probabilities[:, [
        ordered_names.index('low'),
        ordered_names.index('medium'),
        ordered_names.index('high')
    ]]

    auc = round(auc_score(y_test, probabilities), 3)
    print("Scores:\n\nAUC: ", auc, end='\t')
    kappa = round(cohen_kappa_score(y_test, predicted_labels), 3)
    print("Kappa:", kappa, end='\t')
    accuracy = round(accuracy_score(y_test, predicted_labels), 3)
    print("Accuracy:", accuracy)
    print('------------------------------------------------------\n')
    return np.array([auc, kappa, accuracy])

In [26]:
# Repeated hold-out evaluation: for every sentiment, fit N_fits SVMs on random
# 80/20 splits and collect the AUC, kappa and accuracy of each fit.
metrics = "auc kappa accuracy".split()
summary = {sen: {metric: [] for metric in metrics} for sen in sentiments}
N_fits = 100

# Seed NumPy's global generator so the split seeds drawn below are the same on
# every run of this cell (per scikit-learn's docs, estimators left with
# random_state=None also draw from this generator).
np.random.seed(0)

# Integer code of each intensity = its position in `intensities`, so the
# argmax over the predicted-probability columns decodes through the same list.
intensity_codes = {label: code for code, label in enumerate(intensities)}

for sen in sentiments:
    # Rows, selected columns, features and targets depend only on the
    # sentiment, so they are built once instead of on every fit.
    indexs = df_train[df_train["sen"] == sen].index
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(f"cols_selected_{sen}_v1.pickle", "rb") as fh:
        cols_selected_sen = pickle.load(fh)

    X = df_representation.loc[indexs][cols_selected_sen]
    y = df_train.loc[X.index]["int"]

    for _ in range(N_fits):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=np.random.randint(1, X.shape[0]))
        # Train on integer codes; y_test keeps its string labels for scoring.
        y_train = y_train.replace(intensity_codes)
        clf = make_pipeline(StandardScaler(), SVC(gamma='auto', class_weight="balanced", probability=True))
        clf.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_test)
        predicted_labels = [
            intensities[np.argmax(item)] for item in y_pred
        ]
        auc = round(auc_score(y_test, y_pred), 3)
        print(f"[{sen}] Scores:\n AUC: ", auc, end='\t')
        kappa = round(cohen_kappa_score(y_test, predicted_labels), 3)
        print("Kappa:", kappa, end='\t')
        accuracy = round(accuracy_score(y_test, predicted_labels), 3)
        print("Accuracy:", accuracy, end="\n\n")

        summary[sen]["auc"].append(auc)
        summary[sen]["kappa"].append(kappa)
        summary[sen]["accuracy"].append(accuracy)

[anger] Scores:
 AUC:  0.736	Kappa: 0.358	Accuracy: 0.725

[anger] Scores:
 AUC:  0.748	Kappa: 0.395	Accuracy: 0.704

[anger] Scores:
 AUC:  0.78	Kappa: 0.38	Accuracy: 0.725

[anger] Scores:
 AUC:  0.782	Kappa: 0.383	Accuracy: 0.751

[anger] Scores:
 AUC:  0.805	Kappa: 0.381	Accuracy: 0.73

[anger] Scores:
 AUC:  0.78	Kappa: 0.422	Accuracy: 0.762

[anger] Scores:
 AUC:  0.802	Kappa: 0.458	Accuracy: 0.772

[anger] Scores:
 AUC:  0.765	Kappa: 0.377	Accuracy: 0.746

[anger] Scores:
 AUC:  0.786	Kappa: 0.474	Accuracy: 0.751

[anger] Scores:
 AUC:  0.783	Kappa: 0.518	Accuracy: 0.783

[anger] Scores:
 AUC:  0.843	Kappa: 0.442	Accuracy: 0.767

[anger] Scores:
 AUC:  0.778	Kappa: 0.427	Accuracy: 0.73

[anger] Scores:
 AUC:  0.725	Kappa: 0.364	Accuracy: 0.698

[anger] Scores:
 AUC:  0.772	Kappa: 0.391	Accuracy: 0.714

[anger] Scores:
 AUC:  0.77	Kappa: 0.479	Accuracy: 0.778

[anger] Scores:
 AUC:  0.749	Kappa: 0.348	Accuracy: 0.714

[anger] Scores:
 AUC:  0.752	Kappa: 0.385	Accuracy: 0.735

[an

[fear] Scores:
 AUC:  0.81	Kappa: 0.47	Accuracy: 0.718

[fear] Scores:
 AUC:  0.799	Kappa: 0.417	Accuracy: 0.69

[fear] Scores:
 AUC:  0.801	Kappa: 0.473	Accuracy: 0.71

[fear] Scores:
 AUC:  0.786	Kappa: 0.452	Accuracy: 0.679

[fear] Scores:
 AUC:  0.757	Kappa: 0.422	Accuracy: 0.69

[fear] Scores:
 AUC:  0.809	Kappa: 0.486	Accuracy: 0.706

[fear] Scores:
 AUC:  0.759	Kappa: 0.399	Accuracy: 0.651

[fear] Scores:
 AUC:  0.765	Kappa: 0.392	Accuracy: 0.659

[fear] Scores:
 AUC:  0.832	Kappa: 0.46	Accuracy: 0.694

[fear] Scores:
 AUC:  0.755	Kappa: 0.396	Accuracy: 0.655

[fear] Scores:
 AUC:  0.742	Kappa: 0.364	Accuracy: 0.643

[fear] Scores:
 AUC:  0.811	Kappa: 0.485	Accuracy: 0.726

[fear] Scores:
 AUC:  0.722	Kappa: 0.318	Accuracy: 0.635

[fear] Scores:
 AUC:  0.789	Kappa: 0.448	Accuracy: 0.683

[fear] Scores:
 AUC:  0.804	Kappa: 0.521	Accuracy: 0.726

[fear] Scores:
 AUC:  0.821	Kappa: 0.481	Accuracy: 0.69

[fear] Scores:
 AUC:  0.806	Kappa: 0.451	Accuracy: 0.706

[fear] Scores:
 AUC: 

[joy] Scores:
 AUC:  0.851	Kappa: 0.574	Accuracy: 0.751

[joy] Scores:
 AUC:  0.822	Kappa: 0.485	Accuracy: 0.718

[joy] Scores:
 AUC:  0.809	Kappa: 0.493	Accuracy: 0.707

[joy] Scores:
 AUC:  0.819	Kappa: 0.492	Accuracy: 0.691

[joy] Scores:
 AUC:  0.802	Kappa: 0.405	Accuracy: 0.674

[joy] Scores:
 AUC:  0.78	Kappa: 0.413	Accuracy: 0.702

[joy] Scores:
 AUC:  0.807	Kappa: 0.467	Accuracy: 0.696

[joy] Scores:
 AUC:  0.795	Kappa: 0.452	Accuracy: 0.713

[joy] Scores:
 AUC:  0.761	Kappa: 0.459	Accuracy: 0.685

[joy] Scores:
 AUC:  0.826	Kappa: 0.466	Accuracy: 0.696

[joy] Scores:
 AUC:  0.847	Kappa: 0.482	Accuracy: 0.729

[joy] Scores:
 AUC:  0.826	Kappa: 0.455	Accuracy: 0.674

[joy] Scores:
 AUC:  0.799	Kappa: 0.464	Accuracy: 0.696

[joy] Scores:
 AUC:  0.749	Kappa: 0.374	Accuracy: 0.657

[joy] Scores:
 AUC:  0.771	Kappa: 0.374	Accuracy: 0.646

[sadness] Scores:
 AUC:  0.819	Kappa: 0.603	Accuracy: 0.785

[sadness] Scores:
 AUC:  0.782	Kappa: 0.43	Accuracy: 0.674

[sadness] Scores:
 AUC:  

In [27]:
# Persist the collected metrics. The context manager flushes and closes the
# file even if pickling fails part-way.
with open("summary_v1.pickle", "wb") as fh:
    pickle.dump(summary, fh)

In [28]:
# Reload the v1 metrics and show the mean of every metric per sentiment,
# rounded to 3 decimals. The file handle is closed right after reading.
with open("summary_v1.pickle", "rb") as fh:
    summary = pickle.load(fh)
pd.DataFrame({
    metric: {sen: round(np.mean(summary[sen][metric]), 3) for sen in sentiments}
    for metric in metrics
})

Unnamed: 0,auc,kappa,accuracy
anger,0.759,0.409,0.744
fear,0.785,0.435,0.688
joy,0.812,0.47,0.7
sadness,0.758,0.389,0.652


In [25]:
# Load the v2 metrics and show the mean of every metric per sentiment,
# rounded to 3 decimals. The file handle is closed right after reading.
# NOTE(review): no cell visible here writes summary_v2.pickle — presumably it
# comes from a separate run; confirm before relying on a fresh-kernel re-run.
with open("summary_v2.pickle", "rb") as fh:
    summary = pickle.load(fh)
pd.DataFrame({
    metric: {sen: round(np.mean(summary[sen][metric]), 3) for sen in sentiments}
    for metric in metrics
})

Unnamed: 0,auc,kappa,accuracy
anger,0.783,0.36,0.712
fear,0.8,0.412,0.673
joy,0.811,0.438,0.674
sadness,0.79,0.38,0.64


In [18]:
# Load the v3 metrics and show the mean of every metric per sentiment,
# rounded to 3 decimals. The file handle is closed right after reading.
# NOTE(review): no cell visible here writes summary_v3.pickle — presumably it
# comes from a separate run; confirm before relying on a fresh-kernel re-run.
with open("summary_v3.pickle", "rb") as fh:
    summary = pickle.load(fh)
pd.DataFrame({
    metric: {sen: round(np.mean(summary[sen][metric]), 3) for sen in sentiments}
    for metric in metrics
})

Unnamed: 0,auc,kappa,accuracy
anger,0.769,0.427,0.751
fear,0.782,0.43,0.687
joy,0.814,0.485,0.71
sadness,0.756,0.387,0.651


In [21]:
# Load the v4 metrics and show the mean of every metric per sentiment,
# rounded to 3 decimals. The file handle is closed right after reading.
# NOTE(review): no cell visible here writes summary_v4.pickle — presumably it
# comes from a separate run; confirm before relying on a fresh-kernel re-run.
with open("summary_v4.pickle", "rb") as fh:
    summary = pickle.load(fh)
pd.DataFrame({
    metric: {sen: round(np.mean(summary[sen][metric]), 3) for sen in sentiments}
    for metric in metrics
})

Unnamed: 0,auc,kappa,accuracy
anger,0.769,0.419,0.75
fear,0.788,0.432,0.686
joy,0.815,0.481,0.704
sadness,0.753,0.379,0.647


In [95]:
# Display the baseline scores (defined near the top of the notebook) so they
# can be compared with the per-version summary tables above.
summary_baseline

Unnamed: 0_level_0,auc,kappa,accuracy
sen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
anger,0.62,0.07,0.63
fear,0.67,0.15,0.57
joy,0.65,0.18,0.54
sadness,0.67,0.19,0.55
