**NLP**

*CC6205-1 - Otoño 2022*

Autor: Felipe Urrutia Vargas

In [21]:
# !pip install --upgrade pandas

import pandas as pd
import numpy as np
# pd.set_option("max_rows", None)
import pickle
from string import punctuation
import re

from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt
import seaborn as sns
# from astropy.visualization import hist
sns.set_theme(style="whitegrid")
plt.rc('axes', titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rcParams.update({'font.size': 16})
plt.rcParams['axes.titlesize'] = 16
plt.rcParams["figure.figsize"] = (10, 6)
plt.rcParams.update({'lines.markeredgewidth': 1})
plt.rcParams.update({'errorbar.capsize': 2})
import random

import plotly.express as px
import time
import datetime

from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_recall_fscore_support

import os
import shutil

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix, cohen_kappa_score, classification_report, accuracy_score, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
# Emotion categories and the ordered intensity levels shared by the cells below.
sentiments = ["anger", "fear", "joy", "sadness"]
intensities = ["low", "medium", "high"]

In [65]:
# Hard-coded baseline scores: one row per sentiment (index named "sen"),
# one column per metric.
summary_baseline = pd.DataFrame(
    {
        "auc": [0.62, 0.67, 0.65, 0.67],
        "kappa": [0.07, 0.15, 0.18, 0.19],
        "accuracy": [0.63, 0.57, 0.54, 0.55],
    },
    index=pd.Index(sentiments, name="sen"),
)

In [4]:
# Load the pickled training frame. The context manager closes the file handle,
# which a bare `pickle.load(open(...))` leaves open until garbage collection.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("df_train.pickle", "rb") as train_file:
    df_train = pickle.load(train_file)

In [5]:
# Load the pickled feature representation. The context manager closes the file
# handle, which a bare `pickle.load(open(...))` leaves open until garbage
# collection.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("df_representation_v1.pickle", "rb") as representation_file:
    df_representation = pickle.load(representation_file)

In [6]:
# Group the representation's column names by attribute family: a column is
# assigned to a family when its name contains the "<family><&>" marker.
partition_attrib = {
    family: [
        column for column in df_representation.columns
        if f"{family}<&>" in column
    ]
    for family in ("retro", "punct", "emoji", "linguistics")
}

In [8]:
"summary type attrib", {k: len(v) for k, v in partition_attrib.items()}

('summary type attrib',
 {'retro': 13, 'punct': 7, 'emoji': 190, 'linguistics': 11540})

In [42]:
def auc_score(test_set, predicted_set):
    """Return the support-weighted mean of the per-class one-vs-rest ROC AUC.

    Parameters
    ----------
    test_set : array-like
        True labels; compared element-wise against 'low', 'medium' and 'high'.
    predicted_set : iterable of sequences
        Per-sample scores whose positions 0, 1 and 2 are read as the scores
        for 'low', 'medium' and 'high' respectively.

    Returns
    -------
    float
        Sum over the three classes of (number of true samples of the class *
        AUC of the class), divided by the total number of samples that carry
        one of the three labels.
    """
    class_support = {}
    class_auc = {}
    # Classes are processed as high -> medium -> low.
    for column, name in ((2, 'high'), (1, 'medium'), (0, 'low')):
        scores = np.array([row[column] for row in predicted_set])
        # 1.0 where the true label is this class, 0.0 elsewhere.
        is_class = np.where(test_set == name, 1.0, 0.0)
        class_support[name] = is_class.sum()
        class_auc[name] = roc_auc_score(is_class, scores)

    weighted_total = (class_support['low'] * class_auc['low'] +
                      class_support['medium'] * class_auc['medium'] +
                      class_support['high'] * class_auc['high'])
    n_labelled = (class_support['low'] + class_support['medium'] +
                  class_support['high'])
    return weighted_total / n_labelled


def evaulate(predicted_probabilities, y_test, labels, dataset_name):
    """Print an evaluation report for one dataset and return its summary scores.

    Prints the confusion matrix and the classification report (both with the
    classes ordered low, medium, high), followed by the AUC, Cohen's kappa and
    accuracy.

    Parameters
    ----------
    predicted_probabilities : numpy.ndarray
        Per-sample class scores, one column per entry of ``labels``. Must
        support NumPy-style ``[:, [i, j, k]]`` column selection.
    y_test : array-like
        True labels.
    labels : sequence of str
        Class names in the column order of ``predicted_probabilities``. Any
        sequence is accepted (ndarray, list or tuple) and it must contain
        'low', 'medium' and 'high'.
    dataset_name : str
        Name shown in the printed confusion-matrix header.

    Returns
    -------
    numpy.ndarray
        ``[auc, kappa, accuracy]``, each rounded to three decimals.

    Raises
    ------
    ValueError
        If 'low', 'medium' or 'high' is missing from ``labels``.
    """
    # Important: when turning the probability rows into class names, use the
    # class order learned by the classifier (which commonly differs from
    # ['low', 'medium', 'high']).
    # list() accepts arrays, lists and tuples alike; the previous
    # `labels.tolist()` failed with AttributeError for anything but an ndarray.
    class_order = list(labels)
    predicted_labels = [
        class_order[np.argmax(item)] for item in predicted_probabilities
    ]
    print('Confusion Matrix for {}:\n'.format(dataset_name))
    print(
        confusion_matrix(y_test,
                         predicted_labels,
                         labels=['low', 'medium', 'high']))

    print('\nClassification Report:\n')
    print(
        classification_report(y_test,
                              predicted_labels,
                              labels=['low', 'medium', 'high']))
    # Reorder the probability columns to low, medium, high, the order that
    # auc_score reads them in.
    ordered_probabilities = predicted_probabilities[:, [
        class_order.index('low'),
        class_order.index('medium'),
        class_order.index('high')
    ]]
    auc = round(auc_score(y_test, ordered_probabilities), 3)
    print("Scores:\n\nAUC: ", auc, end='\t')
    kappa = round(cohen_kappa_score(y_test, predicted_labels), 3)
    print("Kappa:", kappa, end='\t')
    accuracy = round(accuracy_score(y_test, predicted_labels), 3)
    print("Accuracy:", accuracy)
    print('------------------------------------------------------\n')
    return np.array([auc, kappa, accuracy])

In [84]:
# Repeated hold-out evaluation: for every sentiment, fit an SVM on N_fits random
# 80/20 splits and collect AUC, kappa and accuracy of each fit in `summary`.
metrics = "auc kappa accuracy".split()
summary = {sen: {l: [] for l in metrics} for sen in sentiments}
N_fits = 100
SEED = 42
# Integer codes for the training target. Because scikit-learn sorts the classes
# it learns, these codes make the predict_proba columns come out ordered
# low, medium, high -- the order `intensities` and auc_score rely on (assuming
# all three intensities occur in the training split).
intensity_codes = {"low": 0, "medium": 1, "high": 2}

# Seed NumPy's global generator once so a fresh run reproduces the same splits.
# SVC(probability=True) is left with random_state=None, so its internal
# shuffling draws from this same global state and becomes reproducible too.
np.random.seed(SEED)

for sen in sentiments:
    # Everything up to X / y depends only on the sentiment, so it is prepared
    # once per sentiment instead of once per fit.
    indexs = df_train[df_train["sen"] == sen].index
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(f"cols_selected_{sen}_v1.pickle", "rb") as cols_file:
        cols_selected_sen = pickle.load(cols_file)

    X = df_representation.loc[indexs][cols_selected_sen]
    y = df_train.loc[X.index]["int"]

    for _ in range(N_fits):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2,
            random_state=np.random.randint(1, X.shape[0]))
        # .map returns an integer Series directly; .replace on a string column
        # depends on pandas' deprecated implicit downcasting to do so.
        y_train = y_train.map(intensity_codes)
        clf = make_pipeline(
            StandardScaler(),
            SVC(gamma='auto', class_weight="balanced", probability=True))
        clf.fit(X_train, y_train)
        # (The former `clf.score(...)` call was dropped: its result was unused.)
        y_pred = clf.predict_proba(X_test)
        # Column k of y_pred corresponds to intensities[k] (see intensity_codes).
        predicted_labels = [
            intensities[np.argmax(item)] for item in y_pred
        ]
        auc = round(auc_score(y_test, y_pred), 3)
        print(f"[{sen}] Scores:\n AUC: ", auc, end='\t')
        kappa = round(cohen_kappa_score(y_test, predicted_labels), 3)
        print("Kappa:", kappa, end='\t')
        accuracy = round(accuracy_score(y_test, predicted_labels), 3)
        print("Accuracy:", accuracy, end="\n\n")

        summary[sen]["auc"].append(auc)
        summary[sen]["kappa"].append(kappa)
        summary[sen]["accuracy"].append(accuracy)

[anger] Scores:
 AUC:  0.738	Kappa: 0.316	Accuracy: 0.698

[anger] Scores:
 AUC:  0.769	Kappa: 0.254	Accuracy: 0.693

[anger] Scores:
 AUC:  0.721	Kappa: 0.249	Accuracy: 0.646

[anger] Scores:
 AUC:  0.706	Kappa: 0.335	Accuracy: 0.73

[anger] Scores:
 AUC:  0.686	Kappa: 0.3	Accuracy: 0.677

[anger] Scores:
 AUC:  0.77	Kappa: 0.343	Accuracy: 0.73

[anger] Scores:
 AUC:  0.728	Kappa: 0.302	Accuracy: 0.677

[anger] Scores:
 AUC:  0.757	Kappa: 0.29	Accuracy: 0.677

[anger] Scores:
 AUC:  0.757	Kappa: 0.304	Accuracy: 0.698

[anger] Scores:
 AUC:  0.73	Kappa: 0.213	Accuracy: 0.624

[anger] Scores:
 AUC:  0.691	Kappa: 0.25	Accuracy: 0.688

[anger] Scores:
 AUC:  0.75	Kappa: 0.255	Accuracy: 0.677

[anger] Scores:
 AUC:  0.703	Kappa: 0.33	Accuracy: 0.698

[anger] Scores:
 AUC:  0.706	Kappa: 0.233	Accuracy: 0.635

[anger] Scores:
 AUC:  0.765	Kappa: 0.391	Accuracy: 0.704

[anger] Scores:
 AUC:  0.758	Kappa: 0.36	Accuracy: 0.688

[anger] Scores:
 AUC:  0.743	Kappa: 0.315	Accuracy: 0.704

[anger] 

[fear] Scores:
 AUC:  0.778	Kappa: 0.388	Accuracy: 0.671

[fear] Scores:
 AUC:  0.731	Kappa: 0.303	Accuracy: 0.623

[fear] Scores:
 AUC:  0.798	Kappa: 0.44	Accuracy: 0.694

[fear] Scores:
 AUC:  0.753	Kappa: 0.415	Accuracy: 0.687

[fear] Scores:
 AUC:  0.773	Kappa: 0.416	Accuracy: 0.675

[fear] Scores:
 AUC:  0.759	Kappa: 0.377	Accuracy: 0.651

[fear] Scores:
 AUC:  0.762	Kappa: 0.403	Accuracy: 0.651

[fear] Scores:
 AUC:  0.774	Kappa: 0.405	Accuracy: 0.671

[fear] Scores:
 AUC:  0.773	Kappa: 0.399	Accuracy: 0.663

[fear] Scores:
 AUC:  0.811	Kappa: 0.433	Accuracy: 0.69

[fear] Scores:
 AUC:  0.759	Kappa: 0.389	Accuracy: 0.667

[fear] Scores:
 AUC:  0.7	Kappa: 0.335	Accuracy: 0.667

[fear] Scores:
 AUC:  0.774	Kappa: 0.402	Accuracy: 0.663

[fear] Scores:
 AUC:  0.75	Kappa: 0.365	Accuracy: 0.643

[fear] Scores:
 AUC:  0.739	Kappa: 0.321	Accuracy: 0.639

[fear] Scores:
 AUC:  0.748	Kappa: 0.343	Accuracy: 0.639

[fear] Scores:
 AUC:  0.734	Kappa: 0.417	Accuracy: 0.706

[fear] Scores:
 AUC

[joy] Scores:
 AUC:  0.787	Kappa: 0.336	Accuracy: 0.652

[joy] Scores:
 AUC:  0.791	Kappa: 0.437	Accuracy: 0.707

[joy] Scores:
 AUC:  0.797	Kappa: 0.473	Accuracy: 0.691

[joy] Scores:
 AUC:  0.751	Kappa: 0.35	Accuracy: 0.613

[joy] Scores:
 AUC:  0.797	Kappa: 0.353	Accuracy: 0.63

[joy] Scores:
 AUC:  0.799	Kappa: 0.384	Accuracy: 0.641

[joy] Scores:
 AUC:  0.75	Kappa: 0.311	Accuracy: 0.619

[joy] Scores:
 AUC:  0.793	Kappa: 0.398	Accuracy: 0.663

[joy] Scores:
 AUC:  0.795	Kappa: 0.452	Accuracy: 0.68

[joy] Scores:
 AUC:  0.812	Kappa: 0.439	Accuracy: 0.685

[joy] Scores:
 AUC:  0.788	Kappa: 0.459	Accuracy: 0.685

[joy] Scores:
 AUC:  0.759	Kappa: 0.361	Accuracy: 0.652

[joy] Scores:
 AUC:  0.799	Kappa: 0.438	Accuracy: 0.663

[joy] Scores:
 AUC:  0.761	Kappa: 0.471	Accuracy: 0.691

[joy] Scores:
 AUC:  0.752	Kappa: 0.339	Accuracy: 0.63

[sadness] Scores:
 AUC:  0.7	Kappa: 0.296	Accuracy: 0.587

[sadness] Scores:
 AUC:  0.714	Kappa: 0.259	Accuracy: 0.564

[sadness] Scores:
 AUC:  0.74	

In [94]:
# Optional persistence of the raw per-fit scores (kept disabled):
# pickle.dump(summary, open("summary_v1.pickle", "wb"))

# Mean of each metric over all fits, rounded to three decimals:
# one row per sentiment, one column per metric.
pd.DataFrame({
    metric: {
        sentiment: round(np.mean(summary[sentiment][metric]), 3)
        for sentiment in sentiments
    }
    for metric in metrics
})

Unnamed: 0,auc,kappa,accuracy
anger,0.726,0.281,0.678
fear,0.751,0.365,0.648
joy,0.788,0.41,0.658
sadness,0.718,0.292,0.592


In [95]:
# Display the hard-coded baseline scores (presumably for comparison with the
# averaged scores of the fitted models).
summary_baseline

Unnamed: 0_level_0,auc,kappa,accuracy
sen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
anger,0.62,0.07,0.63
fear,0.67,0.15,0.57
joy,0.65,0.18,0.54
sadness,0.67,0.19,0.55
