**NLP**

*CC6205-1 - Otoño 2022*

Autor: Felipe Urrutia Vargas

In [2]:
# !pip install --upgrade pandas

import pandas as pd
import numpy as np
# pd.set_option("max_rows", None)
import pickle
from string import punctuation
import re

from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt
import seaborn as sns
# from astropy.visualization import hist
sns.set_theme(style="whitegrid")
plt.rc('axes', titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rcParams.update({'font.size': 16})
plt.rcParams['axes.titlesize'] = 16
plt.rcParams["figure.figsize"] = (10, 6)
plt.rcParams.update({'lines.markeredgewidth': 1})
plt.rcParams.update({'errorbar.capsize': 2})
import random

import plotly.express as px
import time
import datetime

from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_recall_fscore_support

import os
import shutil

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix, cohen_kappa_score, classification_report, accuracy_score, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
# Emotion categories and intensity levels used throughout the notebook.
# The position of each intensity in this list is significant: later cells
# index into it with an argmax over probability columns.
sentiments = ["anger", "fear", "joy", "sadness"]
intensities = ["low", "medium", "high"]

In [4]:
# Hard-coded baseline scores, one row per sentiment (row order follows
# `sentiments`), indexed by a column-less index named "sen".
summary_baseline = pd.DataFrame(
    {
        "auc": [0.62, 0.67, 0.65, 0.67],
        "kappa": [0.07, 0.15, 0.18, 0.19],
        "accuracy": [0.63, 0.57, 0.54, 0.55],
    },
    index=pd.Index(sentiments, name="sen"),
)

In [5]:
# Load the pickled training table. Later cells read its "sen" and "int"
# columns and use its index to select rows of the representation tables.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# that come from a trusted source.
with open("df_train.pickle", "rb") as train_file:  # `with` closes the handle
    df_train = pickle.load(train_file)

In [10]:
def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards.

    NOTE(security): pickle.load can execute arbitrary code; only use this
    on files that come from a trusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Pickled feature representations, versions 1 to 4 (presumably DataFrames
# indexed like df_train; only v4's `.columns` is used in this notebook).
df_representation = _load_pickle("df_representation_v1.pickle")
df_representation_v2 = _load_pickle("df_representation_v2.pickle")
df_representation_v3 = _load_pickle("df_representation_v3.pickle")
df_representation_v4 = _load_pickle("df_representation_v4.pickle")

In [13]:
# Group the v4 column names by attribute family: a column belongs to a
# family when its name contains "<family><&>".
partition_attrib = {
    kind: [column for column in df_representation_v4.columns if f"{kind}<&>" in column]
    for kind in ("retro", "punct", "emoji", "linguistics")
}

In [14]:
# Number of columns in each attribute family.
("summary type attrib", {family: len(columns) for family, columns in partition_attrib.items()})

('summary type attrib',
 {'retro': 13, 'punct': 7, 'emoji': 190, 'linguistics': 11540})

In [15]:
def auc_score(test_set, predicted_set):
    """Support-weighted mean of the one-vs-rest ROC AUC of the three intensities.

    Args:
        test_set: true labels; compared element-wise against the strings
            'low', 'medium' and 'high'.
        predicted_set: iterable of per-sample score rows where position 0
            is the 'low' score, 1 the 'medium' score and 2 the 'high' score.

    Returns:
        The per-class AUCs averaged with each class weighted by the number
        of samples of that class in ``test_set``.
    """
    weighted_total = 0.0
    support_total = 0.0
    # Column position in `predicted_set` matches the position in this tuple.
    for column, intensity in enumerate(('low', 'medium', 'high')):
        is_class = np.where(test_set == intensity, 1.0, 0.0)
        class_scores = np.array([row[column] for row in predicted_set])
        support = is_class.sum()
        weighted_total += support * roc_auc_score(is_class, class_scores)
        support_total += support
    return weighted_total / support_total


def evaulate(predicted_probabilities, y_test, labels, dataset_name):
    """Print a confusion matrix, classification report and scores for one dataset.

    Args:
        predicted_probabilities: 2-D array-like with one row per sample and
            one column per class, columns ordered as in ``labels``.
        y_test: true labels, expected to take the values 'low', 'medium'
            and 'high'.
        labels: class labels in the column order of
            ``predicted_probabilities``. Important: pass the class array
            learned by the classifier, which is commonly NOT in the order
            ['low', 'medium', 'high']. May be a NumPy array or any sequence.
        dataset_name: name shown in the printed confusion-matrix header.

    Returns:
        ``np.array([auc, kappa, accuracy])``, each rounded to 3 decimals.

    Raises:
        ValueError: if 'low', 'medium' or 'high' is missing from ``labels``.
    """
    # Accept plain lists as well as ndarrays: the original `.tolist()` call
    # and the column fancy-indexing below both required NumPy arrays.
    labels = np.asarray(labels).tolist()
    predicted_probabilities = np.asarray(predicted_probabilities)

    # Most probable class per sample, decoded with the classifier's own
    # label order.
    predicted_labels = [
        labels[np.argmax(item)] for item in predicted_probabilities
    ]
    print('Confusion Matrix for {}:\n'.format(dataset_name))
    print(
        confusion_matrix(y_test,
                         predicted_labels,
                         labels=['low', 'medium', 'high']))

    print('\nClassification Report:\n')
    print(
        classification_report(y_test,
                              predicted_labels,
                              labels=['low', 'medium', 'high']))
    # Reorder the probability columns to (low, medium, high), the order
    # auc_score expects.
    predicted_probabilities = predicted_probabilities[:, [
        labels.index('low'),
        labels.index('medium'),
        labels.index('high')
    ]]
    auc = round(auc_score(y_test, predicted_probabilities), 3)
    print("Scores:\n\nAUC: ", auc, end='\t')
    kappa = round(cohen_kappa_score(y_test, predicted_labels), 3)
    print("Kappa:", kappa, end='\t')
    accuracy = round(accuracy_score(y_test, predicted_labels), 3)
    print("Accuracy:", accuracy)
    print('------------------------------------------------------\n')
    return np.array([auc, kappa, accuracy])

In [29]:
# Repeated hold-out evaluation: for each feature-set version and each
# sentiment, fit a standardised, class-balanced SVM on N_fits random 80/20
# splits and record AUC, kappa and accuracy of every fit. The per-version
# results are pickled to "summary_v{num_v}_1.pickle".
metrics = "auc kappa accuracy".split()
N_fits = 100
# Fit i always uses seed BASE_SEED + i. This makes the run reproducible,
# guarantees that the N_fits splits are distinct (the previous seed was
# drawn from [1, n_rows), so repeated splits were very likely), and gives
# every version the same splits for a given sentiment.
BASE_SEED = 2022
# Encode labels with the same ordering that is used to decode predictions,
# so predict_proba column k corresponds to intensities[k].
intensity_codes = {label: code for code, label in enumerate(intensities)}

for num_v in [1, 3, 4]:
    print(num_v)
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # files that come from a trusted source.
    with open(f"df_representation_v{num_v}_1.pickle", "rb") as rep_file:
        df_rep = pickle.load(rep_file)
    summary = {sen: {l: [] for l in metrics} for sen in sentiments}
    for sen in sentiments:
        # Everything here is identical for all fits of this sentiment, so
        # it is loaded and sliced once instead of once per fit.
        indexs = df_train[df_train["sen"] == sen].index
        with open(f"cols_selected_{sen}_v{num_v}_1.pickle", "rb") as cols_file:
            cols_selected_sen = pickle.load(cols_file)

        X = df_rep.loc[indexs][cols_selected_sen]
        y = df_train.loc[X.index]["int"]

        for fit_idx in range(N_fits):
            seed = BASE_SEED + fit_idx
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=seed)
            y_train = y_train.map(intensity_codes)
            # probability=True fits an internal cross-validated calibration
            # that shuffles the data, so the SVC is seeded as well.
            clf = make_pipeline(
                StandardScaler(),
                SVC(gamma='auto', class_weight="balanced", probability=True,
                    random_state=seed))
            clf.fit(X_train, y_train)
            y_pred = clf.predict_proba(X_test)
            # Columns follow the encoded classes 0, 1, 2, i.e. the order of
            # `intensities` (assumes all three classes occur in y_train).
            predicted_labels = [
                intensities[np.argmax(item)] for item in y_pred
            ]
            auc = round(auc_score(y_test, y_pred), 3)
            print(f"[{sen}] Scores:\n AUC: ", auc, end='\t')
            kappa = round(cohen_kappa_score(y_test, predicted_labels), 3)
            print("Kappa:", kappa, end='\t')
            accuracy = round(accuracy_score(y_test, predicted_labels), 3)
            print("Accuracy:", accuracy, end="\n\n")

            summary[sen]["auc"].append(auc)
            summary[sen]["kappa"].append(kappa)
            summary[sen]["accuracy"].append(accuracy)
    # `with` guarantees the results are flushed and the handle is closed.
    with open(f"summary_v{num_v}_1.pickle", "wb") as summary_file:
        pickle.dump(summary, summary_file)

1
[anger] Scores:
 AUC:  0.731	Kappa: 0.42	Accuracy: 0.746

[anger] Scores:
 AUC:  0.782	Kappa: 0.496	Accuracy: 0.794

[anger] Scores:
 AUC:  0.77	Kappa: 0.475	Accuracy: 0.757

[anger] Scores:
 AUC:  0.763	Kappa: 0.379	Accuracy: 0.751

[anger] Scores:
 AUC:  0.725	Kappa: 0.367	Accuracy: 0.714

[anger] Scores:
 AUC:  0.757	Kappa: 0.413	Accuracy: 0.757

[anger] Scores:
 AUC:  0.787	Kappa: 0.434	Accuracy: 0.709

[anger] Scores:
 AUC:  0.797	Kappa: 0.408	Accuracy: 0.735

[anger] Scores:
 AUC:  0.796	Kappa: 0.433	Accuracy: 0.751

[anger] Scores:
 AUC:  0.739	Kappa: 0.473	Accuracy: 0.751

[anger] Scores:
 AUC:  0.774	Kappa: 0.498	Accuracy: 0.778

[anger] Scores:
 AUC:  0.767	Kappa: 0.498	Accuracy: 0.783

[anger] Scores:
 AUC:  0.78	Kappa: 0.473	Accuracy: 0.751

[anger] Scores:
 AUC:  0.801	Kappa: 0.503	Accuracy: 0.778

[anger] Scores:
 AUC:  0.721	Kappa: 0.328	Accuracy: 0.725

[anger] Scores:
 AUC:  0.754	Kappa: 0.467	Accuracy: 0.772

[anger] Scores:
 AUC:  0.78	Kappa: 0.428	Accuracy: 0.762


[fear] Scores:
 AUC:  0.781	Kappa: 0.446	Accuracy: 0.706

[fear] Scores:
 AUC:  0.8	Kappa: 0.462	Accuracy: 0.698

[fear] Scores:
 AUC:  0.765	Kappa: 0.408	Accuracy: 0.667

[fear] Scores:
 AUC:  0.813	Kappa: 0.472	Accuracy: 0.71

[fear] Scores:
 AUC:  0.781	Kappa: 0.422	Accuracy: 0.687

[fear] Scores:
 AUC:  0.799	Kappa: 0.471	Accuracy: 0.734

[fear] Scores:
 AUC:  0.785	Kappa: 0.454	Accuracy: 0.714

[fear] Scores:
 AUC:  0.773	Kappa: 0.466	Accuracy: 0.69

[fear] Scores:
 AUC:  0.784	Kappa: 0.486	Accuracy: 0.75

[fear] Scores:
 AUC:  0.729	Kappa: 0.376	Accuracy: 0.675

[fear] Scores:
 AUC:  0.807	Kappa: 0.403	Accuracy: 0.675

[fear] Scores:
 AUC:  0.787	Kappa: 0.409	Accuracy: 0.694

[fear] Scores:
 AUC:  0.824	Kappa: 0.538	Accuracy: 0.758

[fear] Scores:
 AUC:  0.796	Kappa: 0.431	Accuracy: 0.706

[fear] Scores:
 AUC:  0.817	Kappa: 0.502	Accuracy: 0.73

[fear] Scores:
 AUC:  0.786	Kappa: 0.412	Accuracy: 0.702

[fear] Scores:
 AUC:  0.792	Kappa: 0.45	Accuracy: 0.69

[fear] Scores:
 AUC:  

[joy] Scores:
 AUC:  0.862	Kappa: 0.606	Accuracy: 0.79

[joy] Scores:
 AUC:  0.871	Kappa: 0.579	Accuracy: 0.751

[joy] Scores:
 AUC:  0.793	Kappa: 0.487	Accuracy: 0.718

[joy] Scores:
 AUC:  0.813	Kappa: 0.466	Accuracy: 0.691

[joy] Scores:
 AUC:  0.867	Kappa: 0.529	Accuracy: 0.74

[joy] Scores:
 AUC:  0.865	Kappa: 0.597	Accuracy: 0.779

[joy] Scores:
 AUC:  0.853	Kappa: 0.539	Accuracy: 0.768

[joy] Scores:
 AUC:  0.819	Kappa: 0.448	Accuracy: 0.68

[joy] Scores:
 AUC:  0.793	Kappa: 0.476	Accuracy: 0.702

[joy] Scores:
 AUC:  0.863	Kappa: 0.577	Accuracy: 0.768

[joy] Scores:
 AUC:  0.803	Kappa: 0.527	Accuracy: 0.729

[joy] Scores:
 AUC:  0.829	Kappa: 0.52	Accuracy: 0.724

[joy] Scores:
 AUC:  0.864	Kappa: 0.559	Accuracy: 0.746

[joy] Scores:
 AUC:  0.815	Kappa: 0.503	Accuracy: 0.702

[joy] Scores:
 AUC:  0.843	Kappa: 0.595	Accuracy: 0.757

[sadness] Scores:
 AUC:  0.789	Kappa: 0.488	Accuracy: 0.686

[sadness] Scores:
 AUC:  0.805	Kappa: 0.474	Accuracy: 0.698

[sadness] Scores:
 AUC:  0.

[anger] Scores:
 AUC:  0.806	Kappa: 0.559	Accuracy: 0.81

[anger] Scores:
 AUC:  0.738	Kappa: 0.366	Accuracy: 0.762

[anger] Scores:
 AUC:  0.787	Kappa: 0.552	Accuracy: 0.804

[anger] Scores:
 AUC:  0.795	Kappa: 0.447	Accuracy: 0.714

[anger] Scores:
 AUC:  0.758	Kappa: 0.441	Accuracy: 0.788

[anger] Scores:
 AUC:  0.816	Kappa: 0.548	Accuracy: 0.788

[anger] Scores:
 AUC:  0.765	Kappa: 0.485	Accuracy: 0.767

[anger] Scores:
 AUC:  0.823	Kappa: 0.576	Accuracy: 0.804

[anger] Scores:
 AUC:  0.742	Kappa: 0.38	Accuracy: 0.757

[anger] Scores:
 AUC:  0.786	Kappa: 0.472	Accuracy: 0.751

[anger] Scores:
 AUC:  0.766	Kappa: 0.422	Accuracy: 0.751

[anger] Scores:
 AUC:  0.77	Kappa: 0.416	Accuracy: 0.735

[anger] Scores:
 AUC:  0.756	Kappa: 0.424	Accuracy: 0.762

[anger] Scores:
 AUC:  0.823	Kappa: 0.532	Accuracy: 0.778

[anger] Scores:
 AUC:  0.799	Kappa: 0.45	Accuracy: 0.762

[anger] Scores:
 AUC:  0.813	Kappa: 0.44	Accuracy: 0.757

[anger] Scores:
 AUC:  0.781	Kappa: 0.466	Accuracy: 0.741

[a

[fear] Scores:
 AUC:  0.807	Kappa: 0.503	Accuracy: 0.73

[fear] Scores:
 AUC:  0.787	Kappa: 0.427	Accuracy: 0.706

[fear] Scores:
 AUC:  0.792	Kappa: 0.45	Accuracy: 0.69

[fear] Scores:
 AUC:  0.813	Kappa: 0.497	Accuracy: 0.718

[fear] Scores:
 AUC:  0.81	Kappa: 0.475	Accuracy: 0.706

[fear] Scores:
 AUC:  0.792	Kappa: 0.492	Accuracy: 0.718

[fear] Scores:
 AUC:  0.768	Kappa: 0.512	Accuracy: 0.754

[fear] Scores:
 AUC:  0.786	Kappa: 0.451	Accuracy: 0.718

[fear] Scores:
 AUC:  0.786	Kappa: 0.46	Accuracy: 0.718

[fear] Scores:
 AUC:  0.834	Kappa: 0.542	Accuracy: 0.758

[fear] Scores:
 AUC:  0.807	Kappa: 0.508	Accuracy: 0.742

[fear] Scores:
 AUC:  0.806	Kappa: 0.485	Accuracy: 0.738

[fear] Scores:
 AUC:  0.824	Kappa: 0.522	Accuracy: 0.75

[fear] Scores:
 AUC:  0.811	Kappa: 0.467	Accuracy: 0.726

[fear] Scores:
 AUC:  0.802	Kappa: 0.504	Accuracy: 0.746

[fear] Scores:
 AUC:  0.781	Kappa: 0.388	Accuracy: 0.663

[fear] Scores:
 AUC:  0.798	Kappa: 0.476	Accuracy: 0.73

[fear] Scores:
 AUC: 

[sadness] Scores:
 AUC:  0.771	Kappa: 0.464	Accuracy: 0.686

[sadness] Scores:
 AUC:  0.786	Kappa: 0.506	Accuracy: 0.721

[sadness] Scores:
 AUC:  0.767	Kappa: 0.413	Accuracy: 0.657

[sadness] Scores:
 AUC:  0.804	Kappa: 0.503	Accuracy: 0.698

[sadness] Scores:
 AUC:  0.837	Kappa: 0.561	Accuracy: 0.762

[sadness] Scores:
 AUC:  0.799	Kappa: 0.453	Accuracy: 0.68

[sadness] Scores:
 AUC:  0.742	Kappa: 0.446	Accuracy: 0.709

[sadness] Scores:
 AUC:  0.739	Kappa: 0.409	Accuracy: 0.657

[sadness] Scores:
 AUC:  0.763	Kappa: 0.477	Accuracy: 0.698

[sadness] Scores:
 AUC:  0.735	Kappa: 0.44	Accuracy: 0.686

[sadness] Scores:
 AUC:  0.727	Kappa: 0.348	Accuracy: 0.605

[sadness] Scores:
 AUC:  0.814	Kappa: 0.58	Accuracy: 0.767

[sadness] Scores:
 AUC:  0.764	Kappa: 0.396	Accuracy: 0.651

[sadness] Scores:
 AUC:  0.727	Kappa: 0.452	Accuracy: 0.703

[sadness] Scores:
 AUC:  0.768	Kappa: 0.514	Accuracy: 0.733

[sadness] Scores:
 AUC:  0.754	Kappa: 0.456	Accuracy: 0.68

[sadness] Scores:
 AUC:  0.7

[anger] Scores:
 AUC:  0.788	Kappa: 0.485	Accuracy: 0.762

[anger] Scores:
 AUC:  0.793	Kappa: 0.534	Accuracy: 0.82

[anger] Scores:
 AUC:  0.793	Kappa: 0.509	Accuracy: 0.794

[anger] Scores:
 AUC:  0.801	Kappa: 0.441	Accuracy: 0.746

[anger] Scores:
 AUC:  0.754	Kappa: 0.511	Accuracy: 0.815

[anger] Scores:
 AUC:  0.77	Kappa: 0.426	Accuracy: 0.72

[anger] Scores:
 AUC:  0.824	Kappa: 0.518	Accuracy: 0.778

[anger] Scores:
 AUC:  0.765	Kappa: 0.446	Accuracy: 0.746

[anger] Scores:
 AUC:  0.795	Kappa: 0.473	Accuracy: 0.741

[anger] Scores:
 AUC:  0.776	Kappa: 0.528	Accuracy: 0.804

[anger] Scores:
 AUC:  0.807	Kappa: 0.48	Accuracy: 0.804

[anger] Scores:
 AUC:  0.795	Kappa: 0.421	Accuracy: 0.73

[anger] Scores:
 AUC:  0.813	Kappa: 0.555	Accuracy: 0.783

[anger] Scores:
 AUC:  0.812	Kappa: 0.45	Accuracy: 0.799

[anger] Scores:
 AUC:  0.747	Kappa: 0.409	Accuracy: 0.751

[anger] Scores:
 AUC:  0.81	Kappa: 0.491	Accuracy: 0.772

[anger] Scores:
 AUC:  0.776	Kappa: 0.528	Accuracy: 0.788

[ang

[fear] Scores:
 AUC:  0.763	Kappa: 0.372	Accuracy: 0.683

[fear] Scores:
 AUC:  0.787	Kappa: 0.413	Accuracy: 0.679

[fear] Scores:
 AUC:  0.808	Kappa: 0.506	Accuracy: 0.754

[fear] Scores:
 AUC:  0.792	Kappa: 0.482	Accuracy: 0.726

[fear] Scores:
 AUC:  0.802	Kappa: 0.486	Accuracy: 0.722

[fear] Scores:
 AUC:  0.787	Kappa: 0.424	Accuracy: 0.663

[fear] Scores:
 AUC:  0.74	Kappa: 0.447	Accuracy: 0.714

[fear] Scores:
 AUC:  0.778	Kappa: 0.493	Accuracy: 0.742

[fear] Scores:
 AUC:  0.742	Kappa: 0.458	Accuracy: 0.722

[fear] Scores:
 AUC:  0.781	Kappa: 0.428	Accuracy: 0.702

[fear] Scores:
 AUC:  0.815	Kappa: 0.514	Accuracy: 0.734

[fear] Scores:
 AUC:  0.795	Kappa: 0.519	Accuracy: 0.75

[fear] Scores:
 AUC:  0.778	Kappa: 0.452	Accuracy: 0.694

[fear] Scores:
 AUC:  0.802	Kappa: 0.496	Accuracy: 0.734

[joy] Scores:
 AUC:  0.853	Kappa: 0.513	Accuracy: 0.707

[joy] Scores:
 AUC:  0.866	Kappa: 0.607	Accuracy: 0.779

[joy] Scores:
 AUC:  0.841	Kappa: 0.523	Accuracy: 0.746

[joy] Scores:
 AUC:

[sadness] Scores:
 AUC:  0.753	Kappa: 0.402	Accuracy: 0.663

[sadness] Scores:
 AUC:  0.801	Kappa: 0.504	Accuracy: 0.709

[sadness] Scores:
 AUC:  0.777	Kappa: 0.469	Accuracy: 0.698

[sadness] Scores:
 AUC:  0.778	Kappa: 0.439	Accuracy: 0.68

[sadness] Scores:
 AUC:  0.754	Kappa: 0.378	Accuracy: 0.651

[sadness] Scores:
 AUC:  0.77	Kappa: 0.436	Accuracy: 0.698

[sadness] Scores:
 AUC:  0.781	Kappa: 0.383	Accuracy: 0.674

[sadness] Scores:
 AUC:  0.768	Kappa: 0.399	Accuracy: 0.651

[sadness] Scores:
 AUC:  0.75	Kappa: 0.409	Accuracy: 0.651

[sadness] Scores:
 AUC:  0.762	Kappa: 0.464	Accuracy: 0.692

[sadness] Scores:
 AUC:  0.767	Kappa: 0.5	Accuracy: 0.703

[sadness] Scores:
 AUC:  0.773	Kappa: 0.449	Accuracy: 0.68

[sadness] Scores:
 AUC:  0.799	Kappa: 0.5	Accuracy: 0.703

[sadness] Scores:
 AUC:  0.797	Kappa: 0.479	Accuracy: 0.715

[sadness] Scores:
 AUC:  0.794	Kappa: 0.528	Accuracy: 0.727

[sadness] Scores:
 AUC:  0.811	Kappa: 0.517	Accuracy: 0.709

[sadness] Scores:
 AUC:  0.796	K

In [27]:
# pickle.dump(summary, open("summary_v1.pickle", "wb"))

In [32]:
# Mean of the per-fit scores stored in summary_v4_1.pickle, as a table with
# one row per sentiment and one column per metric (rounded to 3 decimals).
# NOTE(security): only unpickle files that come from a trusted source.
with open("summary_v4_1.pickle", "rb") as summary_file:  # closes the handle
    summary = pickle.load(summary_file)
pd.DataFrame({l: {sen: round(np.mean(summary[sen][l]), 3) for sen in sentiments} for l in metrics})

Unnamed: 0,auc,kappa,accuracy
anger,0.789,0.466,0.762
fear,0.792,0.473,0.721
joy,0.836,0.522,0.73
sadness,0.77,0.448,0.683


In [25]:
# Mean score per sentiment (rows) and metric (columns), rounded to 3
# decimals, from summary_v2.pickle. The code that wrote this file is not
# in this notebook; it is assumed to have the summary[sen][metric] -> list
# of scores layout indexed below.
# NOTE(security): only unpickle files that come from a trusted source.
with open("summary_v2.pickle", "rb") as summary_file:  # closes the handle
    summary = pickle.load(summary_file)
pd.DataFrame({l: {sen: round(np.mean(summary[sen][l]), 3) for sen in sentiments} for l in metrics})

Unnamed: 0,auc,kappa,accuracy
anger,0.783,0.36,0.712
fear,0.8,0.412,0.673
joy,0.811,0.438,0.674
sadness,0.79,0.38,0.64


In [18]:
# Mean score per sentiment (rows) and metric (columns), rounded to 3
# decimals, from summary_v3.pickle. The code that wrote this file is not
# in this notebook; it is assumed to have the summary[sen][metric] -> list
# of scores layout indexed below.
# NOTE(security): only unpickle files that come from a trusted source.
with open("summary_v3.pickle", "rb") as summary_file:  # closes the handle
    summary = pickle.load(summary_file)
pd.DataFrame({l: {sen: round(np.mean(summary[sen][l]), 3) for sen in sentiments} for l in metrics})

Unnamed: 0,auc,kappa,accuracy
anger,0.769,0.427,0.751
fear,0.782,0.43,0.687
joy,0.814,0.485,0.71
sadness,0.756,0.387,0.651


In [21]:
# Mean score per sentiment (rows) and metric (columns), rounded to 3
# decimals, from summary_v4.pickle. The code that wrote this file is not
# in this notebook; it is assumed to have the summary[sen][metric] -> list
# of scores layout indexed below.
# NOTE(security): only unpickle files that come from a trusted source.
with open("summary_v4.pickle", "rb") as summary_file:  # closes the handle
    summary = pickle.load(summary_file)
pd.DataFrame({l: {sen: round(np.mean(summary[sen][l]), 3) for sen in sentiments} for l in metrics})

Unnamed: 0,auc,kappa,accuracy
anger,0.769,0.419,0.75
fear,0.788,0.432,0.686
joy,0.815,0.481,0.704
sadness,0.753,0.379,0.647


In [95]:
# Display the hard-coded baseline scores (auc, kappa, accuracy per
# sentiment), presumably for comparison with the averaged tables above.
summary_baseline

Unnamed: 0_level_0,auc,kappa,accuracy
sen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
anger,0.62,0.07,0.63
fear,0.67,0.15,0.57
joy,0.65,0.18,0.54
sadness,0.67,0.19,0.55


In [39]:
# Two-sample toy dataset: a 2x2 identity feature matrix (columns 0 and 1)
# plus a "label" column, used below as a smoke test for the SVM pipeline.
df_x = pd.DataFrame([[1, 0], [0, 1]]).assign(label=[1, 0])
df_x

Unnamed: 0,0,1,label
0,1,0,1
1,0,1,0


In [45]:
# Fit the same scaler + balanced RBF-SVM pipeline on the toy dataset.
clf_x = make_pipeline(
    StandardScaler(),
    SVC(kernel="rbf", gamma="auto", class_weight="balanced", probability=True),
)
clf_x.fit(X=df_x.drop(columns="label"), y=df_x["label"])

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc',
                 SVC(class_weight='balanced', gamma='auto', probability=True))])

In [47]:
# Predict on the toy training rows, using only feature columns 0 and 1.
clf_x.predict(df_x.loc[:, [0, 1]])

array([1, 0], dtype=int64)