#### Giovanni Gamaliel López Padilla
#### Procesamiento de lenguaje natural
#### Tarea 02

In [47]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import GridSearchCV
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import TweetTokenizer
from collections import Counter
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import svm
import numpy as np
import nltk
import os
import re

In [59]:
def join_path(path: str, filename: str) -> str:
    """
    Une la direccion de un archivo con su nombre
    """
    return "{}{}".format(path, filename)


def get_texts_from_file(path_data: str, path_labels: str) -> tuple:
    """
    Obtiene una lista de oraciones a partir de un texto con sus respectivas etiquetas
    """
    # Inicilizacion de las listas
    text = []
    labels = []
    # Apertura de los archivos
    with open(path_data, "r") as f_data, open(path_labels, "r") as f_labels:
        # Recoleccion de las oraciones
        for tweet in f_data:
            text += [tweet]
        # Recoleccion de las etiquedas
        for label in f_labels:
            labels += [label]
    # Etiquedas a enteros
    labels = list(map(int, labels))
    return text, labels


def sort_freqdist(fdist: nltk.FreqDist) -> list:
    """
    Ordena la lista de distribucion de frecuencias de palabras de mayor frecuencia a menor
    """
    aux = [(fdist[key], key) for key in fdist]
    aux.sort()
    aux.reverse()
    return aux


def split_data(data: list, max_words: int) -> list:
    """
    Realiza la separacion de elementos en una lista dado el numero de elementos que se quieren conservar
    """
    return data[:max_words]


def obtain_fdist(data: list, max_words: int) -> list:
    """
    Obtiene la lista de una distribucion de frecuencias de palabras ordenada de mayor a menor a partir de una lista de oraciones
    """
    # Inicializacion del Tokenizador
    tokenizer = TweetTokenizer()
    # Inicializacion de la lista que guardara los tokens
    corpus_palabras = []
    for tweet in data:
        # Creacion y guardado de los tokens
        corpus_palabras += tokenizer.tokenize(tweet)
    # Creacion de la distribucion de frecuencias
    fdist = nltk.FreqDist(corpus_palabras)
    fdist = sort_freqdist(fdist)
    fdist = split_data(fdist, max_words)
    return fdist


def create_dictonary_of_index(fdist: list) -> dict:
    """
    Crea un diccionario con la posición de mayor a menor frecuencia de cada palabra. La llave es la palabra a consultar
    """
    # Inicializacion del diccionario
    index = dict()
    # Inicializacion de la posicion
    i = 0
    for weight, word in fdist:
        index[word] = i
        i += 1
    return index


def build_binary_bow(data: list, fdist: list, index: dict) -> np.array:
    """
    Creacion de la BoW usando pesos binarios
    """
    tokenizer = TweetTokenizer()
    bow = np.zeros((len(data), len(fdist)), dtype=float)
    docs = 0
    for tweet in data:
        fdist_data = nltk.FreqDist(tokenizer.tokenize(tweet))
        for word in fdist_data:
            if word in index.keys():
                bow[docs, index[word]] = 1
        docs += 1
    return bow


def build_binary_bow_with_probabilities(data: list, fdist: list, index: dict,
                                        probability: dict) -> np.array:
    """
    Creacion de la BoW usando pesos binarios
    """
    tokenizer = TweetTokenizer()
    bow = np.zeros((len(data), len(fdist)), dtype=float)
    docs = 0
    for tweet in data:
        fdist_data = nltk.FreqDist(tokenizer.tokenize(tweet))
        for word in fdist_data:
            if word in index.keys():
                bow[docs, index[word]] = 1
                if word in probability:
                    bow[docs, index[word]] = probability[word]
        docs += 1
    return bow


def build_frecuency_bow(data: list, fdist: list, index: dict) -> np.array:
    """
    Creacion de la BoW usando pesos basado en frecuencias
    """
    tokenizer = TweetTokenizer()
    bow = np.zeros((len(data), len(fdist)), dtype=float)
    docs = 0
    for tweet in data:
        fdist_data = nltk.FreqDist(tokenizer.tokenize(tweet))
        for word in fdist_data:
            if word in index.keys():
                bow[docs, index[word]] = tweet.count(word)
        docs += 1
    return bow


def build_frecuency_bow_with_probabilities(data: list, fdist: list,
                                           index: dict,
                                           probability: dict) -> np.array:
    """
    Creacion de la BoW usando pesos basado en frecuencias
    """
    tokenizer = TweetTokenizer()
    bow = np.zeros((len(data), len(fdist)), dtype=float)
    docs = 0
    for tweet in data:
        fdist_data = nltk.FreqDist(tokenizer.tokenize(tweet))
        for word in fdist_data:
            if word in index.keys():
                bow[docs, index[word]] = tweet.count(word)
                if word in probability:
                    bow[docs, index[word]] *= probability[word]
        docs += 1
    return bow


def create_empty_dictionary_of_words_and_documents(words: dict,
                                                   data: list) -> dict:
    """
    Crea un diccionario el cual contendra de forma ordenada el indice de cada palabra y su numero de frecuencias en una coleccion
    """
    freq_word_per_document = dict()
    word_count = dict()
    for i, tweet in enumerate(data):
        word_count[i] = 0
    for word in words:
        freq_word_per_document[word] = word_count
    return freq_word_per_document


def build_tfidf_bow(data: list, fdist: list, index: dict) -> np.array:
    """
    Creacion de la BoW usando pesos basado en frecuencias
    """
    tokenizer = TweetTokenizer()
    # Inicilizacion del bow
    bow = np.zeros((len(data), len(fdist)), dtype=float)
    # Total de oraciones
    n = len(data)
    # Inicializacion del diccionario que contiene la repeticion de cada palabra
    idf_per_word_and_document = create_empty_dictionary_of_words_and_documents(
        index.keys(), data)
    for docs, tweet in enumerate(data):
        # Frecuencias
        fdist_data = nltk.FreqDist(tokenizer.tokenize(tweet))
        for word in fdist_data:
            if word in index.keys():
                # Descriptiva
                tf = tweet.count(word)
                idf_per_word_and_document[word][docs] += 1
                bow[docs, index[word]] = np.log(tf + 1)

    # Discriminativa
    for word in index.keys():
        idf = sum(idf_per_word_and_document[word].values())
        idf = np.log(n / idf)
        for docs, tweet in enumerate(data):
            bow[docs, index[word]] = bow[docs, index[word]] * idf
    return bow


def build_tfidf_bow_with_probabilities(data: list, fdist: list, index: dict,
                                       probability: dict) -> np.array:
    """
    Creacion de la BoW usando pesos basado en frecuencias
    """
    tokenizer = TweetTokenizer()
    # Inicilizacion del bow
    bow = np.zeros((len(data), len(fdist)), dtype=float)
    # Total de oraciones
    n = len(data)
    # Inicializacion del diccionario que contiene la repeticion de cada palabra
    idf_per_word_and_document = create_empty_dictionary_of_words_and_documents(
        index.keys(), data)
    for docs, tweet in enumerate(data):
        # Frecuencias
        fdist_data = nltk.FreqDist(tokenizer.tokenize(tweet))
        for word in fdist_data:
            if word in index.keys():
                # Descriptiva
                tf = tweet.count(word)
                idf_per_word_and_document[word][docs] += 1
                bow[docs, index[word]] = np.log(tf + 1)

    # Discriminativa
    for word in index.keys():
        idf = sum(idf_per_word_and_document[word].values())
        idf = np.log(n / idf)
        for docs, tweet in enumerate(data):
            if word in probability:
                bow[docs, index[word]] *= probability[word]
            bow[docs, index[word]] *= idf
    return bow


def create_model(bow_tr: np.array, labels_tr: np.array) -> GridSearchCV:
    """
    Creacion del modelo para realizar el aprendizaje
    """
    parameters_model = {"C": [0.05, 0.12, 0.25, 0.5, 1, 2, 4]}
    svr = svm.LinearSVC(class_weight="balanced", max_iter=1200000)
    grid = GridSearchCV(estimator=svr,
                        param_grid=parameters_model,
                        n_jobs=8,
                        scoring="f1_macro",
                        cv=5)
    grid.fit(bow_tr, labels_tr)
    return grid


def evaluate_model(bow_val: np.array, labels_val: np.array,
                   grid: GridSearchCV) -> np.array:
    """
    Resultados del modelo con el dataset de validacion
    """
    y_pred = grid.predict(bow_val)
    p, r, f, _ = precision_recall_fscore_support(labels_val,
                                                 y_pred,
                                                 average="macro",
                                                 pos_label=1)
    print(confusion_matrix(labels_val, y_pred))
    print(metrics.classification_report(labels_val, y_pred))
    return y_pred


def normalize(bow: np.array) -> np.array:
    """
    Normalizacion de la BoW de dos dimensiones
    """
    # Copia de la BoW
    bow_norm = bow.copy()
    for i in range(bow.shape[0]):
        # Inicializacion de la norma
        norm = 0
        # Calculo de la norma
        norm += sum([value**2 for value in bow[i]])
        norm = np.sqrt(norm)
        # Estandarizacion de la norma
        bow_norm[i] = np.array([value / norm for value in bow[i]])
    return bow_norm


def build_BoE_from_EmoLex(filename: str) -> dict:
    """
    Creacion de una bolsa de emociones a partir de la base de datos de EmoLex
    """
    with open(filename, "r", encoding='utf-8') as file:
        # Inicializacion de los diccionarios
        words_dict = dict()
        scores = dict()
        # Salto del header
        for i in range(1):
            next(file)
        for line in file:
            # Lectura de la informacion
            data = line.split('\t')
            if data[1] != 'NO TRANSLATION':
                # Obtencion del score
                score = float(data[3])
                word = data[1].lower()
                if not word in words_dict:
                    words_dict[word] = data[2]
                    scores[word] = score
                elif score > scores[word]:
                    words_dict[word] = data[2]
                    scores[word] = score
    return words_dict


def build_BoE_from_SEL(filename: str) -> tuple:
    """
    Creacion de una bolsa de emociones a partir de la base de datos de SEL
    """
    # Apertura del archivo
    with open(filename, "r", encoding='latin-1') as file:
        # Inicializacion de los diccionarios
        words_emotions = dict()
        scores = dict()
        # Salto del header
        for i in range(1):
            next(file)
        # Lectura del archivo
        for line in file:
            # Split de los datos
            data = line.split('\t')
            # Score
            score = float(data[1])
            # Palabra en minusculas
            word = data[0].lower()
            # Si no se ha guardado se guarda
            if not word in words_emotions:
                words_emotions[word] = data[2].replace("\n", "")
                scores[word] = score
            # Si ya existe se comprueba que sea el que contiene mayor score
            elif score > scores[word]:
                words_emotions[word] = data[2].replace("\n", "")
                scores[word] = score
    return words_emotions, scores


def mask_emotion(tokens: list, word_emotions: dict) -> list:
    """
    Enmascara un tweet a partir de las BoE dadas
    """
    token_copy = tokens.copy()
    for i, word in enumerate(tokens):
        if word in word_emotions:
            token_copy[i] = word_emotions[word]
    return token_copy


def obtain_corpus_emotions(document: list, word_emotions: dict) -> list:
    """
    Obtiene todo un corpus de emociones enmascarando cada tweet con la bolsa de emociones dada
    """
    tokenizer = TweetTokenizer()
    # Copia del corpus
    document_copy = document.copy()
    for i, tweet in enumerate(document):
        tweet = tokenizer.tokenize(tweet)
        emotions = mask_emotion(tweet, word_emotions)
        document_copy[i] = " ".join(emotions)
    return document_copy

In [36]:
parameters = {
    "path data": "../Data/",
    "train": {
        "data": "mex_train.txt",
        "labels": "mex_train_labels.txt"
    },
    "validation": {
        "data": "mex_val.txt",
        "labels": "mex_val_labels.txt"
    },
    "EmoLex": "emolex.txt",
    "SEL": "SEL.txt",
    "max words": 5000,
}
# Definicion de las rutas de cada archivo de datos y validacion
path_data_tr = join_path(parameters["path data"], parameters["train"]["data"])
path_label_tr = join_path(parameters["path data"],
                          parameters["train"]["labels"])
path_data_val = join_path(parameters["path data"],
                          parameters["validation"]["data"])
path_label_val = join_path(parameters["path data"],
                           parameters["validation"]["labels"])
# Lectura de las oraciones y etiquetas de los datos de entrenamiento y validacion
data_tr, labels_tr = get_texts_from_file(path_data_tr, path_label_tr)
data_val, labels_val = get_texts_from_file(path_data_val, path_label_val)

In [4]:
# Obtiene la distribucion de palabras ordenadas de mayor a menor con un maximo de 5000 palabras
fdist_tr = obtain_fdist(data_tr,
                        parameters["max words"])
# Creacion del diccionario con la posicion en la distribucion de palabras
word_index = create_dictonary_of_index(fdist_tr)

##### 2.1) Evalue Bow con pesos binarios

In [5]:
# Creacion de la BoW para los datos de entrenamiento usando pesos binarios
binary_bow_tr = build_binary_bow(data_tr,
                                 fdist_tr,
                                 word_index)
# Creacion de la BoW para los datos de validacion usando pesos binarios
binary_bow_val = build_binary_bow(data_val,
                                  fdist_tr,
                                  word_index)
grid = create_model(binary_bow_tr, labels_tr)
y_pred = evaluate_model(binary_bow_val, labels_val, grid)

[[329  68]
 [ 47 172]]
              precision    recall  f1-score   support

           0       0.88      0.83      0.85       397
           1       0.72      0.79      0.75       219

    accuracy                           0.81       616
   macro avg       0.80      0.81      0.80       616
weighted avg       0.82      0.81      0.82       616



##### 2.2) Evalue Bow con pesado frecuencia

In [6]:
# Creacion de la BoW para los datos de entrenamiento usando pesos binarios
freq_bow_tr = build_frecuency_bow(data_tr, fdist_tr, word_index)
# Creacion de la BoW para los datos de validacion usando pesos binarios
freq_bow_val = build_frecuency_bow(data_val, fdist_tr, word_index)
grid = create_model(freq_bow_tr, labels_tr)
y_pred = evaluate_model(freq_bow_val, labels_val, grid)

[[333  64]
 [ 49 170]]
              precision    recall  f1-score   support

           0       0.87      0.84      0.85       397
           1       0.73      0.78      0.75       219

    accuracy                           0.82       616
   macro avg       0.80      0.81      0.80       616
weighted avg       0.82      0.82      0.82       616



##### 2.3) Evalue Bow con pesado tfidf

In [7]:
# Creacion de la BoW para los datos de entrenamiento usando pesos binarios
tfidf_bow_tr = build_tfidf_bow(data_tr, fdist_tr, word_index)
# Creacion de la BoW para los datos de validacion usando pesos binarios
tfidf_bow_val = build_tfidf_bow(data_val, fdist_tr, word_index)
grid = create_model(tfidf_bow_tr, labels_tr)
y_pred = evaluate_model(tfidf_bow_val, labels_val, grid)

[[329  68]
 [ 57 162]]
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       397
           1       0.70      0.74      0.72       219

    accuracy                           0.80       616
   macro avg       0.78      0.78      0.78       616
weighted avg       0.80      0.80      0.80       616



##### 2.4) Evalue Bow con pesos binarios normalizado

In [8]:
# Normalizacion de la BoW binaria
binary_bow_tr_norm = normalize(binary_bow_tr)
binary_bow_val_norm = normalize(binary_bow_val)
grid = create_model(binary_bow_tr_norm, labels_tr)
y_pred = evaluate_model(binary_bow_val_norm, labels_val, grid)

[[322  75]
 [ 49 170]]
              precision    recall  f1-score   support

           0       0.87      0.81      0.84       397
           1       0.69      0.78      0.73       219

    accuracy                           0.80       616
   macro avg       0.78      0.79      0.79       616
weighted avg       0.81      0.80      0.80       616



##### 2.5) Evalue Bow con pesado frecuencia normalizado

In [9]:
# Normalizacion de la BoW basada en frecuencias
freq_bow_tr_norm = normalize(freq_bow_tr)
freq_bow_val_norm = normalize(freq_bow_val)
grid = create_model(freq_bow_tr_norm, labels_tr)
y_pred = evaluate_model(freq_bow_val_norm, labels_val, grid)

[[321  76]
 [ 49 170]]
              precision    recall  f1-score   support

           0       0.87      0.81      0.84       397
           1       0.69      0.78      0.73       219

    accuracy                           0.80       616
   macro avg       0.78      0.79      0.78       616
weighted avg       0.80      0.80      0.80       616



##### 2.6) Evalue Bow con pesado tfidf normalizado

In [10]:
# Normalizacion de la BoW basada en tfidf
tfidf_bow_tr_norm = normalize(tfidf_bow_tr)
tfidf_bow_val_norm = normalize(tfidf_bow_val)
grid = create_model(tfidf_bow_tr_norm, labels_tr)
y_pred = evaluate_model(tfidf_bow_val_norm, labels_val, grid)

[[325  72]
 [ 50 169]]
              precision    recall  f1-score   support

           0       0.87      0.82      0.84       397
           1       0.70      0.77      0.73       219

    accuracy                           0.80       616
   macro avg       0.78      0.80      0.79       616
weighted avg       0.81      0.80      0.80       616



##### 2.7) Ponga una tabla comparativa a modo de resumen con las seis entradas anteriores

<style type="text/css">
.tg  {border-collapse:collapse;border-spacing:0;}
.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  overflow:hidden;padding:10px 5px;word-break:normal;}
.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
.tg .tg-0lax{text-align:left;vertical-align:top}
</style>
<table class="tg">
<thead>
  <tr>
    <th class="tg-0lax" rowspan="2">Método</th>
    <th class="tg-0lax" colspan="2">Precision</th>
    <th class="tg-0lax" colspan="2">Recall</th>
    <th class="tg-0lax" colspan="2">F1-score</th>
  </tr>
  <tr>
    <th class="tg-0lax">0</th>
    <th class="tg-0lax">1</th>
    <th class="tg-0lax">0</th>
    <th class="tg-0lax">1</th>
    <th class="tg-0lax">0</th>
    <th class="tg-0lax">1</th>
  </tr>
</thead>
<tbody>
  <tr>
    <td class="tg-0lax">Binario</td>
    <td class="tg-0lax">0.88</td>
    <td class="tg-0lax">0.72</td>
    <td class="tg-0lax">0.83</td>
    <td class="tg-0lax">0.79</td>
    <td class="tg-0lax">0.85</td>
    <td class="tg-0lax">0.75</td>
  </tr>
  <tr>
    <td class="tg-0lax">Binario <br>normalizado</td>
    <td class="tg-0lax">0.87</td>
    <td class="tg-0lax">0.69</td>
    <td class="tg-0lax">0.81</td>
    <td class="tg-0lax">0.78</td>
    <td class="tg-0lax">0.84</td>
    <td class="tg-0lax">0.73</td>
  </tr>
  <tr>
    <td class="tg-0lax">Frecuencias</td>
    <td class="tg-0lax">0.87</td>
    <td class="tg-0lax">0.73</td>
    <td class="tg-0lax">0.84</td>
    <td class="tg-0lax">0.78</td>
    <td class="tg-0lax">0.85</td>
    <td class="tg-0lax">0.75</td>
  </tr>
  <tr>
    <td class="tg-0lax">Frecuencias<br>normalizado</td>
    <td class="tg-0lax">0.87</td>
    <td class="tg-0lax">0.69</td>
    <td class="tg-0lax">0.81</td>
    <td class="tg-0lax">0.78</td>
    <td class="tg-0lax">0.84</td>
    <td class="tg-0lax">0.73</td>
  </tr>
  <tr>
    <td class="tg-0lax">TFIDF</td>
    <td class="tg-0lax">0.85</td>
    <td class="tg-0lax">0.70</td>
    <td class="tg-0lax">0.83</td>
    <td class="tg-0lax">0.74</td>
    <td class="tg-0lax">0.84</td>
    <td class="tg-0lax">0.72</td>
  </tr>
  <tr>
    <td class="tg-0lax">TFIDF<br>normalizado</td>
    <td class="tg-0lax">0.87</td>
    <td class="tg-0lax">0.70</td>
    <td class="tg-0lax">0.80</td>
    <td class="tg-0lax">0.80</td>
    <td class="tg-0lax">0.84</td>
    <td class="tg-0lax">0.73</td>
  </tr>
</tbody>
</table>

#### 2.8) De las configuraciones anteriores elija la mejor y evalúela con más y menos términos (e.g., 1000 y 7000). Ponga una tabla dónde compare las tres configuraciones.

##### 1000 terminos

In [11]:
parameters["max words"] = 1000
# Obtiene la distribucion de palabras ordenadas de mayor a menor con un maximo de 5000 palabras
fdist_tr = obtain_fdist(data_tr,
                        parameters["max words"])
# Creacion del diccionario con la posicion en la distribucion de palabras
word_index = create_dictonary_of_index(fdist_tr)

###### Binario

In [12]:
# Creacion de la BoW para los datos de entrenamiento usando pesos binarios
binary_bow_tr = build_frecuency_bow(data_tr, fdist_tr, word_index)
# Creacion de la BoW para los datos de validacion usando pesos binarios
binary_bow_val = build_frecuency_bow(data_val, fdist_tr, word_index)
grid = create_model(binary_bow_tr, labels_tr)
y_pred = evaluate_model(binary_bow_val, labels_val, grid)

[[331  66]
 [ 49 170]]
              precision    recall  f1-score   support

           0       0.87      0.83      0.85       397
           1       0.72      0.78      0.75       219

    accuracy                           0.81       616
   macro avg       0.80      0.81      0.80       616
weighted avg       0.82      0.81      0.81       616



###### Frecuencias

In [13]:
# Creacion de la BoW para los datos de entrenamiento usando pesos binarios
freq_bow_tr = build_frecuency_bow(data_tr, fdist_tr, word_index)
# Creacion de la BoW para los datos de validacion usando pesos binarios
freq_bow_val = build_frecuency_bow(data_val, fdist_tr, word_index)
grid = create_model(freq_bow_tr, labels_tr)
y_pred = evaluate_model(freq_bow_val, labels_val, grid)

[[331  66]
 [ 49 170]]
              precision    recall  f1-score   support

           0       0.87      0.83      0.85       397
           1       0.72      0.78      0.75       219

    accuracy                           0.81       616
   macro avg       0.80      0.81      0.80       616
weighted avg       0.82      0.81      0.81       616



##### 7000 terminos

In [14]:
parameters["max words"] = 7000
# Obtiene la distribucion de palabras ordenadas de mayor a menor con un maximo de 5000 palabras
fdist_tr = obtain_fdist(data_tr,
                        parameters["max words"])
# Creacion del diccionario con la posicion en la distribucion de palabras
word_index = create_dictonary_of_index(fdist_tr)

###### Binario

In [15]:
# Creacion de la BoW para los datos de entrenamiento usando pesos binarios
binary_bow_tr = build_frecuency_bow(data_tr, fdist_tr, word_index)
# Creacion de la BoW para los datos de validacion usando pesos binarios
binary_bow_val = build_frecuency_bow(data_val, fdist_tr, word_index)
grid = create_model(binary_bow_tr, labels_tr)
y_pred = evaluate_model(binary_bow_val, labels_val, grid)

[[334  63]
 [ 50 169]]
              precision    recall  f1-score   support

           0       0.87      0.84      0.86       397
           1       0.73      0.77      0.75       219

    accuracy                           0.82       616
   macro avg       0.80      0.81      0.80       616
weighted avg       0.82      0.82      0.82       616



###### Frecuencias

In [16]:
# Creacion de la BoW para los datos de entrenamiento usando pesos binarios
freq_bow_tr = build_frecuency_bow(data_tr, fdist_tr, word_index)
# Creacion de la BoW para los datos de validacion usando pesos binarios
freq_bow_val = build_frecuency_bow(data_val, fdist_tr, word_index)
grid = create_model(freq_bow_tr, labels_tr)
y_pred = evaluate_model(freq_bow_val, labels_val, grid)

[[334  63]
 [ 50 169]]
              precision    recall  f1-score   support

           0       0.87      0.84      0.86       397
           1       0.73      0.77      0.75       219

    accuracy                           0.82       616
   macro avg       0.80      0.81      0.80       616
weighted avg       0.82      0.82      0.82       616



<style type="text/css">
.tg  {border-collapse:collapse;border-spacing:0;}
.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  overflow:hidden;padding:10px 5px;word-break:normal;}
.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
.tg .tg-c3ow{border-color:inherit;text-align:center;vertical-align:top}
.tg .tg-0pky{border-color:inherit;text-align:left;vertical-align:top}
</style>
<table class="tg">
<thead>
  <tr>
    <th class="tg-c3ow" rowspan="2">Método</th>
    <th class="tg-0pky" rowspan="2">Términos</th>
    <th class="tg-c3ow" colspan="2">Precision</th>
    <th class="tg-c3ow" colspan="2">Recall</th>
    <th class="tg-c3ow" colspan="2">F1-score</th>
  </tr>
  <tr>
    <th class="tg-0pky">0</th>
    <th class="tg-0pky">1</th>
    <th class="tg-0pky">0</th>
    <th class="tg-0pky">1</th>
    <th class="tg-0pky">0</th>
    <th class="tg-0pky">1</th>
  </tr>
</thead>
<tbody>
  <tr>
    <td class="tg-0pky" rowspan="3">Binario</td>
    <td class="tg-0pky">1000</td>
    <td class="tg-0pky">0.87</td>
    <td class="tg-0pky">0.72</td>
    <td class="tg-0pky">0.83</td>
    <td class="tg-0pky">0.78</td>
    <td class="tg-0pky">0.85</td>
    <td class="tg-0pky">0.75</td>
  </tr>
  <tr>
    <td class="tg-0pky">5000</td>
    <td class="tg-0pky">0.88</td>
    <td class="tg-0pky">0.72</td>
    <td class="tg-0pky">0.83</td>
    <td class="tg-0pky">0.79</td>
    <td class="tg-0pky">0.85</td>
    <td class="tg-0pky">0.75</td>
  </tr>
  <tr>
    <td class="tg-0pky">7000</td>
    <td class="tg-0pky">0.87</td>
    <td class="tg-0pky">0.73</td>
    <td class="tg-0pky">0.84</td>
    <td class="tg-0pky">0.77</td>
    <td class="tg-0pky">0.86</td>
    <td class="tg-0pky">0.75</td>
  </tr>
  <tr>
    <td class="tg-0pky" rowspan="3">Frecuencias</td>
    <td class="tg-0pky">1000</td>
    <td class="tg-0pky">0.87</td>
    <td class="tg-0pky">0.72</td>
    <td class="tg-0pky">0.83</td>
    <td class="tg-0pky">0.78</td>
    <td class="tg-0pky">0.85</td>
    <td class="tg-0pky">0.75</td>
  </tr>
  <tr>
    <td class="tg-0pky">5000</td>
    <td class="tg-0pky">0.87</td>
    <td class="tg-0pky">0.73</td>
    <td class="tg-0pky">0.84</td>
    <td class="tg-0pky">0.78</td>
    <td class="tg-0pky">0.85</td>
    <td class="tg-0pky">0.75</td>
  </tr>
  <tr>
    <td class="tg-0pky">7000</td>
    <td class="tg-0pky">0.87</td>
    <td class="tg-0pky">0.73</td>
    <td class="tg-0pky">0.84</td>
    <td class="tg-0pky">0.77</td>
    <td class="tg-0pky">0.86</td>
    <td class="tg-0pky">0.75</td>
  </tr>
</tbody>
</table>

##### 2.9) Utilice el recurso léxico del Consejo Nacional de Investigación de Canadá llamado "EmoLex" (https://www.saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm) para construir una "Bolsa de Emociones" de los Tweets de agresividad (Debe usar EmoLex en Español). Para esto, una estrategia sencilla sería enmascarar cada palabra con su emoción, y después construir la Bolsa de Emociones (BoE).

In [25]:
parameters["max words"] = 5000
emolex_path = join_path(parameters["path data"], parameters["EmoLex"])
words_emotions = build_BoE_from_EmoLex(emolex_path)
data_tr_emotions = obtain_corpus_emotions(data_tr, words_emotions)
data_val_emotions = obtain_corpus_emotions(data_val, words_emotions)
# Obtiene la distribucion de palabras ordenadas de mayor a menor con un maximo de 5000 palabras
fdist_tr = obtain_fdist(data_tr_emotions, parameters["max words"])
# Creacion del diccionario con la posicion en la distribucion de palabras
word_index = create_dictonary_of_index(fdist_tr)

##### 2.10) Evalúa tú BoE clasificando con SVM. Ponga una tabla comparativa a modo de resumen con los tres pesados, normalize cada uno si lo cree conveniente.

###### Binario

In [27]:
# Creacion de la BoW para los datos de entrenamiento usando pesos binarios
binary_bow_tr = build_binary_bow(data_tr_emotions, fdist_tr, word_index)
# Creacion de la BoW para los datos de validacion usando pesos binarios
binary_bow_val = build_binary_bow(data_val_emotions, fdist_tr, word_index)
grid = create_model(binary_bow_tr, labels_tr)
y_pred = evaluate_model(binary_bow_val, labels_val, grid)

[[330  67]
 [ 56 163]]
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       397
           1       0.71      0.74      0.73       219

    accuracy                           0.80       616
   macro avg       0.78      0.79      0.78       616
weighted avg       0.80      0.80      0.80       616



###### Frecuencias

In [30]:
# Creacion de la BoW para los datos de entrenamiento usando pesos binarios
freq_bow_tr = build_frecuency_bow(data_tr_emotions, fdist_tr, word_index)
# Creacion de la BoW para los datos de validacion usando pesos binarios
freq_bow_val = build_frecuency_bow(data_val_emotions, fdist_tr, word_index)
grid = create_model(freq_bow_tr, labels_tr)
y_pred = evaluate_model(freq_bow_val, labels_val, grid)

[[333  64]
 [ 59 160]]
              precision    recall  f1-score   support

           0       0.85      0.84      0.84       397
           1       0.71      0.73      0.72       219

    accuracy                           0.80       616
   macro avg       0.78      0.78      0.78       616
weighted avg       0.80      0.80      0.80       616



###### TFIDF

In [31]:
# Creacion de la BoW para los datos de entrenamiento usando pesos binarios
tfidf_bow_tr = build_tfidf_bow(data_tr, fdist_tr, word_index)
# Creacion de la BoW para los datos de validacion usando pesos binarios
tfidf_bow_val = build_tfidf_bow(data_val, fdist_tr, word_index)
grid = create_model(tfidf_bow_tr, labels_tr)
y_pred = evaluate_model(tfidf_bow_val, labels_val, grid)

[[328  69]
 [ 59 160]]
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       397
           1       0.70      0.73      0.71       219

    accuracy                           0.79       616
   macro avg       0.77      0.78      0.78       616
weighted avg       0.79      0.79      0.79       616



###### Binario normalizado

In [32]:
# Normalizacion de la BoW binaria
binary_bow_tr_norm = normalize(binary_bow_tr)
binary_bow_val_norm = normalize(binary_bow_val)
grid = create_model(binary_bow_tr_norm, labels_tr)
y_pred = evaluate_model(binary_bow_val_norm, labels_val, grid)

[[323  74]
 [ 57 162]]
              precision    recall  f1-score   support

           0       0.85      0.81      0.83       397
           1       0.69      0.74      0.71       219

    accuracy                           0.79       616
   macro avg       0.77      0.78      0.77       616
weighted avg       0.79      0.79      0.79       616



###### Frecuencias

In [33]:
# Normalizacion de la BoW basada en frecuencias
freq_bow_tr_norm = normalize(freq_bow_tr)
freq_bow_val_norm = normalize(freq_bow_val)
grid = create_model(freq_bow_tr_norm, labels_tr)
y_pred = evaluate_model(freq_bow_val_norm, labels_val, grid)

[[320  77]
 [ 52 167]]
              precision    recall  f1-score   support

           0       0.86      0.81      0.83       397
           1       0.68      0.76      0.72       219

    accuracy                           0.79       616
   macro avg       0.77      0.78      0.78       616
weighted avg       0.80      0.79      0.79       616



###### TFIDF

In [34]:
# Normalizacion de la BoW basada en tfidf
tfidf_bow_tr_norm = normalize(tfidf_bow_tr)
tfidf_bow_val_norm = normalize(tfidf_bow_val)
grid = create_model(tfidf_bow_tr_norm, labels_tr)
y_pred = evaluate_model(tfidf_bow_val_norm, labels_val, grid)

[[328  69]
 [ 52 167]]
              precision    recall  f1-score   support

           0       0.86      0.83      0.84       397
           1       0.71      0.76      0.73       219

    accuracy                           0.80       616
   macro avg       0.79      0.79      0.79       616
weighted avg       0.81      0.80      0.81       616



###### Cuadro comparativo

<style type="text/css">
.tg  {border-collapse:collapse;border-spacing:0;}
.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  overflow:hidden;padding:10px 5px;word-break:normal;}
.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
.tg .tg-0lax{text-align:left;vertical-align:top}
</style>
<table class="tg">
<thead>
  <tr>
    <th class="tg-0lax" rowspan="2">Método</th>
    <th class="tg-0lax" colspan="2">Precision</th>
    <th class="tg-0lax" colspan="2">Recall</th>
    <th class="tg-0lax" colspan="2">F1-score</th>
  </tr>
  <tr>
    <th class="tg-0lax">0</th>
    <th class="tg-0lax">1</th>
    <th class="tg-0lax">0</th>
    <th class="tg-0lax">1</th>
    <th class="tg-0lax">0</th>
    <th class="tg-0lax">1</th>
  </tr>
</thead>
<tbody>
  <tr>
    <td class="tg-0lax">Binario</td>
    <td class="tg-0lax">0.85</td>
    <td class="tg-0lax">0.71</td>
    <td class="tg-0lax">0.83</td>
    <td class="tg-0lax">0.74</td>
    <td class="tg-0lax">0.84</td>
    <td class="tg-0lax">0.73</td>
  </tr>
  <tr>
    <td class="tg-0lax">Binario <br>normalizado</td>
    <td class="tg-0lax">0.85</td>
    <td class="tg-0lax">0.69</td>
    <td class="tg-0lax">0.81</td>
    <td class="tg-0lax">0.74</td>
    <td class="tg-0lax">0.83</td>
    <td class="tg-0lax">0.71</td>
  </tr>
  <tr>
    <td class="tg-0lax">Frecuencias</td>
    <td class="tg-0lax">0.85</td>
    <td class="tg-0lax">0.71</td>
    <td class="tg-0lax">0.84</td>
    <td class="tg-0lax">0.73</td>
    <td class="tg-0lax">0.84</td>
    <td class="tg-0lax">0.72</td>
  </tr>
  <tr>
    <td class="tg-0lax">Frecuencias<br>normalizado</td>
    <td class="tg-0lax">0.86</td>
    <td class="tg-0lax">0.68</td>
    <td class="tg-0lax">0.81</td>
    <td class="tg-0lax">0.76</td>
    <td class="tg-0lax">0.83</td>
    <td class="tg-0lax">0.72</td>
  </tr>
  <tr>
    <td class="tg-0lax">TFIDF</td>
    <td class="tg-0lax">0.85</td>
    <td class="tg-0lax">0.70</td>
    <td class="tg-0lax">0.83</td>
    <td class="tg-0lax">0.73</td>
    <td class="tg-0lax">0.84</td>
    <td class="tg-0lax">0.71</td>
  </tr>
  <tr>
    <td class="tg-0lax">TFIDF<br>normalizado</td>
    <td class="tg-0lax">0.86</td>
    <td class="tg-0lax">0.71</td>
    <td class="tg-0lax">0.83</td>
    <td class="tg-0lax">0.76</td>
    <td class="tg-0lax">0.84</td>
    <td class="tg-0lax">0.73</td>
  </tr>
</tbody>
</table>

#### 3.0) Utilice el recurso léxico llamado "Spanish Emotion Lexicon (SEL)" del Dr. Grigori Sidorov, profesor del Centro de Investigación en Computación (CIC) del Instituto Politécnico Nacional (http://www.cic.ipn.mx/∼sidorov/), para enmascarar cada palabra con su emo- ción, y después construir la Bolsa de Emociones con algún pesado (e.g., binario, tf, tfidf). Proponga alguna estrategia para incorporar el "valor" del "Probability Factor of Affective use" en su representación vectorial del documento

In [53]:
sel_path = join_path(parameters["path data"], parameters["SEL"])
words_emotions,scores = build_BoE_from_SEL(sel_path)
data_tr_emotions = obtain_corpus_emotions(data_tr, words_emotions)
data_val_emotions = obtain_corpus_emotions(data_val, words_emotions)
# Obtiene la distribucion de palabras ordenadas de mayor a menor con un maximo de 5000 palabras
fdist_tr = obtain_fdist(data_tr_emotions, parameters["max words"])
# Creacion del diccionario con la posicion en la distribucion de palabras
word_index = create_dictonary_of_index(fdist_tr)

In [60]:
# Creacion de la BoW para los datos de entrenamiento usando pesos binarios
binary_bow_tr = build_binary_bow_with_probabilities(data_tr_emotions, fdist_tr,
                                                    word_index, scores)
# Creacion de la BoW para los datos de validacion usando pesos binarios
binary_bow_val = build_binary_bow_with_probabilities(data_val_emotions,
                                                     fdist_tr, word_index,
                                                     scores)
grid = create_model(binary_bow_tr, labels_tr)
y_pred = evaluate_model(binary_bow_val, labels_val, grid)

[[330  67]
 [ 49 170]]
              precision    recall  f1-score   support

           0       0.87      0.83      0.85       397
           1       0.72      0.78      0.75       219

    accuracy                           0.81       616
   macro avg       0.79      0.80      0.80       616
weighted avg       0.82      0.81      0.81       616



In [61]:
# Creacion de la BoW para los datos de entrenamiento usando pesos binarios
freq_bow_tr = build_frecuency_bow_with_probabilities(data_tr_emotions,
                                                     fdist_tr, word_index,
                                                     scores)
# Creacion de la BoW para los datos de validacion usando pesos binarios
freq_bow_val = build_frecuency_bow_with_probabilities(data_val_emotions,
                                                      fdist_tr, word_index,
                                                      scores)
grid = create_model(freq_bow_tr, labels_tr)
y_pred = evaluate_model(freq_bow_val, labels_val, grid)

[[332  65]
 [ 50 169]]
              precision    recall  f1-score   support

           0       0.87      0.84      0.85       397
           1       0.72      0.77      0.75       219

    accuracy                           0.81       616
   macro avg       0.80      0.80      0.80       616
weighted avg       0.82      0.81      0.81       616



In [62]:
# Creacion de la BoW para los datos de entrenamiento usando pesos binarios
tfidf_bow_tr = build_tfidf_bow_with_probabilities(data_tr, fdist_tr,
                                                  word_index, scores)
# Creacion de la BoW para los datos de validacion usando pesos binarios
tfidf_bow_val = build_tfidf_bow_with_probabilities(data_val, fdist_tr,
                                                   word_index, scores)
grid = create_model(tfidf_bow_tr, labels_tr)
y_pred = evaluate_model(tfidf_bow_val, labels_val, grid)

[[329  68]
 [ 59 160]]
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       397
           1       0.70      0.73      0.72       219

    accuracy                           0.79       616
   macro avg       0.77      0.78      0.78       616
weighted avg       0.80      0.79      0.79       616



In [63]:
# Normalizacion de la BoW binaria
binary_bow_tr_norm = normalize(binary_bow_tr)
binary_bow_val_norm = normalize(binary_bow_val)
grid = create_model(binary_bow_tr_norm, labels_tr)
y_pred = evaluate_model(binary_bow_val_norm, labels_val, grid)

[[326  71]
 [ 49 170]]
              precision    recall  f1-score   support

           0       0.87      0.82      0.84       397
           1       0.71      0.78      0.74       219

    accuracy                           0.81       616
   macro avg       0.79      0.80      0.79       616
weighted avg       0.81      0.81      0.81       616



In [64]:
# Normalizacion de la BoW basada en frecuencias
freq_bow_tr_norm = normalize(freq_bow_tr)
freq_bow_val_norm = normalize(freq_bow_val)
grid = create_model(freq_bow_tr_norm, labels_tr)
y_pred = evaluate_model(freq_bow_val_norm, labels_val, grid)

[[324  73]
 [ 52 167]]
              precision    recall  f1-score   support

           0       0.86      0.82      0.84       397
           1       0.70      0.76      0.73       219

    accuracy                           0.80       616
   macro avg       0.78      0.79      0.78       616
weighted avg       0.80      0.80      0.80       616



In [65]:
# Normalizacion de la BoW basada en tfidf
tfidf_bow_tr_norm = normalize(tfidf_bow_tr)
tfidf_bow_val_norm = normalize(tfidf_bow_val)
grid = create_model(tfidf_bow_tr_norm, labels_tr)
y_pred = evaluate_model(tfidf_bow_val_norm, labels_val, grid)

[[320  77]
 [ 46 173]]
              precision    recall  f1-score   support

           0       0.87      0.81      0.84       397
           1       0.69      0.79      0.74       219

    accuracy                           0.80       616
   macro avg       0.78      0.80      0.79       616
weighted avg       0.81      0.80      0.80       616



<style type="text/css">
.tg  {border-collapse:collapse;border-spacing:0;}
.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  overflow:hidden;padding:10px 5px;word-break:normal;}
.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
.tg .tg-0pky{border-color:inherit;text-align:left;vertical-align:top}
</style>
<table class="tg">
<thead>
  <tr>
    <th class="tg-0pky" rowspan="2">Método</th>
    <th class="tg-0pky" colspan="2">Precision</th>
    <th class="tg-0pky" colspan="2">Recall</th>
    <th class="tg-0pky" colspan="2">F1-score</th>
  </tr>
  <tr>
    <th class="tg-0pky">0</th>
    <th class="tg-0pky">1</th>
    <th class="tg-0pky">0</th>
    <th class="tg-0pky">1</th>
    <th class="tg-0pky">0</th>
    <th class="tg-0pky">1</th>
  </tr>
</thead>
<tbody>
  <tr>
    <td class="tg-0pky">Binario</td>
    <td class="tg-0pky">0.87</td>
    <td class="tg-0pky">0.72</td>
    <td class="tg-0pky">0.83</td>
    <td class="tg-0pky">0.78</td>
    <td class="tg-0pky">0.85</td>
    <td class="tg-0pky">0.74</td>
  </tr>
  <tr>
    <td class="tg-0pky">Binario <br>normalizado</td>
    <td class="tg-0pky">0.87</td>
    <td class="tg-0pky">0.71</td>
    <td class="tg-0pky">0.82</td>
    <td class="tg-0pky">0.78</td>
    <td class="tg-0pky">0.84</td>
    <td class="tg-0pky">0.74</td>
  </tr>
  <tr>
    <td class="tg-0pky">Frecuencias</td>
    <td class="tg-0pky">0.87</td>
    <td class="tg-0pky">0.72</td>
    <td class="tg-0pky">0.84</td>
    <td class="tg-0pky">0.77</td>
    <td class="tg-0pky">0.85</td>
    <td class="tg-0pky">0.75</td>
  </tr>
  <tr>
    <td class="tg-0pky">Frecuencias<br>normalizado</td>
    <td class="tg-0pky">0.86</td>
    <td class="tg-0pky">0.70</td>
    <td class="tg-0pky">0.82</td>
    <td class="tg-0pky">0.76</td>
    <td class="tg-0pky">0.84</td>
    <td class="tg-0pky">0.73</td>
  </tr>
  <tr>
    <td class="tg-0pky">TFIDF</td>
    <td class="tg-0pky">0.85</td>
    <td class="tg-0pky">0.70</td>
    <td class="tg-0pky">0.83</td>
    <td class="tg-0pky">0.73</td>
    <td class="tg-0pky">0.84</td>
    <td class="tg-0pky">0.72<br></td>
  </tr>
  <tr>
    <td class="tg-0pky">TFIDF<br>normalizado</td>
    <td class="tg-0pky">0.87</td>
    <td class="tg-0pky">0.69</td>
    <td class="tg-0pky">0.81</td>
    <td class="tg-0pky">0.79</td>
    <td class="tg-0pky">0.84</td>
    <td class="tg-0pky">0.74</td>
  </tr>
</tbody>
</table>