# Tarea 2 : Minería de Texto Básica


**Esteban Reyes Saldaña**

Procesamiento del Lenguaje Natural

In [None]:
# Monto mi drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Funciones Generales

In [None]:
# Librerías necesarias
import os
import re
import nltk 
import numpy as np
from nltk.tokenize import TweetTokenizer
 
tokenizer = TweetTokenizer()

In [1]:
def get_texts_from_file (path_corpus, path_truth) :
  '''
  Lee archivos para entrenamiento
  Parámetros
  -----------
    path_corpus : Dirección del dataset
    path_truth  : Dirección de etiquetas del dataset
  Regresa
  -----------
    tr_txt      : Dataset
    tr_y        : Etiquetas del dataset
  '''
 
  tr_txt = []
  tr_y   = []
 
  with open (path_corpus, "r") as f_corpus, open(path_truth, "r", encoding='utf-8') as f_truth:
    for twitt in f_corpus:
      tr_txt += [twitt]
    for label in f_truth:
      tr_y   += [label]
    
  # Elimina salto de linea en etiqueta
  tr_y = list(map(int, tr_y))
  
  return tr_txt, tr_y

In [2]:
# Ruta para leer desde drive
dir = '/content/drive/MyDrive/CIMAT/Segundo Semestre/Lenguaje Natural/Práctica 3'

In [None]:
# Conjunto de prueba
tr_txt, tr_y = get_texts_from_file(dir + '/mex_train.txt', dir + '/mex_train_labels.txt')

In [None]:
# Conjunto de validación
val_txt, val_y = get_texts_from_file(dir + '/mex_val.txt', dir + '/mex_val_labels.txt')

In [None]:
# Tokeniza y ordena de mayor a menor frecuencia
corpus_palabras = []
for doc in tr_txt :
  corpus_palabras += tokenizer.tokenize(doc)
 
fdist = nltk.FreqDist(corpus_palabras)

In [None]:
# Ordenar  diccionario
def sortfreqDict(freqdict):
  # Creo lista de pares (freq, word)
  aux = [(freqdict[key], key) for key in freqdict]
  # Ordena e invierte
  aux.sort()
  aux.reverse()
  return aux

In [None]:
# Tomo 5000 más frecuentes
V = sortfreqDict(fdist)
V = V[:5000]

In [None]:
# Crea diccionario de acuerdo a frecuencias
dict_indices = dict()
count = 0
 
for wigth, word in V:
  dict_indices[word] = count
  count += 1

In [None]:
# Enmascara palabra con su emoción
def mask_emotion(document, word_emotions) :
  i = 0
  for word in document :
    if word in word_emotions :
      document[i] = word_emotions[word]
    i = i + 1
  return document

# Clasificación con SVM

In [None]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_recall_fscore_support, roc_auc_score

In [None]:
def svm_classification(BoW_tr, tr_y, BoW_val, tr_val) :
  parameters = {'C':[.05, .12, .25, .5, 1, 2, 4]}
 
  svr = svm.LinearSVC(class_weight = 'balanced')
  grid = GridSearchCV(estimator = svr, param_grid = parameters, n_jobs = 8, scoring= "f1_macro", cv = 5)
 
  grid.fit(BoW_tr, tr_y)
 
  y_pred = grid.predict(BoW_val)
 
  p, r, f, s = precision_recall_fscore_support(val_y, y_pred, average = 'macro', pos_label = 1) 
 
  print(confusion_matrix(val_y, y_pred))
  print(metrics.classification_report(val_y, y_pred))
 
  return [p, r, f]

# 2. Bolsa de Palabras, Bigramas y Emociones

Representa los documentos y clasifica con SVM similar al Lecture 8, pero con diferentes pesados de términos.

In [None]:
results = []

1. Evalue BoW con pesado binario

In [None]:
def build_bow_binary(tr_txt, V, dict_indices, normalized = False, emotions = False, word_emotions = {}, probabilities = False, dict_prob = {} ):
  '''
  Construye bolsa de pesado Binario
  Parámetros
  -----------
    tr_txt        : Texto en string
    dict_indices  : Diccionario de palabras ordenadas por frecuencia
    normalized    : Bool para normalizar BoW por filas
    emotions      : Bool para detertar si se debe enmascarar por emociones
    word_emotions : Diccionario de máscaras 
  Regresa
  -----------
    BoW           : Bolsa de palabras
  '''
  BOW = np.zeros((len(tr_txt), len(V)), dtype = int)
 
  cont_doc = 0
  for tr in tr_txt:
    tokens = tokenizer.tokenize(tr)
    if emotions :
      tokens = mask_emotion(tokens, word_emotions)
    fdist_doc = nltk.FreqDist(tokens)

    for word in fdist_doc:
      if word in dict_indices:
        BOW[cont_doc, dict_indices[word]] = 1
        if probabilities :
          if word in dict_prob :
            BOW[cont_doc, dict_indices[word]] *= dict_prob[word]

    
    cont_doc += 1


  if normalized :
    n = np.linalg.norm(BOW,  ord= None)
    BOW = BOW/n

  return BOW

In [None]:
BOW_binary_tr = build_bow_binary(tr_txt, V, dict_indices)
BOW_binary_tr.shape

(5544, 5000)

In [None]:
BOW_binary_val = build_bow_binary(val_txt, V, dict_indices)
BOW_binary_val.shape

(616, 5000)

## Clasificación

In [None]:
results.append(svm_classification(BOW_binary_tr, tr_y, BOW_binary_val, val_y))

[[332  65]
 [ 49 170]]
              precision    recall  f1-score   support

           0       0.87      0.84      0.85       397
           1       0.72      0.78      0.75       219

    accuracy                           0.81       616
   macro avg       0.80      0.81      0.80       616
weighted avg       0.82      0.81      0.82       616



2. Evalue BoW con pesado de frecuencias

In [None]:
 import math

def build_bow_tf(tr_txt, V, dict_indices, normalized = False, emotions = False, word_emotions = {}, probabilities = False, dict_prob = {}):
  '''
    Construye bolsa de pesado TF
    Parámetros
    -----------
      tr_txt        : Texto en string
      dict_indices  : Diccionario de palabras ordenadas por frecuencia
      normalized    : Bool para normalizar BoW por filas
      emotions      : Bool para detertar si se debe enmascarar por emociones
      word_emotions : Diccionario de máscaras
    Regresa
    -----------
      BoW           : Bolsa de palabras
  '''
  BOW = np.zeros((len(tr_txt), len(V)), dtype = np.float)
 
  cont_doc = 0
  for tr in tr_txt:
    tokens = tokenizer.tokenize(tr)
    if emotions :
      tokens = mask_emotion(tokens, word_emotions)
    fdist_doc = nltk.FreqDist(tokens)
 
    for word in fdist_doc:
      if word in dict_indices:
        BOW[cont_doc, dict_indices[word]] = fdist_doc[word]
        if probabilities :
          if word in dict_prob :
            BOW[cont_doc, dict_indices[word]] *= dict_prob[word]       

    cont_doc += 1

  if (normalized) :
    n = np.linalg.norm(BOW,  ord = None)
    BOW = BOW/n

  return BOW

In [None]:
BOW_tf_tr = build_bow_tf(tr_txt, V, dict_indices)
BOW_tf_tr.shape

(5544, 5000)

In [None]:
BOW_tf_val = build_bow_tf(val_txt, V, dict_indices)
BOW_tf_val.shape

(616, 5000)

## Clasificación

In [None]:
results.append(svm_classification(BOW_tf_tr, tr_y, BOW_tf_val, val_y))

[[334  63]
 [ 50 169]]
              precision    recall  f1-score   support

           0       0.87      0.84      0.86       397
           1       0.73      0.77      0.75       219

    accuracy                           0.82       616
   macro avg       0.80      0.81      0.80       616
weighted avg       0.82      0.82      0.82       616



## Evalue BoW con pesado tfidf

In [None]:
def get_freq_term_documents(tr_txt, dict_indices,  emotions = False, word_emotions = {}): 
  # Creo diccionario
  dict_word_doc = dict()
  # Lo lleno con valores cero
  for tr in tr_txt :
    tokens = tokenizer.tokenize(tr)
    if emotions :
      tokens = mask_emotion(tokens, word_emotions)

    fdist_doc = nltk.FreqDist(tokens)
    for word in fdist_doc:
      if word in dict_indices :
        if fdist_doc[word] > 0:
          dict_word_doc[word] = 0
  # Lleno diccionario con valor termino en documentos
  for tr in tr_txt :
    tokens = tokenizer.tokenize(tr)
    if emotions :
      tokens = mask_emotion(tokens, word_emotions)
      
    fdist_doc = nltk.FreqDist(tokens)
    for word in fdist_doc:
      if word in dict_indices :
        if fdist_doc[word] > 0:
          dict_word_doc[word] += 1
  return dict_word_doc

In [None]:
def build_bow_tfidf(tr_txt, V, dict_indices, normalized = None, emotions = False, word_emotions = {}, probabilities = False, dict_prob = {}):
  '''
  Construye bolsa de pesado TFIDF
  Parámetros
  -----------
    tr_txt        : Texto en string
    dict_indices  : Diccionario de palabras ordenadas por frecuencia
    normalized    : Bool para normalizar BoW por filas
    emotions      : Bool para detertar si se debe enmascarar por emociones
    word_emotions : Diccionario de máscaras
  Regresa
  -----------
    BoW           : Bolsa de palabras
  '''
  BOW = np.zeros((len(tr_txt), len(V)), dtype = np.float)

  cont_doc = 0
  
  # Tamaño del vocabulario
  N = len(V)

  # Cargo frecuencias termino documentos
  dict_word_doc = get_freq_term_documents(tr_txt, dict_indices, emotions = emotions, word_emotions = word_emotions)
 
  for tr in tr_txt:
    # Creo conjunto de frencuencias
    tokens = tokenizer.tokenize(tr)
    if emotions :
      tokens = mask_emotion(tokens, word_emotions)
    fdist_doc = nltk.FreqDist(tokens)

    for word in fdist_doc:
      if word in dict_indices:
        BOW[cont_doc, dict_indices[word]] = fdist_doc[word] * math.log10(N / (dict_word_doc[word] + 1) )
        if probabilities :
          if word in dict_prob :
            BOW[cont_doc, dict_indices[word]] *= dict_prob[word] 
    cont_doc += 1

  if (normalized) :
    n = np.linalg.norm(BOW,  ord = None)
    BOW = BOW/n
  return BOW

In [None]:
BOW_tfidf_tr = build_bow_tfidf(tr_txt, V, dict_indices)
BOW_tfidf_tr.shape

(5544, 5000)

In [None]:
BOW_tfidf_val = build_bow_tfidf(val_txt, V, dict_indices)
BOW_tfidf_val.shape

(616, 5000)

## Clasificación

In [None]:
results.append(svm_classification(BOW_tfidf_tr, tr_y, BOW_tfidf_val, val_y))

[[277 120]
 [ 34 185]]
              precision    recall  f1-score   support

           0       0.89      0.70      0.78       397
           1       0.61      0.84      0.71       219

    accuracy                           0.75       616
   macro avg       0.75      0.77      0.74       616
weighted avg       0.79      0.75      0.76       616



4. Evalue BoW con pesado binario normalizado l2 (no use sklearn)

In [None]:
BOW_binary_tr_normalized = build_bow_binary(tr_txt, V, dict_indices, normalized= True)
BOW_binary_tr.shape

(5544, 5000)

In [None]:
BOW_binary_val_normalized = build_bow_binary(val_txt, V, dict_indices, normalized = True)
BOW_binary_val.shape

(616, 5000)

In [None]:
results.append(svm_classification(BOW_binary_tr_normalized, tr_y, BOW_binary_val, val_y))

[[182 215]
 [ 22 197]]
              precision    recall  f1-score   support

           0       0.89      0.46      0.61       397
           1       0.48      0.90      0.62       219

    accuracy                           0.62       616
   macro avg       0.69      0.68      0.62       616
weighted avg       0.74      0.62      0.61       616



5. Evalue BoW con pesado de frecuencia normalizado l2 (no use sklearn)

In [None]:
BOW_tf_tr_normalized = build_bow_tf(tr_txt, V, dict_indices, normalized= True)
BOW_tf_tr.shape

(5544, 5000)

In [None]:
BOW_tf_val_normalized = build_bow_tf(val_txt, V, dict_indices, normalized = True)
BOW_tf_val.shape

(616, 5000)

In [None]:
results.append(svm_classification(BOW_tf_tr_normalized, tr_y, BOW_tf_val, val_y))

[[148 249]
 [ 20 199]]
              precision    recall  f1-score   support

           0       0.88      0.37      0.52       397
           1       0.44      0.91      0.60       219

    accuracy                           0.56       616
   macro avg       0.66      0.64      0.56       616
weighted avg       0.73      0.56      0.55       616



6. Evalue BoW con pesado tfidf normalizado l2 (no use sklearn)

In [None]:
BOW_tfidf_tr_normalized = build_bow_tfidf(tr_txt, V, dict_indices, normalized= True)
BOW_tfidf_tr.shape

(5544, 5000)

In [None]:
BOW_tfidf_val_normalized = build_bow_tfidf(val_txt, V, dict_indices, normalized = True)
BOW_tfidf_val.shape

(616, 5000)

In [None]:
results.append(svm_classification(BOW_tfidf_tr_normalized, tr_y, BOW_tfidf_val, val_y))

[[210 187]
 [ 29 190]]
              precision    recall  f1-score   support

           0       0.88      0.53      0.66       397
           1       0.50      0.87      0.64       219

    accuracy                           0.65       616
   macro avg       0.69      0.70      0.65       616
weighted avg       0.75      0.65      0.65       616



7. Ponga una tabla comparativa a modo de resumen con las seis entradas anteriores

In [None]:
from tabulate import tabulate

In [None]:
show_results =  [['Binary'] + results[0], 
                ['TF'] + results[1],
                ['TFIDF'] + results[2],
                ['Binary Normalized'] + results[3],
                ['TF Normalized'] + results[4],
                ['TDIDF Normaized'] + results[5]]

In [None]:
print(tabulate(show_results, headers= ['Algorithm','Precision', 'Recall', 'f1 Score']))

Algorithm            Precision    Recall    f1 Score
-----------------  -----------  --------  ----------
Binary                0.797398  0.806264    0.801185
TF                    0.79912   0.8065      0.80238
TFIDF                 0.748616  0.771241    0.744296
Binary Normalized     0.685156  0.678991    0.615031
TF Normalized         0.662574  0.640736    0.560298
TDIDF Normaized       0.69132   0.698274    0.648981


8. De las configuraciones anteriores elija la mejor y evalúela con más y menos términos (e.g., 1000 y 7000). Ponga una tabla dónde compare las tres configuraciones.

In [None]:
results = []

n = 1000

In [None]:
V_1000 = sortfreqDict(fdist)
V_1000 = V_1000[:1000]

# Crea diccionario de acuerdo a frecuencias
dict_indices_1000 = dict()
count = 0
 
for wigth, word in V_1000:
  dict_indices_1000[word] = count
  count += 1

In [None]:
BOW_tf_tr_1000 = build_bow_tf(tr_txt, V_1000, dict_indices_1000)
BOW_tf_tr_1000.shape

(5544, 1000)

In [None]:
BOW_tf_val_1000 = build_bow_tf(val_txt, V_1000, dict_indices_1000)
BOW_tf_val_1000.shape

(616, 1000)

In [None]:
results.append(svm_classification(BOW_tf_tr_1000, tr_y, BOW_tf_val_1000, val_y))

[[334  63]
 [ 48 171]]
              precision    recall  f1-score   support

           0       0.87      0.84      0.86       397
           1       0.73      0.78      0.75       219

    accuracy                           0.82       616
   macro avg       0.80      0.81      0.81       616
weighted avg       0.82      0.82      0.82       616



n = 6000

In [None]:
V_6000 = sortfreqDict(fdist)
V_6000 = V_6000[:6000]

# Crea diccionario de acuerdo a frecuencias
dict_indices_6000 = dict()
count = 0
 
for wigth, word in V_6000:
  dict_indices_6000[word] = count
  count += 1

In [None]:
BOW_tf_tr_6000 = build_bow_tf(tr_txt, V_6000, dict_indices_6000)
BOW_tf_tr_6000.shape

(5544, 6000)

In [None]:
BOW_tf_val_6000 = build_bow_tf(val_txt, V_6000, dict_indices_6000)
BOW_tf_val_6000.shape

(616, 6000)

In [None]:
results.append(svm_classification(BOW_tf_tr_6000, tr_y, BOW_tf_val_6000, val_y))

[[334  63]
 [ 50 169]]
              precision    recall  f1-score   support

           0       0.87      0.84      0.86       397
           1       0.73      0.77      0.75       219

    accuracy                           0.82       616
   macro avg       0.80      0.81      0.80       616
weighted avg       0.82      0.82      0.82       616



n = 7000

In [None]:
V_7000 = sortfreqDict(fdist)
V_7000 = V_7000[:7000]

# Crea diccionario de acuerdo a frecuencias
dict_indices_7000 = dict()
count = 0
 
for wigth, word in V_7000:
  dict_indices_7000[word] = count
  count += 1

In [None]:
BOW_tf_tr_7000 = build_bow_tf(tr_txt, V_7000, dict_indices_7000)
BOW_tf_tr_7000.shape

(5544, 7000)

In [None]:
BOW_tf_val_7000 = build_bow_tf(val_txt, V_7000, dict_indices_7000)
BOW_tf_val_7000.shape

(616, 7000)

In [None]:
results.append(svm_classification(BOW_tf_tr_7000, tr_y, BOW_tf_val_7000, val_y))

[[334  63]
 [ 50 169]]
              precision    recall  f1-score   support

           0       0.87      0.84      0.86       397
           1       0.73      0.77      0.75       219

    accuracy                           0.82       616
   macro avg       0.80      0.81      0.80       616
weighted avg       0.82      0.82      0.82       616



Todos los datos

In [None]:
V_All = sortfreqDict(fdist)

# Crea diccionario de acuerdo a frecuencias
dict_indices_All = dict()
count = 0
 
for wigth, word in V_All:
  dict_indices_All[word] = count
  count += 1

In [None]:
BOW_tf_tr_All = build_bow_tf(tr_txt, V_All, dict_indices_All)
BOW_tf_tr_All.shape

(5544, 13523)

In [None]:
BOW_tf_val_All = build_bow_tf(val_txt, V_All, dict_indices_All)
BOW_tf_val_All.shape

(616, 13523)

In [None]:
results.append(svm_classification(BOW_tf_tr_All, tr_y, BOW_tf_val_All, val_y))



[[336  61]
 [ 52 167]]
              precision    recall  f1-score   support

           0       0.87      0.85      0.86       397
           1       0.73      0.76      0.75       219

    accuracy                           0.82       616
   macro avg       0.80      0.80      0.80       616
weighted avg       0.82      0.82      0.82       616



### Resultados

In [None]:
show_results =  [ ['TF (1000)'] + results[0], ['TF (6000)'] + results[1], ['TF (7000)'] + results[2],['TF (13523)'] + results[3] ]

In [None]:
print(tabulate(show_results, headers= ['Algorithm','Precision', 'Recall', 'f1 Score']))

Algorithm      Precision    Recall    f1 Score
-----------  -----------  --------  ----------
TF (1000)       0.802557  0.811066    0.806238
TF (6000)       0.79912   0.8065      0.80238
TF (7000)       0.79912   0.8065      0.80238
TF (13523)      0.799218  0.804452    0.801627


9. De las configuraciones anteriores elija la mejor y concatene una bolsa-de-bigramas de tamaño 1000 con algún pesado que usted determine y normalizada con l2. Evalúela en clasificación y discuta si ve mejoría.

**Nota:** Conforme se aumenta la cantidad de palabras se obtienen mejores resultados, se tomarán 6000 para evitar warnings

In [None]:
# Tokeniza y ordena de mayor a menor frecuencia
corpus_bigramas = []
for doc in tr_txt :
  corpus_bigramas += nltk.bigrams(tokenizer.tokenize(doc))
 
fdist_bigrams = nltk.FreqDist(corpus_bigramas)

In [None]:
# Tomo 1000 más frecuentes
B = sortfreqDict(fdist_bigrams)
B = B[:1000]

In [None]:
# Crea diccionario de acuerdo a frecuencias
dict_indices_bigrams = dict()
count = 0
for wigth, word in B:
  dict_indices_bigrams[word] = count
  count += 1

In [None]:
def build_bow_tf_bigrams(tr_txt, V, dict_indices, normalized = False):
  BOW = np.zeros((len(tr_txt), len(V)), dtype = np.float)

  cont_doc = 0
  for tr in tr_txt :
    # Creo conjunto de frencuencias
    fdist_doc = nltk.FreqDist(nltk.bigrams(tokenizer.tokenize(tr)))
 
    for word in fdist_doc :
      if word in dict_indices:
        BOW[cont_doc, dict_indices[word]] = math.log10(1 + fdist_doc[word])    
    cont_doc += 1

  if normalized :
    n = np.linalg.norm(BOW,  ord = None)
    BOW = BOW/n

  return BOW

In [None]:
BOW_tf_tr_bigrams = build_bow_tf_bigrams(tr_txt, B, dict_indices_bigrams, normalized=True)
BOW_tf_tr_bigrams.shape

(5544, 1000)

In [None]:
BOW_tf_val_bigrams = build_bow_tf_bigrams(val_txt, B, dict_indices_bigrams, normalized= True)
BOW_tf_val_bigrams.shape

(616, 1000)

### Concatenación

In [None]:
training = np.concatenate((BOW_tf_tr_6000, BOW_tf_tr_bigrams), axis = 1)
testing = np.concatenate((BOW_tf_val_6000, BOW_tf_val_bigrams), axis = 1)
print(training.shape)
print(testing.shape)

(5544, 7000)
(616, 7000)


In [None]:
results.append(svm_classification(training, tr_y, testing, val_y))

[[334  63]
 [ 50 169]]
              precision    recall  f1-score   support

           0       0.87      0.84      0.86       397
           1       0.73      0.77      0.75       219

    accuracy                           0.82       616
   macro avg       0.80      0.81      0.80       616
weighted avg       0.82      0.82      0.82       616



In [None]:
show_results =  [ ['TF (1000)'] + results[0], ['TF (6000)'] + results[1], ['TF (7000)'] + results[2],['TF (13523)'] + results[3], ['TF (6000) + BoE'] + results[4]  ]

In [None]:
print(tabulate(show_results, headers= ['Algorithm','Precision', 'Recall', 'f1 Score']))

Algorithm          Precision    Recall    f1 Score
---------------  -----------  --------  ----------
TF (1000)           0.802557  0.811066    0.806238
TF (6000)           0.79912   0.8065      0.80238
TF (7000)           0.79912   0.8065      0.80238
TF (13523)          0.799218  0.804452    0.801627
TF (6000) + BoE     0.79912   0.8065      0.80238


**Nota:** El rendimiento se mantuvo igual agregando bigramas, esto se debe a los bigramas casi no se repiten, entonces la matriz que se concatena es casi cero, más sparse que la matriz de BoW

10. Utilice el recurso léxico del Consejo Nacional de Investigación de Canadá llamado "EmoLex" (https://www.saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm) para
construir una "Bolsa de Emociones" de los Tweets de agresividad (Debe usar EmoLex en Español). Para esto, una estrategia sencilla sería enmascarar cada palabra con su emoción, y después construir la Bolsa de Emociones (BoE).

In [None]:
dir = '/content/drive/MyDrive/CIMAT/Segundo Semestre/Lenguaje Natural/T02'

Para extraer las máscaras le pedí ayuda a **Abdiel** porque no había notado que había palabras repetidas, entonces si hay repetidas se toma la de mayor probabilidad y si hay dos con la misma probablidad se lanza un volado

In [None]:
def construct_BoE_dic(path_corpus) :
  with open (path_corpus, "r", encoding = 'utf-8') as f :
    word_emotions = dict()
    oneScore      = dict()
    header = 0
    for line in f :
      if header == 0 :
        tmp = line.split()
        header = 1
        continue
      else :
        words = re.split(r'\t+', line)
        Wscore = float(words[3])
        if words[1] != 'NO TRANSLATION' :
          w = words[1].lower()
          if not w in word_emotions :
            word_emotions[w] = words[2]
            oneScore[w]      = Wscore
          else :
            if Wscore > oneScore[w] :
              word_emotions[w] = words[2]
              oneScore[w]      = Wscore
            elif Wscore == oneScore[w] :
              if np.random.rand() >= 0.5 :
                word_emotions[w] = words[2]
          
            
  return word_emotions

In [None]:
word_emotions = construct_BoE_dic(dir + "/Spanish-es-NRC-Emotion-Intensity-Lexicon-v1.txt")
print(len(word_emotions))

4695


In [None]:
corpus_emotions = []
for doc in tr_txt :
  tokens = tokenizer.tokenize(doc)
  tokens = mask_emotion(tokens, word_emotions)
  corpus_emotions += tokens
fdist = nltk.FreqDist(corpus_emotions)

In [None]:
# Tomo 5000 más frecuentes
VE = sortfreqDict(fdist)
VE = VE[:5000]

In [None]:
# Crea diccionario de acuerdo a frecuencias
dict_indices_emotions = dict()
count = 0
 
for wigth, word in VE:
  dict_indices_emotions[word] = count
  count += 1

## Bolsa con Pesado Binario

In [None]:
BOW_binary_tr = build_bow_binary(tr_txt, VE, dict_indices_emotions, emotions = True, word_emotions = word_emotions)
BOW_binary_tr.shape

(5544, 5000)

In [None]:
BOW_binary_val = build_bow_binary(val_txt, VE, dict_indices_emotions, emotions = True, word_emotions = word_emotions)
BOW_binary_val.shape

(616, 5000)

### Normalizada

In [None]:
BOW_binary_tr_normalized = build_bow_binary(tr_txt, VE, dict_indices_emotions,normalized= True, emotions = True, word_emotions = word_emotions)
BOW_binary_tr_normalized.shape

(5544, 5000)

In [None]:
BOW_binary_val_normalized = build_bow_binary(val_txt, VE, dict_indices_emotions, normalized = True, emotions = True, word_emotions = word_emotions)
BOW_binary_val_normalized.shape

(616, 5000)

## Bolsa de Pesado TF

In [None]:
BOW_tf_tr = build_bow_tf(tr_txt, VE, dict_indices_emotions, emotions = True, word_emotions = word_emotions)
BOW_tf_tr.shape

(5544, 5000)

In [None]:
BOW_tf_val = build_bow_tf(val_txt, VE, dict_indices_emotions, emotions = True, word_emotions = word_emotions)
BOW_tf_val.shape

(616, 5000)

### Normalizada

In [None]:
BOW_tf_tr_normalized = build_bow_tf(tr_txt, VE, dict_indices_emotions,normalized= True, emotions=True, word_emotions = word_emotions)
BOW_tf_tr_normalized.shape

(5544, 5000)

In [None]:
BOW_tf_val_normalized = build_bow_tf(val_txt, VE, dict_indices_emotions, normalized = True, emotions = True, word_emotions = word_emotions)
BOW_tf_val_normalized.shape

(616, 5000)

## Bolsa de Pesado TFIDF

In [None]:
mask_emotion(tr_txt[0], word_emotions)

'lo peor de todo es que no me dan por un tiempo y luego vuelven estoy hasta la verga de estl\n'

In [None]:
BOW_tfidf_tr = build_bow_tfidf(tr_txt, VE, dict_indices_emotions, emotions = True, word_emotions = word_emotions)
BOW_tfidf_tr.shape

(5544, 5000)

In [None]:
BOW_tfidf_val = build_bow_tfidf(val_txt, VE, dict_indices_emotions, emotions = True, word_emotions = word_emotions)
BOW_tfidf_val.shape

(616, 5000)

### Normalizada

In [None]:
BOW_tfidf_tr_normalized = build_bow_tfidf(tr_txt, VE, dict_indices_emotions,normalized= True, emotions=True, word_emotions = word_emotions)
BOW_tfidf_tr_normalized.shape

(5544, 5000)

In [None]:
BOW_tfidf_val_normalized = build_bow_tfidf(val_txt, VE, dict_indices_emotions, normalized = True, emotions = True, word_emotions = word_emotions)
BOW_tfidf_val_normalized.shape

(616, 5000)

11. Evaluá tú BoE clasificando con SVM. Ponga una tabla comparativa a modo de resumen con los tres pesados, normalize cada uno si lo cree conveniente.

In [None]:
results = []

In [None]:
results.append(svm_classification(BOW_binary_tr, tr_y, BOW_binary_val, val_y))
results.append(svm_classification(BOW_tf_tr, tr_y, BOW_tf_val, val_y))
results.append(svm_classification(BOW_tfidf_tr, tr_y, BOW_tfidf_val, val_y))
results.append(svm_classification(BOW_binary_tr_normalized, tr_y, BOW_tf_val_normalized, val_y))
results.append(svm_classification(BOW_tf_tr_normalized, tr_y, BOW_tf_val_normalized, val_y))
results.append(svm_classification(BOW_tfidf_tr_normalized, tr_y, BOW_tfidf_val_normalized, val_y))

[[331  66]
 [ 57 162]]
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       397
           1       0.71      0.74      0.72       219

    accuracy                           0.80       616
   macro avg       0.78      0.79      0.78       616
weighted avg       0.80      0.80      0.80       616

[[330  67]
 [ 54 165]]
              precision    recall  f1-score   support

           0       0.86      0.83      0.85       397
           1       0.71      0.75      0.73       219

    accuracy                           0.80       616
   macro avg       0.79      0.79      0.79       616
weighted avg       0.81      0.80      0.80       616

[[274 123]
 [ 34 185]]
              precision    recall  f1-score   support

           0       0.89      0.69      0.78       397
           1       0.60      0.84      0.70       219

    accuracy                           0.75       616
   macro avg       0.75      0.77      0.74       616
weigh

In [None]:
show_results =  [['Binary'] + results[0], 
                ['TF'] + results[1],
                ['TFIDF'] + results[2],
                ['Binary Normalized'] + results[3],
                ['TF Normalized'] + results[4],
                ['TDIDF Normaized'] + results[5]]

In [None]:
print(tabulate(show_results, headers= ['Algorithm','Precision', 'Recall', 'f1 Score']))

Algorithm            Precision    Recall    f1 Score
-----------------  -----------  --------  ----------
Binary                0.78181   0.78674     0.784072
TF                    0.785291  0.792329    0.788389
TFIDF                 0.74513   0.767463    0.739696
Binary Normalized     0.665512  0.676386    0.637927
TF Normalized         0.672248  0.681504    0.638635
TDIDF Normaized       0.682325  0.696065    0.659914


# 3. Recurso Lingüístico de Emociones Mexicano

1. Utilice el recurso léxico llamado "Spanish Emotion Lexicon (SEL)" del Dr. Grigori Sidorov, profesor del Centro de Investigación en Computación (CIC) del Instituto Politecnico Nacional (http://www.cic.ipn.mx/~sidorov/), para enmascarar cada palabra con su emoción,
y después construir la Bolsa de Emociones con algún pesado (e.g., binario, tf, tfidf). Proponga alguna estrategia para incorporar el "valor" del "Probability Factor of Affective
use" en su representación vectorial del documento. Evalúa y escribe una tabla comparativa a modo de resumen con al menos tres pesados: binario, frecuencia, tiidf. Normalize cada pesado según lo crea conveniente de acuerdo el experimento (1).

In [None]:
def construct_BoE_Info(path_corpus) :
  with open (path_corpus, "r", encoding = 'latin-1') as f :
    word_emotions = dict()
    oneScore      = dict()
    header = 0
    for line in f :
      if header == 0 :
        tmp = line.split()
        header = 1
        continue
      else :
        words = re.split(r'\t+', line)
        Wscore = float(words[1])
        w = words[0].lower()
        if not w in word_emotions :
          word_emotions[w] = words[2]
          oneScore[w]      = Wscore
        else :
          if Wscore > oneScore[w] :
            word_emotions[w] = words[2]
            oneScore[w]      = Wscore
          elif Wscore == oneScore[w] :
            if np.random.rand() >= 0.5 :
              word_emotions[w] = words[2]
          
            
  return word_emotions, oneScore

In [None]:
emotion_dict, emotion_prob = construct_BoE_Info(dir + '/SEL.txt')
print(len(emotion_dict))
print(len(emotion_prob))

1909
1909


In [None]:
# Tokeniza y ordena de mayor a menor frecuencia
corpus_emotions = []
for doc in tr_txt :
  tokens = tokenizer.tokenize(doc)
  tokens = mask_emotion(tokens, emotion_dict)
  corpus_emotions += tokens
 
fdist = nltk.FreqDist(corpus_emotions)

In [None]:
# Tomo 5000 más frecuentes
VSEL = sortfreqDict(fdist)
VSEL = VSEL[:5000]

In [None]:
# Crea diccionario de acuerdo a frecuencias
dict_indices_emotions = dict()
count = 0
 
for wigth, word in VE:
  dict_indices_emotions[word] = count
  count += 1

## Bolsa con Pesado Binario

In [None]:
BOW_binary_tr = build_bow_binary(tr_txt, VSEL, dict_indices_emotions, emotions = True, word_emotions = emotion_dict, probabilities = True, dict_prob = emotion_prob)


In [None]:
BOW_binary_val = build_bow_binary(val_txt, VSEL, dict_indices_emotions, emotions = True, word_emotions = word_emotions)
BOW_binary_val.shape

(616, 5000)

### Normalizada

In [None]:
BOW_binary_tr_normalized = build_bow_binary(tr_txt, VSEL, dict_indices_emotions,normalized= True, emotions = True, word_emotions = word_emotions, probabilities = True, dict_prob = emotion_prob)
BOW_binary_tr_normalized.shape

(5544, 5000)

In [None]:
BOW_binary_val_normalized = build_bow_binary(val_txt, VSEL, dict_indices_emotions, normalized = True, emotions = True, word_emotions = word_emotions,probabilities = True, dict_prob = emotion_prob)
BOW_binary_val_normalized.shape

(616, 5000)

## Bolsa de Pesado TF

In [None]:
BOW_tf_tr = build_bow_tf(tr_txt, VSEL, dict_indices_emotions, emotions = True, word_emotions = word_emotions, probabilities = True, dict_prob = emotion_prob)
BOW_tf_tr.shape

(5544, 5000)

In [None]:
BOW_tf_val = build_bow_tf(val_txt, VSEL, dict_indices_emotions, emotions = True, word_emotions = word_emotions, probabilities = True, dict_prob = emotion_prob)
BOW_tf_val.shape

(616, 5000)

### Normalizada

In [None]:
BOW_tf_tr_normalized = build_bow_tf(tr_txt, VSEL, dict_indices_emotions,normalized= True, emotions=True, word_emotions = word_emotions, probabilities = True, dict_prob = emotion_prob)
BOW_tf_tr_normalized.shape

(5544, 5000)

In [None]:
BOW_tf_val_normalized = build_bow_tf(val_txt, VSEL, dict_indices_emotions, normalized = True, emotions = True, word_emotions = word_emotions, probabilities = True, dict_prob = emotion_prob)
BOW_tf_val_normalized.shape

(616, 5000)

## Bolsa de Pesado TFIDF

In [None]:
BOW_tfidf_val_normalized = build_bow_tfidf(val_txt, VSEL, dict_indices_emotions, normalized = True, emotions = True, word_emotions = word_emotions, probabilities = True, dict_prob = emotion_prob)
BOW_tfidf_tr.shape

(5544, 5000)

In [None]:
BOW_tfidf_val_normalized = build_bow_tfidf(val_txt, VSEL, dict_indices_emotions, normalized = True, emotions = True, word_emotions = word_emotions, probabilities = True, dict_prob = emotion_prob)
BOW_tfidf_val.shape

(616, 5000)

### Normalizada

In [None]:
BOW_tfidf_tr_normalized = build_bow_tfidf(tr_txt, VSEL, dict_indices_emotions,normalized= True, emotions=True, word_emotions = word_emotions, probabilities = True, dict_prob = emotion_prob)
BOW_tfidf_tr_normalized.shape

(5544, 5000)

In [None]:
BOW_tfidf_val_normalized = build_bow_tfidf(val_txt, VSEL, dict_indices_emotions, normalized = True, emotions = True, word_emotions = word_emotions, probabilities = True, dict_prob = emotion_prob)
BOW_tfidf_val_normalized.shape

(616, 5000)

11. Evaluá tú BoE clasificando con SVM. Ponga una tabla comparativa a modo de resumen con los tres pesados, normalize cada uno si lo cree conveniente.

In [None]:
results = []

In [None]:
results.append(svm_classification(BOW_binary_tr, tr_y, BOW_binary_val, val_y))
results.append(svm_classification(BOW_tf_tr, tr_y, BOW_tf_val, val_y))
results.append(svm_classification(BOW_tfidf_tr, tr_y, BOW_tfidf_val, val_y))
results.append(svm_classification(BOW_binary_tr_normalized, tr_y, BOW_tf_val_normalized, val_y))
results.append(svm_classification(BOW_tf_tr_normalized, tr_y, BOW_tf_val_normalized, val_y))
results.append(svm_classification(BOW_tfidf_tr_normalized, tr_y, BOW_tfidf_val_normalized, val_y))

[[336  61]
 [ 56 163]]
              precision    recall  f1-score   support

           0       0.86      0.85      0.85       397
           1       0.73      0.74      0.74       219

    accuracy                           0.81       616
   macro avg       0.79      0.80      0.79       616
weighted avg       0.81      0.81      0.81       616

[[331  66]
 [ 55 164]]
              precision    recall  f1-score   support

           0       0.86      0.83      0.85       397
           1       0.71      0.75      0.73       219

    accuracy                           0.80       616
   macro avg       0.79      0.79      0.79       616
weighted avg       0.81      0.80      0.80       616

[[274 123]
 [ 34 185]]
              precision    recall  f1-score   support

           0       0.89      0.69      0.78       397
           1       0.60      0.84      0.70       219

    accuracy                           0.75       616
   macro avg       0.75      0.77      0.74       616
weigh

In [None]:
show_results =  [['Binary'] + results[0], 
                ['TF'] + results[1],
                ['TFIDF'] + results[2],
                ['Binary Normalized'] + results[3],
                ['TF Normalized'] + results[4],
                ['TDIDF Normaized'] + results[5]]

In [None]:
print(tabulate(show_results, headers= ['Algorithm','Precision', 'Recall', 'f1 Score']))

Algorithm            Precision    Recall    f1 Score
-----------------  -----------  --------  ----------
Binary                0.792411  0.79532     0.793801
TF                    0.785278  0.791306    0.787989
TFIDF                 0.74513   0.767463    0.739696
Binary Normalized     0.666527  0.677645    0.639477
TF Normalized         0.671268  0.680245    0.637068
TDIDF Normaized       0.683353  0.697325    0.661455


2. En un comentario aparte, discuta sobre la estrategía que utilizó para incorporar el "Probability Factor of Affective use". No más de 5 renglones.

La probabilidad de uso afectivo de alguna manera pondera a la palabra dentro del texto. Entonces el pesado de la palabra se debe operar con la PFA para darle esta información a la BoE. Así que reviso si dicha palabra está en los tokens y de ser verdadero multiplico su peso por la PFA.

# 4. ¿Le podemos ganar a BoW con Bigramas?

1. Combine algo de emociones con Bolsa de Palabras y Bigramas. Para construir la representación final del documento utilice la concatenación de las representaciones según sus observaciones (e.g., Bolsa de Palabras + Bolsa de Bigramas + Bolsa de Sentimientos de
Canada + Bolsa de Sentimientos de Grigori), y aliméntelas a un SVM.

En la experimentación anterior se observó que TF obtuvo mejores resultados, por lo que se realizará el pesado con TF tanto para BOW, Bigramas y BoE. Además, no se normalizará porque se observó que en todas las pruebas la normalización emperó el rendimiento del SVM.

### Resultados con BoW obtenidos al principio

```
Algorithm            Precision    Recall    f1 Score
-----------------  -----------  --------  ----------
Binary                0.797398  0.806264    0.801185
TF                    0.79912   0.8065      0.80238
TFIDF                 0.748616  0.771241    0.744296
Binary Normalized     0.685156  0.678991    0.615031
TF Normalized         0.662574  0.640736    0.560298
TDIDF Normaized       0.69132   0.698274    0.648981
```



### BoW

In [None]:
# Tomo 5000 más frecuentes
V = sortfreqDict(fdist)
V = V[:5000]

In [None]:
# Crea diccionario de acuerdo a frecuencias
dict_indices = dict()
count = 0
 
for wigth, word in V:
  dict_indices[word] = count
  count += 1

In [None]:
BOW_tf_tr = build_bow_tf(tr_txt, V, dict_indices)
BOW_tf_tr.shape

(5544, 5000)

In [None]:
BOW_tf_val = build_bow_tf(val_txt, V, dict_indices)
BOW_tf_val.shape

(616, 5000)

### Bigramas

In [None]:
# Tokeniza y ordena de mayor a menor frecuencia
corpus_bigramas = []
for doc in tr_txt :
  corpus_bigramas += nltk.bigrams(tokenizer.tokenize(doc))
 
fdist_bigrams = nltk.FreqDist(corpus_bigramas)

In [None]:
# Tomo 1000 más frecuentes
B = sortfreqDict(fdist_bigrams)
B = B[:1000]

In [None]:
# Crea diccionario de acuerdo a frecuencias
dict_indices_bigrams = dict()
count = 0
for wigth, word in B:
  dict_indices_bigrams[word] = count
  count += 1

In [None]:
BOW_tf_tr_bigrams = build_bow_tf_bigrams(tr_txt, B, dict_indices_bigrams, normalized=True)
BOW_tf_tr_bigrams.shape

(5544, 1000)

In [None]:
BOW_tf_val_bigrams = build_bow_tf_bigrams(val_txt, B, dict_indices_bigrams, normalized= True)
BOW_tf_val_bigrams.shape

(616, 1000)

In [None]:
training = np.concatenate((BOW_tf_tr, BOW_tf_tr_bigrams), axis = 1)
testing = np.concatenate((BOW_tf_val, BOW_tf_val_bigrams), axis = 1)
print(training.shape)
print(testing.shape)

(5544, 6000)
(616, 6000)


### Emociones 1

In [None]:
word_emotions = construct_BoE_dic(dir + "/Spanish-es-NRC-Emotion-Intensity-Lexicon-v1.txt")
print(len(word_emotions))

4695


In [None]:
corpus_emotions = []
for doc in tr_txt :
  tokens = tokenizer.tokenize(doc)
  tokens = mask_emotion(tokens, word_emotions)
  corpus_emotions += tokens
fdist = nltk.FreqDist(corpus_emotions)

In [None]:
# Tomo 5000 más frecuentes
VE = sortfreqDict(fdist)
VE = VE[:1000]

In [None]:
# Crea diccionario de acuerdo a frecuencias
dict_indices_emotions = dict()
count = 0
 
for wigth, word in VE:
  dict_indices_emotions[word] = count
  count += 1

In [None]:
BOW_tf_tr_VE = build_bow_tf(tr_txt, VE, dict_indices_emotions, emotions = True, word_emotions = word_emotions)
BOW_tf_tr_VE.shape

(5544, 1000)

In [None]:
BOW_tf_val_VE = build_bow_tf(val_txt, VE, dict_indices_emotions, emotions = True, word_emotions = word_emotions)
BOW_tf_val.shape

(616, 5000)

In [None]:
training = np.concatenate((training, BOW_tf_tr_VE), axis = 1)
testing = np.concatenate((testing, BOW_tf_val_VE), axis = 1)
print(training.shape)
print(testing.shape)

(5544, 7000)
(616, 7000)


### Emociones 2

In [None]:
emotion_dict, emotion_prob = construct_BoE_Info(dir + '/SEL.txt')
print(len(emotion_dict))
print(len(emotion_prob))

1909
1909


In [None]:
# Tokeniza y ordena de mayor a menor frecuencia
corpus_emotions = []
for doc in tr_txt :
  tokens = tokenizer.tokenize(doc)
  tokens = mask_emotion(tokens, emotion_dict)
  corpus_emotions += tokens
 
fdist = nltk.FreqDist(corpus_emotions)

In [None]:
# Tomo 5000 más frecuentes
VSEL = sortfreqDict(fdist)
VSEL = VSEL[:1000]

In [None]:
# Crea diccionario de acuerdo a frecuencias
dict_indices_emotions = dict()
count = 0
 
for wigth, word in VE:
  dict_indices_emotions[word] = count
  count += 1

In [None]:
BOW_tf_tr_SEL = build_bow_tf(tr_txt, VSEL, dict_indices_emotions, emotions = True, word_emotions = word_emotions, probabilities = True, dict_prob = emotion_prob)
BOW_tf_tr_SEL.shape

(5544, 1000)

In [None]:
BOW_tf_val_SEL = build_bow_tf(val_txt, VSEL, dict_indices_emotions, emotions = True, word_emotions = word_emotions, probabilities = True, dict_prob = emotion_prob)
BOW_tf_val_SEL.shape

(616, 1000)

In [None]:
training = np.concatenate((training, BOW_tf_tr_SEL), axis = 1)
testing = np.concatenate((testing, BOW_tf_val_SEL), axis = 1)
print(training.shape)
print(testing.shape)

(5544, 8000)
(616, 8000)


### Resultados

In [None]:
results = []

In [None]:
results.append(svm_classification(training, tr_y, testing, val_y))



[[330  67]
 [ 53 166]]
              precision    recall  f1-score   support

           0       0.86      0.83      0.85       397
           1       0.71      0.76      0.73       219

    accuracy                           0.81       616
   macro avg       0.79      0.79      0.79       616
weighted avg       0.81      0.81      0.81       616



In [None]:
show_results =  [['Final'] + results[0]]

In [None]:
print(tabulate(show_results, headers= ['Algorithm','Precision', 'Recall', 'f1 Score']))

Algorithm      Precision    Recall    f1 Score
-----------  -----------  --------  ----------
Final           0.787033  0.794613    0.790334


2. Elabore conclusiones sobre toda esta Tarea, incluyendo observaciones, comentarios y posibles mejoras futuras. Discuta que tanto pudo mejorar la BoW con Bigramas, o ¿empeoró?. Discuta también brevemente el costo computacional de los experimentos.
Sea breve: todo en NO más de un párrafo; máximo dos.

Para el conjunto de datos dado, usando las 5000 parabras más frecuentes, los resultados indicaron que en todos los indicadores, el pesado TF se comportó mejor que el pesado binario y el TFIDF. Además se observó que cuando se concatenaron bigramas, el resultado con pesado TF se maltuvo casi igual al original. Lo mismo se observó al hacer a la BoW robusta con bigramas y emociones.

El costo computacional aumentó significativamente (al menos en tiempo) cuando se utilizaron matrices más grandes. Un problema observado fue que casi todas las palabras en el corpus no tenían emociones asociadas, en parte por la manera informal de usar el lenguaje, entonces se podría mejorar puliendo el corpus para identificar dichas palabras dentro de la BoE.