# Modelos básicos

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
import seaborn as sns

import re
from nltk.tokenize import TweetTokenizer
import spacy
from utils import preprocessing

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

from torchtext.data import Field
from torchtext.vocab import GloVe, Vectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sentence_transformers import SentenceTransformer

import warnings
warnings.filterwarnings('ignore')

Ver [Trained Models & Pipelines](https://spacy.io/models) para los modelos de SpaCy.

In [2]:
en = spacy.load('en_core_web_md')

def tokenize_en(sentence):
    """Tokenize an English sentence with the spaCy tokenizer of `en` and return the token texts."""
    token_texts = []
    for token in en.tokenizer(sentence):
        token_texts.append(token.text)
    return token_texts

es = spacy.load('es_core_news_md')

def tokenize_es(sentence):
    """Tokenize a Spanish sentence with the spaCy tokenizer of `es` and return the token texts."""
    token_texts = []
    for token in es.tokenizer(sentence):
        token_texts.append(token.text)
    return token_texts

In [3]:
# Download the pre-trained GloVe 'twitter.27B' embeddings (dim=200) up front;
# a later cell requests them by name as 'glove.twitter.27B.200d'.
GloVe(name='twitter.27B', dim=200)

<torchtext.vocab.GloVe at 0x7f8d2cf7a9e8>

## Datos

In [4]:
# Quick look at the raw test split: shape of the rows whose language is English
test = pd.read_csv('../../Data/EXIST2021_test.tsv', sep='\t')
test.loc[test['language'] == 'en'].shape

(2208, 7)

In [5]:
# Load the training split and keep only the rows whose task1 label is 'sexist'.
train = pd.read_csv('../../Data/EXIST2021_training.tsv', sep='\t')
# train = pd.read_csv('../../Data/EXIST2021_training_spell_checked.csv', sep=',')
# .copy() detaches the filtered rows from the original frame, so the column
# assignment below is well-defined (no SettingWithCopy ambiguity).
train = train[train['task1'] == 'sexist'].copy()

# Light text pre-processing
train['text'] = train['text'].apply(preprocessing.preprocess)

# Per-language views used by the models below
train_en = train[train['language'] == 'en']
train_es = train[train['language'] == 'es']

train.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2
0,EXIST2021,1,twitter,en,"she calls herself "" anti-feminazi "" how about ...",sexist,ideological-inequality
2,EXIST2021,3,twitter,en,"wow , your skirt is very short . what is it's ...",sexist,objectification
5,EXIST2021,6,twitter,en,hello ... m raj ... m with good size and excel...,sexist,sexual-violence
10,EXIST2021,11,twitter,en,you weren't actually what ? this is not a pain...,sexist,ideological-inequality
15,EXIST2021,16,twitter,en,i don't know any feminists who think the way y...,sexist,ideological-inequality


In [6]:
# Load the test split and keep only the rows whose task1 label is 'sexist'.
test = pd.read_csv('../../Data/EXIST2021_test.tsv', sep='\t')
# test = pd.read_csv('../../Data/EXIST2021_test_spell_checked.csv', sep=',')
# .copy() detaches the filtered rows from the original frame, so the column
# assignment below is well-defined (no SettingWithCopy ambiguity).
test = test[test['task1'] == 'sexist'].copy()

# Light text pre-processing (same function as for the training split)
test['text'] = test['text'].apply(preprocessing.preprocess)

# Per-language views used by the models below
test_en = test[test['language'] == 'en']
test_es = test[test['language'] == 'es']

test.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2
2,EXIST2021,6980,twitter,en,"lol ! "" this behavior of not letting men tell ...",sexist,ideological-inequality
3,EXIST2021,6981,twitter,en,rights ? i mean yeah most women especially the...,sexist,ideological-inequality
7,EXIST2021,6985,twitter,en,stop regarding women as animals who forget tht...,sexist,ideological-inequality
8,EXIST2021,6986,gab,en,"yeah , it is rough , but not for women . marri...",sexist,objectification
11,EXIST2021,6989,twitter,en,you were publicly harassing a girl by constant...,sexist,misogyny-non-sexual-violence


In [7]:
# Fit the encoder on the task-2 categories of the training split and keep the
# class names (in encoded order) for the classification reports.
label_encoder = LabelEncoder().fit(train['task2'])
target_names = label_encoder.classes_

In [8]:
# Sanity check: run the pre-processing function on one sample tweet and display the result.
text = '@AurelieGuiboud Incredible! AAAA :D Beautiful!But I laughed sooooooo much when I read about you drifting in your wheelchair.I can just picture it  https://t.co/uvl5HhbmbR lol'
preprocessing.preprocess(text)

'incredible ! aaa :D beautiful ! but i laughed sooo much when i read about you drifting in your wheelchair . i can just picture it lol'

## Modelos

### Baseline (tf-idf)

#### English

In [9]:
# Word-level TF-IDF vectorizer for the English tweets
vectorizer_en = TfidfVectorizer(analyzer='word', stop_words=None, lowercase=True)

# Learn vocabulary/IDF on the training tweets and vectorize them in one step,
# then vectorize the test tweets with the fitted vectorizer
X_train_en = vectorizer_en.fit_transform(train_en['text'])
X_test_en = vectorizer_en.transform(test_en['text'])

# Integer-encode the task-2 categories
y_train_en = label_encoder.transform(train_en['task2'])
y_test_en = label_encoder.transform(test_en['task2'])

In [10]:
clf_en = LogisticRegression(max_iter=1000)
%time clf_en.fit(X_train_en, y_train_en)

y_pred_en = clf_en.predict(X_test_en)

print(classification_report(y_test_en, y_pred_en, target_names=target_names))
print(f'F1-score: {round(f1_score(y_test_en, y_pred_en, average="macro"), 4)}')

CPU times: user 2.77 s, sys: 7.58 s, total: 10.3 s
Wall time: 679 ms
                              precision    recall  f1-score   support

      ideological-inequality       0.61      0.76      0.68       333
misogyny-non-sexual-violence       0.59      0.42      0.49       215
             objectification       0.63      0.32      0.42       150
             sexual-violence       0.60      0.61      0.61       198
      stereotyping-dominance       0.49      0.58      0.53       262

                    accuracy                           0.58      1158
                   macro avg       0.58      0.54      0.55      1158
                weighted avg       0.58      0.58      0.57      1158

F1-score: 0.5464


#### Spanish

In [11]:
# Word-level TF-IDF vectorizer for the Spanish tweets
vectorizer_es = TfidfVectorizer(analyzer='word', stop_words=None, lowercase=True)

# Learn vocabulary/IDF on the training tweets and vectorize them in one step,
# then vectorize the test tweets with the fitted vectorizer
X_train_es = vectorizer_es.fit_transform(train_es['text'])
X_test_es = vectorizer_es.transform(test_es['text'])

# Integer-encode the task-2 categories
y_train_es = label_encoder.transform(train_es['task2'])
y_test_es = label_encoder.transform(test_es['task2'])

In [12]:
clf_es = LogisticRegression(max_iter=1000)
%time clf_es.fit(X_train_es, y_train_es)

y_pred_es = clf_es.predict(X_test_es)

print(classification_report(y_test_es, y_pred_es, target_names=target_names))
print(f'F1-score: {round(f1_score(y_test_es, y_pred_es, average="macro"), 4)}')

CPU times: user 2.69 s, sys: 7.01 s, total: 9.7 s
Wall time: 707 ms
                              precision    recall  f1-score   support

      ideological-inequality       0.56      0.85      0.68       288
misogyny-non-sexual-violence       0.54      0.52      0.53       257
             objectification       0.65      0.33      0.44       174
             sexual-violence       0.92      0.24      0.38       202
      stereotyping-dominance       0.43      0.63      0.51       202

                    accuracy                           0.55      1123
                   macro avg       0.62      0.52      0.51      1123
                weighted avg       0.61      0.55      0.52      1123

F1-score: 0.5077


#### Total

In [13]:
# Pool the English and Spanish gold labels and predictions (same order in both)
y_test = np.concatenate([y_test_en, y_test_es])
y_pred = np.concatenate([y_pred_en, y_pred_es])

report_total = classification_report(y_test, y_pred, target_names=target_names)
print(report_total)
macro_f1 = round(f1_score(y_test, y_pred, average="macro"), 4)
print(f'F1-score: {macro_f1}')

                              precision    recall  f1-score   support

      ideological-inequality       0.58      0.80      0.68       621
misogyny-non-sexual-violence       0.56      0.47      0.51       472
             objectification       0.64      0.33      0.43       324
             sexual-violence       0.67      0.42      0.52       400
      stereotyping-dominance       0.46      0.61      0.52       464

                    accuracy                           0.56      2281
                   macro avg       0.58      0.53      0.53      2281
                weighted avg       0.58      0.56      0.55      2281

F1-score: 0.533


### Promedio de vectores de palabras con GloVe

Para los embeddings en inglés usaré los que están pre-entrenados con el corpus de Twitter, pues los datos de esta tarea también son de Twitter.

Debido a que Torchtext sólo tiene por defecto embeddings en inglés hay que hacer otras cosas para cargar los que están en español. Primero, hay que descargarlos de [GloVe Spanish](http://dcc.uchile.cl/~jperez/word-embeddings/glove-sbwc.i25.vec.gz) y ponerlos en la carpeta `.vector_cache` del directorio actual. Para ver otros embeddings pre-entrenados en español ver [spanish-word-embeddings](https://github.com/dccuchile/spanish-word-embeddings).

Ver lo siguiente para algunos detalles de cómo cargarlos para la capa de embeddings:

- [Use pretrained embedding in Spanish with Torchtext](https://stackoverflow.com/questions/52224555/use-pretrained-embedding-in-spanish-with-torchtext)
- [Handling German Text with torchtext](https://www.innoq.com/en/blog/handling-german-text-with-torchtext/)

In [None]:
# Fetch the Spanish GloVe vectors into .vector_cache, then decompress the archive there.
!wget http://dcc.uchile.cl/~jperez/word-embeddings/glove-sbwc.i25.vec.gz -P .vector_cache
!gunzip .vector_cache/glove-sbwc.i25.vec.gz

In [14]:
def mean_vector(text, text_field, vocab):
    """
    Average the word vectors of a text.

    Parameters
    ----------
    text : str
        Raw text; it is tokenized with ``text_field.preprocess``.
    text_field : object
        Must expose ``preprocess(text)`` returning a sequence of tokens.
    vocab : object
        Must support ``vocab[token]`` (token -> row index) and expose
        ``vocab.vectors``, a 2-D tensor holding one embedding per row.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vocab.vectors.shape[1]`` with the element-wise
        mean of the token vectors. A text that produces no tokens maps to the
        zero vector instead of NaN.
    """
    tokens = text_field.preprocess(text)
    if len(tokens) == 0:
        # np.mean over an empty array returns NaN (with a RuntimeWarning), which
        # would end up in the feature matrix handed to the classifier.
        return np.zeros(vocab.vectors.shape[1])
    vectors = np.array([vocab.vectors[vocab[token]].numpy() for token in tokens])
    return np.mean(vectors, axis=0)

#### English

In [15]:
# torchtext Field describing how English text is tokenized and lower-cased
text_field_en = Field(tokenize=tokenize_en, lower=True)

# Tokenize every tweet up front
preprocessed_train_text_en = train_en['text'].apply(text_field_en.preprocess)
preprocessed_test_text_en = test_en['text'].apply(text_field_en.preprocess)

# Build the vocabulary from the training tokens and attach the GloVe Twitter vectors
text_field_en.build_vocab(
    preprocessed_train_text_en,
    vectors='glove.twitter.27B.200d',
    vectors_cache='.vector_cache',
)
vocab_en = text_field_en.vocab
vocab_en.freqs.most_common(10)

[('.', 1812),
 ('a', 1275),
 ('the', 1206),
 ('to', 1190),
 (',', 1174),
 ('you', 1076),
 ('i', 995),
 ('and', 964),
 ('#', 855),
 ('’', 745)]

In [16]:
# One mean GloVe vector per English training tweet. The matrix width is read
# from the loaded vectors instead of a hard-coded 200, so the cell keeps
# working if a different embedding size is loaded.
X_train_en = np.zeros(shape=(train_en.shape[0], vocab_en.vectors.shape[1]))

for i, text in tqdm(enumerate(train_en['text']), total=train_en.shape[0]):
    X_train_en[i, :] = mean_vector(text, text_field_en, vocab_en)

y_train_en = label_encoder.transform(train_en['task2'])

100%|██████████| 1636/1636 [00:00<00:00, 6451.85it/s]


In [17]:
# One mean GloVe vector per English test tweet. The matrix width is read from
# the loaded vectors instead of a hard-coded 200, so the cell keeps working if
# a different embedding size is loaded.
X_test_en = np.zeros(shape=(test_en.shape[0], vocab_en.vectors.shape[1]))

for i, text in tqdm(enumerate(test_en['text']), total=test_en.shape[0]):
    X_test_en[i, :] = mean_vector(text, text_field_en, vocab_en)

y_test_en = label_encoder.transform(test_en['task2'])

100%|██████████| 1158/1158 [00:00<00:00, 5692.53it/s]


In [18]:
clf_en = LogisticRegression(max_iter=1000)
%time clf_en.fit(X_train_en, y_train_en)

y_pred_en = clf_en.predict(X_test_en)

print(classification_report(y_test_en, y_pred_en, target_names=target_names))
print(f'F1-score: {round(f1_score(y_test_en, y_pred_en, average="macro"), 4)}')

CPU times: user 1.75 s, sys: 3.35 s, total: 5.09 s
Wall time: 346 ms
                              precision    recall  f1-score   support

      ideological-inequality       0.58      0.68      0.63       333
misogyny-non-sexual-violence       0.54      0.35      0.42       215
             objectification       0.58      0.46      0.51       150
             sexual-violence       0.57      0.63      0.60       198
      stereotyping-dominance       0.47      0.52      0.50       262

                    accuracy                           0.55      1158
                   macro avg       0.55      0.53      0.53      1158
                weighted avg       0.55      0.55      0.54      1158

F1-score: 0.5319


#### Spanish

In [19]:
# Load the Spanish GloVe vectors from the 'glove-sbwc.i25.vec' file, using .vector_cache as cache directory.
vectors_es = Vectors('glove-sbwc.i25.vec', cache='.vector_cache')

In [20]:
# torchtext Field describing how Spanish text is tokenized and lower-cased
text_field_es = Field(tokenize=tokenize_es, lower=True)

# Tokenize every tweet up front
preprocessed_train_text_es = train_es['text'].apply(text_field_es.preprocess)
preprocessed_test_text_es = test_es['text'].apply(text_field_es.preprocess)

# Build the vocabulary from the training tokens and attach the Spanish GloVe vectors
text_field_es.build_vocab(
    preprocessed_train_text_es,
    vectors=vectors_es,
)
vocab_es = text_field_es.vocab
vocab_es.freqs.most_common(10)

[(',', 2169),
 ('que', 2064),
 ('de', 1732),
 ('.', 1412),
 ('la', 1298),
 ('a', 1253),
 ('y', 1227),
 ('no', 1095),
 ('el', 927),
 ('las', 851)]

In [21]:
# One mean GloVe vector per Spanish training tweet. The matrix width is read
# from the loaded vectors instead of a hard-coded 300, so the cell keeps
# working if a different embedding size is loaded.
X_train_es = np.zeros(shape=(train_es.shape[0], vocab_es.vectors.shape[1]))

for i, text in tqdm(enumerate(train_es['text']), total=train_es.shape[0]):
    X_train_es[i, :] = mean_vector(text, text_field_es, vocab_es)

y_train_es = label_encoder.transform(train_es['task2'])

100%|██████████| 1741/1741 [00:00<00:00, 7682.87it/s]


In [22]:
# NOTE(review): this cell repeats the previous cell (In [21]) statement for statement;
# it recomputes the same X_train_es / y_train_es and looks safe to delete.
X_train_es = np.zeros(shape=(train_es.shape[0], 300))

for i, text in tqdm(enumerate(train_es['text']), total=train_es.shape[0]):
    X_train_es[i, :] = mean_vector(text, text_field_es, vocab_es)
    
y_train_es = label_encoder.transform(train_es['task2'])

100%|██████████| 1741/1741 [00:00<00:00, 7768.35it/s]


In [23]:
# One mean GloVe vector per Spanish test tweet. The matrix width is read from
# the loaded vectors instead of a hard-coded 300, so the cell keeps working if
# a different embedding size is loaded.
X_test_es = np.zeros(shape=(test_es.shape[0], vocab_es.vectors.shape[1]))

for i, text in tqdm(enumerate(test_es['text']), total=test_es.shape[0]):
    X_test_es[i, :] = mean_vector(text, text_field_es, vocab_es)

y_test_es = label_encoder.transform(test_es['task2'])

100%|██████████| 1123/1123 [00:00<00:00, 7192.67it/s]


In [24]:
clf_es = LogisticRegression(max_iter=1000)
%time clf_es.fit(X_train_es, y_train_es)

y_pred_es = clf_es.predict(X_test_es)

print(classification_report(y_test_es, y_pred_es))
print(f'F1-score: {round(f1_score(y_test_es, y_pred_es, average="macro"), 4)}')

CPU times: user 2.1 s, sys: 3.73 s, total: 5.82 s
Wall time: 378 ms
              precision    recall  f1-score   support

           0       0.55      0.81      0.65       288
           1       0.47      0.52      0.49       257
           2       0.60      0.32      0.41       174
           3       0.78      0.23      0.35       202
           4       0.44      0.57      0.50       202

    accuracy                           0.52      1123
   macro avg       0.57      0.49      0.48      1123
weighted avg       0.56      0.52      0.50      1123

F1-score: 0.4818


#### Total

In [25]:
# Pool the English and Spanish gold labels and predictions (same order in both)
y_test = np.concatenate([y_test_en, y_test_es])
y_pred = np.concatenate([y_pred_en, y_pred_es])

report_total = classification_report(y_test, y_pred, target_names=target_names)
print(report_total)
macro_f1 = round(f1_score(y_test, y_pred, average="macro"), 4)
print(f'F1-score: {macro_f1}')

                              precision    recall  f1-score   support

      ideological-inequality       0.56      0.74      0.64       621
misogyny-non-sexual-violence       0.50      0.44      0.47       472
             objectification       0.59      0.38      0.46       324
             sexual-violence       0.61      0.42      0.50       400
      stereotyping-dominance       0.46      0.54      0.50       464

                    accuracy                           0.53      2281
                   macro avg       0.54      0.51      0.51      2281
                weighted avg       0.54      0.53      0.53      2281

F1-score: 0.5139


### Doc2Vec

#### English

Al igual que los vectores usados en GloVe, haré que los embeddings de Doc2Vec tengan dimensión igual a 200.

In [26]:
documents_en = [TaggedDocument(doc, [i]) for doc, i in zip(preprocessed_train_text_en, train_en['task2'])]
%time doc2vec_en = Doc2Vec(documents_en, vector_size=200, window=5, min_count=1, workers=4, seed=42, epochs=50)

CPU times: user 8.54 s, sys: 819 ms, total: 9.36 s
Wall time: 5.19 s


In [27]:
# One inferred Doc2Vec vector per English training tweet. The width is taken
# from the trained model, and the token lists computed earlier are reused
# instead of tokenizing every tweet a second time.
X_train_en = np.zeros(shape=(train_en.shape[0], doc2vec_en.vector_size))

for i, tokens in tqdm(enumerate(preprocessed_train_text_en), total=train_en.shape[0]):
    X_train_en[i, :] = doc2vec_en.infer_vector(tokens)

y_train_en = label_encoder.transform(train_en['task2'])

100%|██████████| 1636/1636 [00:03<00:00, 437.41it/s]


In [28]:
# One inferred Doc2Vec vector per English test tweet. The width is taken from
# the trained model, and the token lists computed earlier are reused instead of
# tokenizing every tweet a second time.
X_test_en = np.zeros(shape=(test_en.shape[0], doc2vec_en.vector_size))

for i, tokens in tqdm(enumerate(preprocessed_test_text_en), total=test_en.shape[0]):
    X_test_en[i, :] = doc2vec_en.infer_vector(tokens)

y_test_en = label_encoder.transform(test_en['task2'])

100%|██████████| 1158/1158 [00:02<00:00, 452.84it/s]


In [29]:
clf_en = LogisticRegression(max_iter=1000)
%time clf_en.fit(X_train_en, y_train_en)

y_pred_en = clf_en.predict(X_test_en)

print(classification_report(y_test_en, y_pred_en, target_names=target_names))
print(f'F1-score: {round(f1_score(y_test_en, y_pred_en, average="macro"), 4)}')

CPU times: user 624 ms, sys: 1.04 s, total: 1.67 s
Wall time: 114 ms
                              precision    recall  f1-score   support

      ideological-inequality       0.54      0.45      0.49       333
misogyny-non-sexual-violence       0.35      0.39      0.37       215
             objectification       0.29      0.28      0.29       150
             sexual-violence       0.38      0.38      0.38       198
      stereotyping-dominance       0.37      0.42      0.40       262

                    accuracy                           0.40      1158
                   macro avg       0.39      0.38      0.38      1158
                weighted avg       0.41      0.40      0.40      1158

F1-score: 0.3842


#### Spanish
Al igual que los vectores usados en GloVe, haré que los embeddings de Doc2Vec tengan dimensión igual a 300.

In [30]:
documents_es = [TaggedDocument(doc, [i]) for doc, i in zip(preprocessed_train_text_es, train_es['task2'])]
%time doc2vec_es = Doc2Vec(documents_es, vector_size=300, window=5, min_count=1, workers=4, seed=42, epochs=50)

CPU times: user 9.67 s, sys: 1.48 s, total: 11.1 s
Wall time: 5.66 s


In [31]:
# One inferred Doc2Vec vector per Spanish training tweet. The width is taken
# from the trained model, and the token lists computed earlier are reused
# instead of tokenizing every tweet a second time.
X_train_es = np.zeros(shape=(train_es.shape[0], doc2vec_es.vector_size))

for i, tokens in tqdm(enumerate(preprocessed_train_text_es), total=train_es.shape[0]):
    X_train_es[i, :] = doc2vec_es.infer_vector(tokens)

y_train_es = label_encoder.transform(train_es['task2'])

100%|██████████| 1741/1741 [00:03<00:00, 438.54it/s]


In [32]:
# One inferred Doc2Vec vector per Spanish test tweet. The width is taken from
# the trained model, and the token lists computed earlier are reused instead of
# tokenizing every tweet a second time.
X_test_es = np.zeros(shape=(test_es.shape[0], doc2vec_es.vector_size))

for i, tokens in tqdm(enumerate(preprocessed_test_text_es), total=test_es.shape[0]):
    X_test_es[i, :] = doc2vec_es.infer_vector(tokens)

y_test_es = label_encoder.transform(test_es['task2'])

100%|██████████| 1123/1123 [00:02<00:00, 475.47it/s]


In [33]:
clf_es = LogisticRegression(max_iter=1000)
%time clf_es.fit(X_train_es, y_train_es)

y_pred_es = clf_es.predict(X_test_es)

print(classification_report(y_test_es, y_pred_es, target_names=target_names))
print(f'F1-score: {round(f1_score(y_test_es, y_pred_es, average="macro"), 4)}')

CPU times: user 735 ms, sys: 1.28 s, total: 2.02 s
Wall time: 134 ms
                              precision    recall  f1-score   support

      ideological-inequality       0.47      0.47      0.47       288
misogyny-non-sexual-violence       0.34      0.42      0.38       257
             objectification       0.32      0.37      0.35       174
             sexual-violence       0.47      0.24      0.32       202
      stereotyping-dominance       0.31      0.34      0.32       202

                    accuracy                           0.38      1123
                   macro avg       0.38      0.37      0.37      1123
                weighted avg       0.39      0.38      0.37      1123

F1-score: 0.3658


#### Total

In [34]:
# Pool the English and Spanish gold labels and predictions (same order in both)
y_test = np.concatenate([y_test_en, y_test_es])
y_pred = np.concatenate([y_pred_en, y_pred_es])

report_total = classification_report(y_test, y_pred, target_names=target_names)
print(report_total)
macro_f1 = round(f1_score(y_test, y_pred, average="macro"), 4)
print(f'F1-score: {macro_f1}')

                              precision    recall  f1-score   support

      ideological-inequality       0.51      0.46      0.48       621
misogyny-non-sexual-violence       0.34      0.41      0.37       472
             objectification       0.31      0.33      0.32       324
             sexual-violence       0.41      0.31      0.35       400
      stereotyping-dominance       0.34      0.39      0.36       464

                    accuracy                           0.39      2281
                   macro avg       0.38      0.38      0.38      2281
                weighted avg       0.40      0.39      0.39      2281

F1-score: 0.3777


### sentence-BERT

En [Pretrained Models](https://www.sbert.net/docs/pretrained_models.html) están todos los modelos preentrenados disponibles. Un modelo fuerte en una tarea puede ser débil en otra, por lo que es importante seleccionar el modelo adecuado para cada tarea. Como no hay ninguno específico para esta tarea y los datos están en inglés y en español, usaré **distiluse-base-multilingual-cased-v1**, un modelo multilingüe que cubre ambos idiomas.

**distiluse-base-multilingual-cased-v1:** Multilingual knowledge distilled version of multilingual Universal Sentence Encoder. Supports 15 languages: Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Portuguese, Russian, Spanish, Turkish.

In [35]:
# Load the pre-trained sentence-transformers model by name.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

#### English

In [36]:
# Re-join each token list into one whitespace-separated string for model.encode
sentences_train_en = list(map(' '.join, preprocessed_train_text_en))
sentences_test_en = list(map(' '.join, preprocessed_test_text_en))

In [37]:
# Encode the English sentences with the sentence-transformers model.
# NOTE(review): the original comment stated 768 dimensions per embedding; the
# sentence-transformers docs list distiluse-base-multilingual-cased-v1 as
# 512-dimensional — confirm with X_train_en.shape[1].
%time X_train_en = model.encode(sentences_train_en, show_progress_bar=True)
%time X_test_en = model.encode(sentences_test_en, show_progress_bar=True)

y_train_en = label_encoder.transform(train_en['task2'])
y_test_en = label_encoder.transform(test_en['task2'])

Batches:   0%|          | 0/52 [00:00<?, ?it/s]

CPU times: user 3.37 s, sys: 442 ms, total: 3.81 s
Wall time: 3.29 s


Batches:   0%|          | 0/37 [00:00<?, ?it/s]

CPU times: user 1.51 s, sys: 152 ms, total: 1.66 s
Wall time: 1.25 s


In [38]:
clf_en = LogisticRegression(max_iter=1000)
%time clf_en.fit(X_train_en, y_train_en)

y_pred_en = clf_en.predict(X_test_en)

print(classification_report(y_test_en, y_pred_en, target_names=target_names))
print(f'F1-score: {round(f1_score(y_test_en, y_pred_en, average="macro"), 4)}')

CPU times: user 671 ms, sys: 1.13 s, total: 1.8 s
Wall time: 123 ms
                              precision    recall  f1-score   support

      ideological-inequality       0.68      0.69      0.69       333
misogyny-non-sexual-violence       0.54      0.44      0.48       215
             objectification       0.66      0.49      0.56       150
             sexual-violence       0.61      0.68      0.64       198
      stereotyping-dominance       0.57      0.69      0.62       262

                    accuracy                           0.61      1158
                   macro avg       0.61      0.60      0.60      1158
                weighted avg       0.62      0.61      0.61      1158

F1-score: 0.5999


#### Spanish

In [39]:
# Re-join each token list into one whitespace-separated string for model.encode
sentences_train_es = list(map(' '.join, preprocessed_train_text_es))
sentences_test_es = list(map(' '.join, preprocessed_test_text_es))

In [40]:
# Encode the Spanish sentences with the sentence-transformers model.
# NOTE(review): the original comment stated 768 dimensions per embedding; the
# sentence-transformers docs list distiluse-base-multilingual-cased-v1 as
# 512-dimensional — confirm with X_train_es.shape[1].
%time X_train_es = model.encode(sentences_train_es, show_progress_bar=True)
%time X_test_es = model.encode(sentences_test_es, show_progress_bar=True)

y_train_es = label_encoder.transform(train_es['task2'])
y_test_es = label_encoder.transform(test_es['task2'])

Batches:   0%|          | 0/55 [00:00<?, ?it/s]

CPU times: user 2.15 s, sys: 99.9 ms, total: 2.25 s
Wall time: 1.68 s


Batches:   0%|          | 0/36 [00:00<?, ?it/s]

CPU times: user 1.44 s, sys: 56.8 ms, total: 1.5 s
Wall time: 1.12 s


In [41]:
clf_es = LogisticRegression(max_iter=1000)
%time clf_es.fit(X_train_es, y_train_es)

y_pred_es = clf_es.predict(X_test_es)

print(classification_report(y_test_es, y_pred_es, target_names=target_names))
print(f'F1-score: {round(f1_score(y_test_es, y_pred_es, average="macro"), 4)}')

CPU times: user 769 ms, sys: 1.57 s, total: 2.34 s
Wall time: 156 ms
                              precision    recall  f1-score   support

      ideological-inequality       0.68      0.84      0.75       288
misogyny-non-sexual-violence       0.52      0.60      0.56       257
             objectification       0.70      0.48      0.57       174
             sexual-violence       0.87      0.49      0.63       202
      stereotyping-dominance       0.51      0.60      0.55       202

                    accuracy                           0.62      1123
                   macro avg       0.66      0.60      0.61      1123
                weighted avg       0.65      0.62      0.62      1123

F1-score: 0.6108


#### Total

In [42]:
# Pool the English and Spanish gold labels and predictions (same order in both)
y_test = np.concatenate([y_test_en, y_test_es])
y_pred = np.concatenate([y_pred_en, y_pred_es])

report_total = classification_report(y_test, y_pred, target_names=target_names)
print(report_total)
macro_f1 = round(f1_score(y_test, y_pred, average="macro"), 4)
print(f'F1-score: {macro_f1}')

                              precision    recall  f1-score   support

      ideological-inequality       0.68      0.76      0.72       621
misogyny-non-sexual-violence       0.53      0.53      0.53       472
             objectification       0.68      0.48      0.57       324
             sexual-violence       0.70      0.58      0.63       400
      stereotyping-dominance       0.55      0.65      0.59       464

                    accuracy                           0.62      2281
                   macro avg       0.63      0.60      0.61      2281
                weighted avg       0.62      0.62      0.62      2281

F1-score: 0.6076


## Resultados

De los distintos modelos el mejor parece ser el de BERT que ha sido entrenado con varios idiomas.

<style type="text/css">
.tg  {border-collapse:collapse;border-spacing:0;}
.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  overflow:hidden;padding:10px 5px;word-break:normal;}
.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
.tg .tg-baqh{text-align:center;vertical-align:top}
.tg .tg-amwm{font-weight:bold;text-align:center;vertical-align:top}
</style>
<table class="tg">
<thead>
  <tr>
    <th class="tg-amwm" rowspan="2">Model</th>
    <th class="tg-amwm" colspan="3">Task-2 (F1 score)</th>
  </tr>
  <tr>
    <td class="tg-baqh">English</td>
    <td class="tg-baqh">Spanish</td>
    <td class="tg-baqh">Total</td>
  </tr>
</thead>
<tbody>
  <tr>
    <td class="tg-baqh">tf-idf</td>
    <td class="tg-baqh">54.64</td>
    <td class="tg-baqh">50.77</td>
    <td class="tg-baqh">53.30</td>
  </tr>
  <tr>
    <td class="tg-baqh">GloVe</td>
    <td class="tg-baqh">53.19</td>
    <td class="tg-baqh">48.18</td>
    <td class="tg-baqh">51.39</td>
  </tr>
  <tr>
    <td class="tg-baqh">Doc2Vec</td>
    <td class="tg-baqh">38.42</td>
    <td class="tg-baqh">36.58</td>
    <td class="tg-baqh">37.77</td>
  </tr>
  <tr>
    <td class="tg-baqh">sentence-BERT</td>
    <td class="tg-baqh">59.99</td>
    <td class="tg-baqh">61.08</td>
    <td class="tg-baqh">60.76</td>
  </tr>
</tbody>
</table>
<!-- - Al hacer la clasificación con el spell-check se obtiene:

<style type="text/css">
.tg  {border-collapse:collapse;border-spacing:0;}
.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  overflow:hidden;padding:10px 5px;word-break:normal;}
.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
.tg .tg-baqh{text-align:center;vertical-align:top}
.tg .tg-amwm{font-weight:bold;text-align:center;vertical-align:top}
</style>
<table class="tg">
<thead>
  <tr>
    <th class="tg-amwm" rowspan="2">Model</th>
    <th class="tg-amwm" colspan="3">Task-2 (F1 score)</th>
  </tr>
  <tr>
    <td class="tg-baqh">English</td>
    <td class="tg-baqh">Spanish</td>
    <td class="tg-baqh">Total</td>
  </tr>
</thead>
<tbody>
  <tr>
    <td class="tg-baqh">tf-idf</td>
    <td class="tg-baqh">35.80</td>
    <td class="tg-baqh">34.92</td>
    <td class="tg-baqh">35.89</td>
  </tr>
  <tr>
    <td class="tg-baqh">GloVe</td>
    <td class="tg-baqh">35.39</td>
    <td class="tg-baqh">34.46</td>
    <td class="tg-baqh">35.45</td>
  </tr>
  <tr>
    <td class="tg-baqh">Doc2Vec</td>
    <td class="tg-baqh">29.96</td>
    <td class="tg-baqh">28.43</td>
    <td class="tg-baqh">29.46</td>
  </tr>
  <tr>
    <td class="tg-baqh">sentence-BERT</td>
    <td class="tg-baqh">46.09</td>
    <td class="tg-baqh">42.10</td>
    <td class="tg-baqh">44.72</td>
  </tr>
</tbody>
</table> -->

- En realidad parece empeorar un poco con el spell-check, además de que es algo tardado hacerlo, por lo que mejor no usaré el spell-check. 
- Por otro lado, hacer el pre-procesamiento parece ayudar un poco.