# Modelos básicos

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
import seaborn as sns

import re
from nltk.tokenize import TweetTokenizer
import spacy
from utils import preprocessing

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

from torchtext.data import Field
from torchtext.vocab import GloVe, Vectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sentence_transformers import SentenceTransformer

Ver [Trained Models & Pipelines](https://spacy.io/models) para los modelos de SpaCy.

In [2]:
# spaCy pipelines for both languages; the helpers below only call their tokenizers
en = spacy.load('en_core_web_md')
es = spacy.load('es_core_news_md')


def tokenize_en(sentence):
    """Return the token texts produced by the English spaCy tokenizer for `sentence`."""
    tokens = en.tokenizer(sentence)
    return [token.text for token in tokens]


def tokenize_es(sentence):
    """Return the token texts produced by the Spanish spaCy tokenizer for `sentence`."""
    tokens = es.tokenizer(sentence)
    return [token.text for token in tokens]

In [3]:
# Download the embeddings (GloVe 'twitter.27B', 200 dimensions)
GloVe(name='twitter.27B', dim=200)

<torchtext.vocab.GloVe at 0x7f742c378dd8>

## Datos

In [4]:
# Training split (tab-separated); the commented line is the spell-checked alternative
train = pd.read_csv('../../Data/EXIST2021_training.tsv', sep='\t')
# train = pd.read_csv('../../Data/EXIST2021_training_spell_checked.csv', sep=',')

# Simple pre-processing of the tweet text
train['text'] = train['text'].apply(preprocessing.preprocess)

# One frame per language
train_en = train.loc[train['language'].eq('en')]
train_es = train.loc[train['language'].eq('es')]

train.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2
0,EXIST2021,1,twitter,en,"she calls herself "" anti - feminazi "" how abou...",sexist,ideological-inequality
1,EXIST2021,2,twitter,en,"now , back to these women , the brave and the ...",non-sexist,non-sexist
2,EXIST2021,3,twitter,en,"@ curvybandida @ lynne _ i wow , your skirt is...",sexist,objectification
3,EXIST2021,4,twitter,en,@ aurelieguiboud incredible ! beautiful ! but ...,non-sexist,non-sexist
4,EXIST2021,5,twitter,en,i find it extremely hard to believe that kelly...,non-sexist,non-sexist


In [5]:
# Test split (tab-separated); the commented line is the spell-checked alternative
test = pd.read_csv('../../Data/EXIST2021_test.tsv', sep='\t')
# test = pd.read_csv('../../Data/EXIST2021_test_spell_checked.csv', sep=',')

# Simple pre-processing of the tweet text
test['text'] = test['text'].apply(preprocessing.preprocess)

# One frame per language
test_en = test.loc[test['language'].eq('en')]
test_es = test.loc[test['language'].eq('es')]

test.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2
0,EXIST2021,6978,gab,en,pennsylvania state rep horrifies with opening ...,non-sexist,non-sexist
1,EXIST2021,6979,twitter,en,"@ iilovegrapes he sounds like as ass , and ver...",non-sexist,non-sexist
2,EXIST2021,6980,twitter,en,"@ averyangryskel 1 @ 4arealistparty lol ! "" th...",sexist,ideological-inequality
3,EXIST2021,6981,twitter,en,@ wanderorange @ stalliontwink rights ? i mean...,sexist,ideological-inequality
4,EXIST2021,6982,twitter,en,the jack manifold appreciation i'm seeing is o...,non-sexist,non-sexist


In [6]:
# Fit the encoder on the task1 labels and display the class order behind the integer codes
label_encoder = LabelEncoder().fit(train['task1'])
label_encoder.classes_

array(['non-sexist', 'sexist'], dtype=object)

In [7]:
# Show what the preprocessing helper does to one raw sample tweet
text = '@AurelieGuiboud Incredible! AAAA :D Beautiful!But I laughed sooooooo much when I read about you drifting in your wheelchair.I can just picture it  https://t.co/uvl5HhbmbR lol'
preprocessing.preprocess(text)

'incredible ! aaa :D beautiful ! but i laughed sooo much when i read about you drifting in your wheelchair . i can just picture it lol'

## Modelos

### Baseline (tf-idf)

#### English

In [8]:
# Word-level tf-idf vectorizer, fitted on the English training tweets only
vectorizer_en = TfidfVectorizer(analyzer='word', stop_words=None, lowercase=True)

# Learn the vocabulary on train and vectorize both splits
X_train_en = vectorizer_en.fit_transform(train_en['text'])
X_test_en = vectorizer_en.transform(test_en['text'])

# Integer-encode the task1 labels of both splits
y_train_en, y_test_en = (label_encoder.transform(split['task1']) for split in (train_en, test_en))

In [9]:
clf_en = LogisticRegression(max_iter=1000)
%time clf_en.fit(X_train_en, y_train_en)

y_pred_en = clf_en.predict(X_test_en)

print(classification_report(y_test_en, y_pred_en, target_names=['non-sexist', 'sexist']))
print(f'Accuracy: {round(accuracy_score(y_test_en, y_pred_en), 4)}')

CPU times: user 720 ms, sys: 2.11 s, total: 2.83 s
Wall time: 190 ms
              precision    recall  f1-score   support

  non-sexist       0.67      0.74      0.70      1050
      sexist       0.74      0.66      0.70      1158

    accuracy                           0.70      2208
   macro avg       0.70      0.70      0.70      2208
weighted avg       0.71      0.70      0.70      2208

Accuracy: 0.7011


#### Spanish

In [10]:
# Word-level tf-idf vectorizer, fitted on the Spanish training tweets only
vectorizer_es = TfidfVectorizer(analyzer='word', stop_words=None, lowercase=True)

# Learn the vocabulary on train and vectorize both splits
X_train_es = vectorizer_es.fit_transform(train_es['text'])
X_test_es = vectorizer_es.transform(test_es['text'])

# Integer-encode the task1 labels of both splits
y_train_es, y_test_es = (label_encoder.transform(split['task1']) for split in (train_es, test_es))

In [11]:
clf_es = LogisticRegression(max_iter=1000)
%time clf_es.fit(X_train_es, y_train_es)

y_pred_es = clf_es.predict(X_test_es)

print(classification_report(y_test_es, y_pred_es, target_names=['non-sexist', 'sexist']))
print(f'Accuracy: {round(accuracy_score(y_test_es, y_pred_es), 4)}')

CPU times: user 482 ms, sys: 1.1 s, total: 1.59 s
Wall time: 113 ms
              precision    recall  f1-score   support

  non-sexist       0.66      0.80      0.72      1037
      sexist       0.77      0.62      0.69      1123

    accuracy                           0.71      2160
   macro avg       0.72      0.71      0.70      2160
weighted avg       0.72      0.71      0.70      2160

Accuracy: 0.706


#### Total

In [12]:
# Pool the English and Spanish labels/predictions (English first) for the overall tf-idf score
y_test = np.concatenate([y_test_en, y_test_es])
y_pred = np.concatenate([y_pred_en, y_pred_es])

print(classification_report(y_test, y_pred, target_names=['non-sexist', 'sexist']))
print('Accuracy:', round(accuracy_score(y_test, y_pred), 4))

              precision    recall  f1-score   support

  non-sexist       0.66      0.77      0.71      2087
      sexist       0.75      0.64      0.69      2281

    accuracy                           0.70      4368
   macro avg       0.71      0.71      0.70      4368
weighted avg       0.71      0.70      0.70      4368

Accuracy: 0.7035


### Promedio de vectores de palabras con GloVe

Para los embeddings en inglés usaré los que están pre-entrenados con el corpus de Twitter, pues los datos de esta tarea también son de Twitter.

Debido a que Torchtext sólo tiene por defecto embeddings en inglés hay que hacer otras cosas para cargar los que están en español. Primero, hay que descargarlos de [GloVe Spanish](http://dcc.uchile.cl/~jperez/word-embeddings/glove-sbwc.i25.vec.gz) y ponerlos en la carpeta `.vector_cache` del directorio actual. Para ver otros embeddings pre-entrenados en español ver [spanish-word-embeddings](https://github.com/dccuchile/spanish-word-embeddings).

Ver lo siguiente para algunos detalles de cómo cargarlos para la capa de embeddings:

- [Use pretrained embedding in Spanish with Torchtext](https://stackoverflow.com/questions/52224555/use-pretrained-embedding-in-spanish-with-torchtext)
- [Handling German Text with torchtext](https://www.innoq.com/en/blog/handling-german-text-with-torchtext/)

In [13]:
def mean_vector(text, text_field, vocab):
    """
    Average the word vectors of a text.

    Parameters
    ----------
    text : str
        Text to embed; it is tokenised with ``text_field.preprocess``.
    text_field : object
        Object exposing ``preprocess(text)``, which returns the list of tokens.
    vocab : object
        Vocabulary that maps a token to a row index through ``vocab[token]`` and
        holds the embedding matrix as a 2-D tensor in ``vocab.vectors``.

    Returns
    -------
    numpy.ndarray
        1-D array with ``vocab.vectors.shape[1]`` entries: the element-wise mean
        of the token vectors. A text that yields no tokens returns an all-zero
        vector.
    """
    tokens = text_field.preprocess(text)
    if not tokens:
        # np.mean over an empty array returns NaN (with a RuntimeWarning); that NaN would be
        # broadcast over the whole feature row, so return a neutral zero vector instead.
        return np.zeros(vocab.vectors.shape[1], dtype=vocab.vectors.numpy().dtype)

    # Look up all rows in a single indexing operation, then average over the token axis
    indices = [vocab[token] for token in tokens]
    return np.mean(vocab.vectors[indices].numpy(), axis=0)

#### English

In [14]:
# Field describing how English text is tokenised and lower-cased
text_field_en = Field(tokenize=tokenize_en, lower=True)

# Tokenise both splits once; the token lists are reused by later sections
preprocessed_train_text_en = train_en['text'].apply(text_field_en.preprocess)
preprocessed_test_text_en = test_en['text'].apply(text_field_en.preprocess)

# Build the vocabulary from the training tokens and attach the GloVe Twitter vectors
text_field_en.build_vocab(
    preprocessed_train_text_en,
    vectors='glove.twitter.27B.200d',
    vectors_cache='.vector_cache',
)
vocab_en = text_field_en.vocab
vocab_en.freqs.most_common(10)



[('.', 6696),
 ('i', 5017),
 ('@', 4287),
 ('/', 3123),
 ('the', 2767),
 ('to', 2541),
 (',', 2518),
 ('a', 2438),
 ('and', 2033),
 ('#', 2022)]

In [15]:
# One 200-dimensional mean-GloVe row per English training tweet
X_train_en = np.zeros((len(train_en), 200))

for row, tweet in enumerate(tqdm(train_en['text'], total=len(train_en))):
    X_train_en[row] = mean_vector(tweet, text_field_en, vocab_en)

y_train_en = label_encoder.transform(train_en['task1'])

100%|██████████| 3436/3436 [00:00<00:00, 5866.75it/s]


In [16]:
# One 200-dimensional mean-GloVe row per English test tweet
X_test_en = np.zeros((len(test_en), 200))

for row, tweet in enumerate(tqdm(test_en['text'], total=len(test_en))):
    X_test_en[row] = mean_vector(tweet, text_field_en, vocab_en)

y_test_en = label_encoder.transform(test_en['task1'])

100%|██████████| 2208/2208 [00:00<00:00, 5375.46it/s]


In [17]:
clf_en = LogisticRegression(max_iter=1000)
%time clf_en.fit(X_train_en, y_train_en)

y_pred_en = clf_en.predict(X_test_en)

print(classification_report(y_test_en, y_pred_en, target_names=['non-sexist', 'sexist']))
print(f'Accuracy: {round(accuracy_score(y_test_en, y_pred_en), 4)}')

CPU times: user 373 ms, sys: 804 ms, total: 1.18 s
Wall time: 81.5 ms
              precision    recall  f1-score   support

  non-sexist       0.63      0.74      0.68      1050
      sexist       0.72      0.60      0.65      1158

    accuracy                           0.67      2208
   macro avg       0.67      0.67      0.67      2208
weighted avg       0.68      0.67      0.67      2208

Accuracy: 0.6685


#### Spanish

In [18]:
# Load the Spanish GloVe vectors file 'glove-sbwc.i25.vec', using '.vector_cache' as the cache directory
vectors_es = Vectors('glove-sbwc.i25.vec', cache='.vector_cache')

In [19]:
# Field describing how Spanish text is tokenised and lower-cased
text_field_es = Field(tokenize=tokenize_es, lower=True)

# Tokenise both splits once; the token lists are reused by later sections
preprocessed_train_text_es = train_es['text'].apply(text_field_es.preprocess)
preprocessed_test_text_es = test_es['text'].apply(text_field_es.preprocess)

# Build the vocabulary from the training tokens and attach the Spanish GloVe vectors
text_field_es.build_vocab(preprocessed_train_text_es, vectors=vectors_es)
vocab_es = text_field_es.vocab
vocab_es.freqs.most_common(10)



[('.', 6021),
 (',', 4472),
 ('que', 4010),
 ('de', 3845),
 ('@', 3629),
 ('/', 3172),
 ('la', 2813),
 ('a', 2683),
 ('y', 2679),
 ('no', 2171)]

In [20]:
# One 300-dimensional mean-GloVe row per Spanish training tweet
X_train_es = np.zeros((len(train_es), 300))

for row, tweet in enumerate(tqdm(train_es['text'], total=len(train_es))):
    X_train_es[row] = mean_vector(tweet, text_field_es, vocab_es)

y_train_es = label_encoder.transform(train_es['task1'])

100%|██████████| 3541/3541 [00:00<00:00, 6674.76it/s]


In [21]:
# The Spanish training features and labels are already built by the preceding cell. This cell
# used to be an exact copy of it and recomputed the same matrix, so it now only validates them.
assert X_train_es.shape == (train_es.shape[0], 300)
assert y_train_es.shape[0] == X_train_es.shape[0]
assert np.isfinite(X_train_es).all(), 'X_train_es contains NaN/inf values'

100%|██████████| 3541/3541 [00:00<00:00, 6725.38it/s]


In [22]:
# One 300-dimensional mean-GloVe row per Spanish test tweet
X_test_es = np.zeros((len(test_es), 300))

for row, tweet in enumerate(tqdm(test_es['text'], total=len(test_es))):
    X_test_es[row] = mean_vector(tweet, text_field_es, vocab_es)

y_test_es = label_encoder.transform(test_es['task1'])

100%|██████████| 2160/2160 [00:00<00:00, 6328.67it/s]


In [23]:
clf_es = LogisticRegression(max_iter=1000)
%time clf_es.fit(X_train_es, y_train_es)

y_pred_es = clf_es.predict(X_test_es)

print(classification_report(y_test_es, y_pred_es))
print(f'Accuracy: {round(accuracy_score(y_test_es, y_pred_es), 4)}')

CPU times: user 570 ms, sys: 1.08 s, total: 1.65 s
Wall time: 111 ms
              precision    recall  f1-score   support

           0       0.63      0.80      0.70      1037
           1       0.75      0.56      0.64      1123

    accuracy                           0.68      2160
   macro avg       0.69      0.68      0.67      2160
weighted avg       0.69      0.68      0.67      2160

Accuracy: 0.6755


#### Total

In [24]:
# Pool the English and Spanish labels/predictions (English first) for the overall GloVe score
y_test = np.concatenate([y_test_en, y_test_es])
y_pred = np.concatenate([y_pred_en, y_pred_es])

print(classification_report(y_test, y_pred, target_names=['non-sexist', 'sexist']))
print('Accuracy:', round(accuracy_score(y_test, y_pred), 4))

              precision    recall  f1-score   support

  non-sexist       0.63      0.77      0.69      2087
      sexist       0.74      0.58      0.65      2281

    accuracy                           0.67      4368
   macro avg       0.68      0.68      0.67      4368
weighted avg       0.68      0.67      0.67      4368

Accuracy: 0.6719


### Doc2Vec

#### English

Al igual que los vectores usados en GloVe, haré que los embeddings de Doc2Vec tengan dimensión igual a 200.

In [25]:
# Pair every tokenised English training tweet with its task1 class label as the document tag.
# NOTE(review): the tag is the class label, not a per-tweet id, so presumably only one document
# vector per class is learned — confirm this is intended.
documents_en = [TaggedDocument(doc, [i]) for doc, i in zip(preprocessed_train_text_en, train_en['task1'])]
# NOTE(review): gensim documents that `seed` only gives reproducible training with a single worker
# thread; with workers=4 results may differ between runs — confirm.
%time doc2vec_en = Doc2Vec(documents_en, vector_size=200, window=5, min_count=1, workers=4, seed=42, epochs=50)

CPU times: user 21.6 s, sys: 1.76 s, total: 23.4 s
Wall time: 11.9 s


In [26]:
# One 200-dimensional inferred Doc2Vec row per English training tweet
X_train_en = np.zeros((len(train_en), 200))

for row, tweet in enumerate(tqdm(train_en['text'], total=len(train_en))):
    X_train_en[row] = doc2vec_en.infer_vector(text_field_en.preprocess(tweet))

y_train_en = label_encoder.transform(train_en['task1'])

100%|██████████| 3436/3436 [00:08<00:00, 389.83it/s]


In [27]:
# One 200-dimensional inferred Doc2Vec row per English test tweet
X_test_en = np.zeros((len(test_en), 200))

for row, tweet in enumerate(tqdm(test_en['text'], total=len(test_en))):
    X_test_en[row] = doc2vec_en.infer_vector(text_field_en.preprocess(tweet))

y_test_en = label_encoder.transform(test_en['task1'])

100%|██████████| 2208/2208 [00:05<00:00, 387.68it/s]


In [28]:
clf_en = LogisticRegression(max_iter=1000)
%time clf_en.fit(X_train_en, y_train_en)

y_pred_en = clf_en.predict(X_test_en)

print(classification_report(y_test_en, y_pred_en))
print(f'Accuracy: {round(accuracy_score(y_test_en, y_pred_en), 4)}')

CPU times: user 173 ms, sys: 225 ms, total: 398 ms
Wall time: 29.9 ms
              precision    recall  f1-score   support

           0       0.54      0.61      0.58      1050
           1       0.60      0.53      0.57      1158

    accuracy                           0.57      2208
   macro avg       0.57      0.57      0.57      2208
weighted avg       0.57      0.57      0.57      2208

Accuracy: 0.5707


#### Spanish
Al igual que los vectores usados en GloVe, haré que los embeddings de Doc2Vec tengan dimensión igual a 300.

In [29]:
# Pair every tokenised Spanish training tweet with its task1 class label as the document tag.
# NOTE(review): the tag is the class label, not a per-tweet id, so presumably only one document
# vector per class is learned — confirm this is intended.
documents_es = [TaggedDocument(doc, [i]) for doc, i in zip(preprocessed_train_text_es, train_es['task1'])]
# NOTE(review): gensim documents that `seed` only gives reproducible training with a single worker
# thread; with workers=4 results may differ between runs — confirm.
%time doc2vec_es = Doc2Vec(documents_es, vector_size=300, window=5, min_count=1, workers=4, seed=42, epochs=50)

CPU times: user 23.8 s, sys: 1.76 s, total: 25.5 s
Wall time: 12.3 s


In [30]:
# One 300-dimensional inferred Doc2Vec row per Spanish training tweet
X_train_es = np.zeros((len(train_es), 300))

for row, tweet in enumerate(tqdm(train_es['text'], total=len(train_es))):
    X_train_es[row] = doc2vec_es.infer_vector(text_field_es.preprocess(tweet))

y_train_es = label_encoder.transform(train_es['task1'])

100%|██████████| 3541/3541 [00:09<00:00, 366.71it/s]


In [31]:
# One 300-dimensional inferred Doc2Vec row per Spanish test tweet
X_test_es = np.zeros((len(test_es), 300))

for row, tweet in enumerate(tqdm(test_es['text'], total=len(test_es))):
    X_test_es[row] = doc2vec_es.infer_vector(text_field_es.preprocess(tweet))

y_test_es = label_encoder.transform(test_es['task1'])

100%|██████████| 2160/2160 [00:05<00:00, 371.47it/s]


In [32]:
clf_es = LogisticRegression(max_iter=1000)
%time clf_es.fit(X_train_es, y_train_es)

y_pred_es = clf_es.predict(X_test_es)

print(classification_report(y_test_es, y_pred_es))
print(f'Accuracy: {round(accuracy_score(y_test_es, y_pred_es), 4)}')

CPU times: user 249 ms, sys: 385 ms, total: 634 ms
Wall time: 44.3 ms
              precision    recall  f1-score   support

           0       0.56      0.58      0.57      1037
           1       0.60      0.59      0.59      1123

    accuracy                           0.58      2160
   macro avg       0.58      0.58      0.58      2160
weighted avg       0.58      0.58      0.58      2160

Accuracy: 0.5829


#### Total

In [33]:
# Pool the English and Spanish labels/predictions (English first) for the overall Doc2Vec score
y_test = np.concatenate([y_test_en, y_test_es])
y_pred = np.concatenate([y_pred_en, y_pred_es])

print(classification_report(y_test, y_pred, target_names=['non-sexist', 'sexist']))
print('Accuracy:', round(accuracy_score(y_test, y_pred), 4))

              precision    recall  f1-score   support

  non-sexist       0.55      0.60      0.57      2087
      sexist       0.60      0.56      0.58      2281

    accuracy                           0.58      4368
   macro avg       0.58      0.58      0.58      4368
weighted avg       0.58      0.58      0.58      4368

Accuracy: 0.5767


### sentence-BERT

En [Pretrained Models](https://www.sbert.net/docs/pretrained_models.html) están todos los modelos preentrenados disponibles. Un modelo fuerte en una tarea puede ser débil en otra, por lo que es importante seleccionar el modelo adecuado para cada tarea. Como no hay ninguno específico para la tarea de análisis de sentimientos, usaré **distiluse-base-multilingual-cased-v1**, un modelo multilingüe que admite tanto inglés como español.

**distiluse-base-multilingual-cased-v1:** Multilingual knowledge distilled version of multilingual Universal Sentence Encoder. Supports 15 languages: Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Portuguese, Russian, Spanish, Turkish.

In [34]:
# Download the model
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

#### English

In [35]:
# Re-join each token list into one whitespace-separated string for the sentence encoder
sentences_train_en = list(map(' '.join, preprocessed_train_text_en))
sentences_test_en = list(map(' '.join, preprocessed_test_text_en))

In [36]:
# Encode the English sentences with the sentence-transformer model and integer-encode the labels.
# NOTE(review): the original comment stated 768 dimensions, but distiluse-base-multilingual-cased-v1
# is documented as producing 512-dimensional embeddings — confirm with X_train_en.shape[1].
%time X_train_en = model.encode(sentences_train_en, show_progress_bar=True)
%time X_test_en = model.encode(sentences_test_en, show_progress_bar=True)

y_train_en = label_encoder.transform(train_en['task1'])
y_test_en = label_encoder.transform(test_en['task1'])

Batches:   0%|          | 0/108 [00:00<?, ?it/s]

CPU times: user 6.43 s, sys: 603 ms, total: 7.04 s
Wall time: 5.81 s


Batches:   0%|          | 0/69 [00:00<?, ?it/s]

CPU times: user 3.45 s, sys: 146 ms, total: 3.6 s
Wall time: 2.76 s


In [37]:
clf_en = LogisticRegression(max_iter=1000)
%time clf_en.fit(X_train_en, y_train_en)

y_pred_en = clf_en.predict(X_test_en)

print(classification_report(y_test_en, y_pred_en))
print(f'Accuracy: {round(accuracy_score(y_test_en, y_pred_en), 4)}')

CPU times: user 155 ms, sys: 377 ms, total: 532 ms
Wall time: 42.5 ms
              precision    recall  f1-score   support

           0       0.70      0.75      0.73      1050
           1       0.76      0.71      0.73      1158

    accuracy                           0.73      2208
   macro avg       0.73      0.73      0.73      2208
weighted avg       0.73      0.73      0.73      2208

Accuracy: 0.7301


#### Spanish

In [38]:
# Re-join each token list into one whitespace-separated string for the sentence encoder
sentences_train_es = list(map(' '.join, preprocessed_train_text_es))
sentences_test_es = list(map(' '.join, preprocessed_test_text_es))

In [39]:
# Encode the Spanish sentences with the sentence-transformer model and integer-encode the labels.
# NOTE(review): the original comment stated 768 dimensions, but distiluse-base-multilingual-cased-v1
# is documented as producing 512-dimensional embeddings — confirm with X_train_es.shape[1].
%time X_train_es = model.encode(sentences_train_es, show_progress_bar=True)
%time X_test_es = model.encode(sentences_test_es, show_progress_bar=True)

y_train_es = label_encoder.transform(train_es['task1'])
y_test_es = label_encoder.transform(test_es['task1'])

Batches:   0%|          | 0/111 [00:00<?, ?it/s]

CPU times: user 5.35 s, sys: 141 ms, total: 5.49 s
Wall time: 4.2 s


Batches:   0%|          | 0/68 [00:00<?, ?it/s]

CPU times: user 3.37 s, sys: 78.8 ms, total: 3.45 s
Wall time: 2.65 s


In [40]:
clf_es = LogisticRegression(max_iter=1000)
%time clf_es.fit(X_train_es, y_train_es)

y_pred_es = clf_es.predict(X_test_es)

print(classification_report(y_test_es, y_pred_es))
print(f'Accuracy: {round(accuracy_score(y_test_es, y_pred_es), 4)}')

CPU times: user 337 ms, sys: 592 ms, total: 930 ms
Wall time: 68.8 ms
              precision    recall  f1-score   support

           0       0.67      0.79      0.73      1037
           1       0.77      0.64      0.70      1123

    accuracy                           0.71      2160
   macro avg       0.72      0.72      0.71      2160
weighted avg       0.72      0.71      0.71      2160

Accuracy: 0.712


#### Total

In [41]:
# Pool the English and Spanish labels/predictions (English first) for the overall sentence-BERT score
y_test = np.concatenate([y_test_en, y_test_es])
y_pred = np.concatenate([y_pred_en, y_pred_es])

print(classification_report(y_test, y_pred, target_names=['non-sexist', 'sexist']))
print('Accuracy:', round(accuracy_score(y_test, y_pred), 4))

              precision    recall  f1-score   support

  non-sexist       0.68      0.77      0.73      2087
      sexist       0.76      0.68      0.72      2281

    accuracy                           0.72      4368
   macro avg       0.72      0.72      0.72      4368
weighted avg       0.73      0.72      0.72      4368

Accuracy: 0.7212


## Resultados

De los distintos modelos el mejor parece ser el de BERT que ha sido entrenado con varios idiomas.

- Al hacer la clasificación sin hacer el spell-check se obtiene:

<style type="text/css">
.tg  {border-collapse:collapse;border-spacing:0;}
.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  overflow:hidden;padding:10px 5px;word-break:normal;}
.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
.tg .tg-c3ow{border-color:inherit;text-align:center;vertical-align:top}
.tg .tg-7btt{border-color:inherit;font-weight:bold;text-align:center;vertical-align:top}
</style>
<table class="tg">
<thead>
  <tr>
    <th class="tg-7btt" rowspan="2">Model</th>
    <th class="tg-7btt" colspan="3">Task-1 (accuracy)</th>
  </tr>
  <tr>
    <td class="tg-7btt">English</td>
    <td class="tg-7btt">Spanish</td>
    <td class="tg-7btt">Total</td>
  </tr>
</thead>
<tbody>
  <tr>
    <td class="tg-c3ow">tf-idf</td>
    <td class="tg-c3ow">70.70</td>
    <td class="tg-c3ow">69.81</td>
    <td class="tg-c3ow">70.26</td>
  </tr>
  <tr>
    <td class="tg-c3ow">GloVe</td>
    <td class="tg-c3ow">66.35</td>
    <td class="tg-c3ow">67.82</td>
    <td class="tg-c3ow">67.08</td>
  </tr>
  <tr>
    <td class="tg-c3ow">Doc2Vec</td>
    <td class="tg-c3ow">57.43</td>
    <td class="tg-c3ow">59.31</td>
    <td class="tg-c3ow">58.36</td>
  </tr>
  <tr>
    <td class="tg-c3ow">sentence-BERT</td>
    <td class="tg-c3ow">73.73</td>
    <td class="tg-c3ow">72.64</td>
    <td class="tg-c3ow">73.19</td>
  </tr>
</tbody>
</table>

- Al hacer la clasificación con el spell-check se obtiene:

<style type="text/css">
.tg  {border-collapse:collapse;border-spacing:0;}
.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  overflow:hidden;padding:10px 5px;word-break:normal;}
.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
.tg .tg-c3ow{border-color:inherit;text-align:center;vertical-align:top}
.tg .tg-7btt{border-color:inherit;font-weight:bold;text-align:center;vertical-align:top}
</style>
<table class="tg">
<thead>
  <tr>
    <th class="tg-7btt" rowspan="2">Model</th>
    <th class="tg-7btt" colspan="3">Task-1 (accuracy)</th>
  </tr>
  <tr>
    <td class="tg-7btt">English</td>
    <td class="tg-7btt">Spanish</td>
    <td class="tg-7btt">Total</td>
  </tr>
</thead>
<tbody>
  <tr>
    <td class="tg-c3ow">tf-idf</td>
    <td class="tg-c3ow">70.11</td>
    <td class="tg-c3ow">70.60</td>
    <td class="tg-c3ow">70.35</td>
  </tr>
  <tr>
    <td class="tg-c3ow">GloVe</td>
    <td class="tg-c3ow">66.85</td>
    <td class="tg-c3ow">67.55</td>
    <td class="tg-c3ow">67.19</td>
  </tr>
  <tr>
    <td class="tg-c3ow">Doc2Vec</td>
    <td class="tg-c3ow">57.07</td>
    <td class="tg-c3ow">58.29</td>
    <td class="tg-c3ow">57.67</td>
  </tr>
  <tr>
    <td class="tg-c3ow">sentence-BERT</td>
    <td class="tg-c3ow">73.01</td>
    <td class="tg-c3ow">71.20</td>
    <td class="tg-c3ow">72.12</td>
  </tr>
</tbody>
</table>

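En realidad parece empeorar un poco con el spell-check (principalmente en sentence-BERT, que es el que mejor sale), además de que es algo tardado hacerlo, por lo que mejor no usaré el spell-check. Por otro lado, hacer el pre-procesamiento parece ayudar un poco.

**Nota:** las salidas guardadas en este notebook (p. ej. 70.11 / 70.60 / 70.35 para tf-idf) coinciden con la tabla etiquetada «con el spell-check», aunque las celdas de carga leen los archivos sin spell-check (`EXIST2021_training.tsv` y `EXIST2021_test.tsv`). Conviene verificar que las etiquetas de las dos tablas no estén intercambiadas antes de dar por buena esta conclusión.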