In [3]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm, trange

from torchtext.data import Field
from torchtext.vocab import GloVe
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from transformers import BertTokenizer, BertModel

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
import torch.optim as optim
import torch.nn.functional as F
from torchinfo import summary

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score

import gdown
from utils import preprocessing
from utils.evaluation import DataSetText, SexismClassifier, infer

# Global seaborn plot style.
sns.set_style('darkgrid')

# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Evaluación

## Modelos

Primero necesitamos descargar el modelo ya entrenado (el clasificador de la tarea 2).

In [4]:
!mkdir models

In [3]:
# Google Drive location of the already-trained task-2 classifier checkpoint,
# and the local path it is saved to.
url = 'https://drive.google.com/uc?id=1AtE9iu5OWeTpYTeMa_xrCvsmSuGVyFdJ'
output = 'models/sexism-classifier-task2.pt'

# Last expression of the cell, so whatever gdown.download returns is shown as the cell output.
gdown.download(url=url, output=output)

In [4]:
# Rebuild the classifier and restore the downloaded weights.
model = SexismClassifier()

# map_location='cpu' deserializes every tensor onto the CPU regardless of the
# device the checkpoint was saved from. Without it, a checkpoint saved from a
# GPU cannot be loaded on a machine where CUDA is unavailable. The weights are
# moved to the selected device right after by model.to(device).
model.load_state_dict(
    torch.load('models/sexism-classifier-task2.pt', map_location='cpu'))
model.to(device)
model.eval()  # evaluation mode (disables the Dropout layers)

# Last expression of the cell: displays the per-layer parameter summary.
summary(model)

Layer (type:depth-idx)                   Param #
├─BertModel: 1-1                         --
|    └─BertEmbeddings: 2-1               --
|    |    └─Embedding: 3-1               81,315,072
|    |    └─Embedding: 3-2               393,216
|    |    └─Embedding: 3-3               1,536
|    |    └─LayerNorm: 3-4               1,536
|    |    └─Dropout: 3-5                 --
|    └─BertEncoder: 2-2                  --
|    |    └─ModuleList: 3-6              85,054,464
|    └─BertPooler: 2-3                   --
|    |    └─Linear: 3-7                  590,592
|    |    └─Tanh: 3-8                    --
├─Dropout: 1-2                           --
├─Sequential: 1-3                        --
|    └─Linear: 2-4                       3,845
|    └─Softmax: 2-5                      --
Total params: 167,360,261
Trainable params: 167,360,261
Non-trainable params: 0

## Datos

In [5]:
# Load the test split and keep only the rows whose task1 label is 'sexist'.
# .copy() makes the filtered frame independent of the one read from disk, so the
# column assignments below do not write into a slice (no SettingWithCopyWarning).
test_df = pd.read_csv('../../Data/EXIST2021_test.tsv', sep='\t')
test_df = test_df[test_df['task1'] == 'sexist'].copy()

# Light text pre-processing
test_df['text'] = test_df['text'].apply(preprocessing.preprocess)

# Encode the task2 category names as integer class ids
labels_dict = {'ideological-inequality': 0, 'misogyny-non-sexual-violence': 1,
               'objectification': 2, 'sexual-violence': 3, 'stereotyping-dominance': 4}

# Indexing (rather than .get) raises KeyError on a category missing from labels_dict.
test_df['label'] = test_df['task2'].apply(lambda x: labels_dict[x])

# Per-language views of the same rows.
test_df_en = test_df[test_df['language'] == 'en']
test_df_es = test_df[test_df['language'] == 'es']

test_df.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2,label
2,EXIST2021,6980,twitter,en,"lol ! "" this behavior of not letting men tell ...",sexist,ideological-inequality,0
3,EXIST2021,6981,twitter,en,rights ? i mean yeah most women especially the...,sexist,ideological-inequality,0
7,EXIST2021,6985,twitter,en,stop regarding women as animals who forget tht...,sexist,ideological-inequality,0
8,EXIST2021,6986,gab,en,"yeah , it is rough , but not for women . marri...",sexist,objectification,2
11,EXIST2021,6989,twitter,en,you were publicly harassing a girl by constant...,sexist,misogyny-non-sexual-violence,1


In [6]:
# Wrap the full test frame and each per-language subset in the project's dataset class.
ds_text_test = DataSetText(test_df)
ds_text_test_en = DataSetText(test_df_en)
ds_text_test_es = DataSetText(test_df_es)

# Report the size of each dataset, one per line.
print(
    f'Test: {len(ds_text_test)}',
    f'Test en: {len(ds_text_test_en)}',
    f'Test es: {len(ds_text_test_es)}',
    sep='\n',
)

Test: 2281
Test en: 1158
Test es: 1123


In [7]:
BATCH_SIZE = 8

# Settings shared by all evaluation loaders.
# shuffle=False: shuffling brings nothing at evaluation time, and it makes the
# order of the returned labels/predictions differ on every run, so they can no
# longer be lined up with the rows of the source DataFrame.
eval_loader_kwargs = dict(batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# One loader for the full test set and one per language.
test_dl = DataLoader(ds_text_test, **eval_loader_kwargs)
test_en_dl = DataLoader(ds_text_test_en, **eval_loader_kwargs)
test_es_dl = DataLoader(ds_text_test_es, **eval_loader_kwargs)

## Rendimiento

In [9]:
# Run the model over each test loader. Each call's result is unpacked into a
# pair that the cells below pass to the sklearn metrics as (y_true, y_pred).
# %time prints the CPU and wall-clock time taken by each call.
%time y_test, y_pred = infer(model, test_dl)
%time y_test_en, y_pred_en = infer(model, test_en_dl)
%time y_test_es, y_pred_es = infer(model, test_es_dl)

100%|██████████| 286/286 [00:10<00:00, 26.32it/s]
  0%|          | 0/145 [00:00<?, ?it/s]

CPU times: user 10.5 s, sys: 239 ms, total: 10.8 s
Wall time: 10.9 s


100%|██████████| 145/145 [00:05<00:00, 25.94it/s]
  0%|          | 0/141 [00:00<?, ?it/s]

CPU times: user 5.31 s, sys: 203 ms, total: 5.52 s
Wall time: 5.59 s


100%|██████████| 141/141 [00:05<00:00, 25.84it/s]

CPU times: user 5.2 s, sys: 199 ms, total: 5.4 s
Wall time: 5.46 s





Sobre el conjunto de prueba completo (inglés y español) se tiene que:

In [16]:
# Per-class report on the full test set.
# Class ids and names both come from labels_dict (ordered by id), so the report
# cannot drift from the encoding used to build the `label` column. Passing
# `labels` explicitly pins each name to its id even if a class does not occur
# in y_test / y_pred.
print(classification_report(y_test, y_pred,
                            labels=sorted(labels_dict.values()),
                            target_names=sorted(labels_dict, key=labels_dict.get)))

# Headline metrics as percentages.
print(f'Accuracy: {round(100*accuracy_score(y_test, y_pred), 4)}')
print(f'F1 score: {round(100*f1_score(y_test, y_pred, average="macro"), 4)}')

                              precision    recall  f1-score   support

      ideological-inequality       0.68      0.80      0.74       621
misogyny-non-sexual-violence       0.61      0.49      0.54       472
             objectification       0.60      0.52      0.56       324
             sexual-violence       0.76      0.61      0.68       400
      stereotyping-dominance       0.54      0.68      0.60       464

                    accuracy                           0.64      2281
                   macro avg       0.64      0.62      0.62      2281
                weighted avg       0.64      0.64      0.63      2281

Accuracy: 63.7001
F1 score: 62.3797


En inglés se tiene que:

In [13]:
# Per-class report on the English subset.
# Class ids and names both come from labels_dict (ordered by id), so the report
# cannot drift from the encoding used to build the `label` column. Passing
# `labels` explicitly pins each name to its id even if a class does not occur
# in y_test_en / y_pred_en.
print(classification_report(y_test_en, y_pred_en,
                            labels=sorted(labels_dict.values()),
                            target_names=sorted(labels_dict, key=labels_dict.get)))

# Headline metrics as percentages.
print(f'Accuracy: {round(100*accuracy_score(y_test_en, y_pred_en), 4)}')
print(f'F1 score: {round(100*f1_score(y_test_en, y_pred_en, average="macro"), 4)}')

                              precision    recall  f1-score   support

      ideological-inequality       0.69      0.75      0.72       333
misogyny-non-sexual-violence       0.59      0.47      0.52       215
             objectification       0.60      0.51      0.55       150
             sexual-violence       0.68      0.66      0.67       198
      stereotyping-dominance       0.58      0.68      0.62       262

                    accuracy                           0.63      1158
                   macro avg       0.63      0.61      0.62      1158
                weighted avg       0.63      0.63      0.63      1158

Accuracy: 63.4715
F1 score: 61.6744


En español se tiene que:

In [15]:
# Per-class report on the Spanish subset.
# Class ids and names both come from labels_dict (ordered by id), so the report
# cannot drift from the encoding used to build the `label` column. Passing
# `labels` explicitly pins each name to its id even if a class does not occur
# in y_test_es / y_pred_es.
print(classification_report(y_test_es, y_pred_es,
                            labels=sorted(labels_dict.values()),
                            target_names=sorted(labels_dict, key=labels_dict.get)))

# Headline metrics as percentages.
print(f'Accuracy: {round(100*accuracy_score(y_test_es, y_pred_es), 4)}')
print(f'F1 score: {round(100*f1_score(y_test_es, y_pred_es, average="macro"), 4)}')

                              precision    recall  f1-score   support

      ideological-inequality       0.67      0.84      0.75       288
misogyny-non-sexual-violence       0.62      0.51      0.56       257
             objectification       0.61      0.53      0.56       174
             sexual-violence       0.89      0.56      0.69       202
      stereotyping-dominance       0.51      0.68      0.58       202

                    accuracy                           0.64      1123
                   macro avg       0.66      0.63      0.63      1123
                weighted avg       0.66      0.64      0.64      1123

Accuracy: 63.9359
F1 score: 62.9523
