In [1]:
import pandas as pd
import seaborn as sns

import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torchinfo import summary

from sklearn.metrics import accuracy_score, classification_report, f1_score

import gdown
from os.path import exists
from os import mkdir

from utils import preprocessing
from utils.evaluation import DataSetText, infer, infer_task2

from utils.evaluation import SexismClassifierTask1
from utils.evaluation import SexismClassifier as SexismClassifierTask2

# Use seaborn's dark-grid theme for any figure drawn in this notebook.
sns.set_style('darkgrid')

# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Evaluación

## Modelos

Primero descargamos los modelos ya entrenados para ambas tareas (solo si aún no están en el directorio `models/`).

In [2]:
# Local path where the trained task-1 checkpoint is stored.
model_path = 'models/sexism-classifier-task1.pt'

if not exists(model_path):
    # Create the target directory. Only "directory already exists" is tolerated;
    # any other failure (e.g. permissions) propagates instead of being printed
    # and ignored, because the download below could not succeed anyway.
    try:
        mkdir('models')
    except FileExistsError:
        pass

    # Download the checkpoint from Google Drive into model_path.
    url = 'https://drive.google.com/uc?id=1V0VbdwXDcFP6f0GrdCna1SQqqkZpBOLW'
    gdown.download(url, model_path, quiet=False)

In [3]:
# Build the task-1 classifier and restore its trained weights.
model_task1 = SexismClassifierTask1()

# map_location remaps the stored tensors onto `device`, so the checkpoint can be
# loaded on a CPU-only machine even if it was saved from a GPU.
state_dict_task1 = torch.load('models/sexism-classifier-task1.pt', map_location=device)
model_task1.load_state_dict(state_dict_task1)

model_task1.to(device)
model_task1.eval()  # evaluation mode: this notebook only runs inference

summary(model_task1)

Layer (type:depth-idx)                   Param #
├─BertModel: 1-1                         --
|    └─BertEmbeddings: 2-1               --
|    |    └─Embedding: 3-1               81,315,072
|    |    └─Embedding: 3-2               393,216
|    |    └─Embedding: 3-3               1,536
|    |    └─LayerNorm: 3-4               1,536
|    |    └─Dropout: 3-5                 --
|    └─BertEncoder: 2-2                  --
|    |    └─ModuleList: 3-6              85,054,464
|    └─BertPooler: 2-3                   --
|    |    └─Linear: 3-7                  590,592
|    |    └─Tanh: 3-8                    --
├─Dropout: 1-2                           --
├─Sequential: 1-3                        --
|    └─Linear: 2-4                       1,538
|    └─Softmax: 2-5                      --
Total params: 167,357,954
Trainable params: 167,357,954
Non-trainable params: 0

In [4]:
# Local path where the trained task-2 checkpoint is stored.
model_path = 'models/sexism-classifier-task2.pt'

if not exists(model_path):
    # Create the target directory. Only "directory already exists" is tolerated;
    # any other failure (e.g. permissions) propagates instead of being printed
    # and ignored, because the download below could not succeed anyway.
    try:
        mkdir('models')
    except FileExistsError:
        pass

    # Download the checkpoint from Google Drive into model_path.
    # This must sit at the `if` level, not inside the `except` handler: otherwise
    # the download would only run when creating the directory fails.
    url = 'https://drive.google.com/uc?id=1AtE9iu5OWeTpYTeMa_xrCvsmSuGVyFdJ'
    gdown.download(url, model_path, quiet=False)

In [5]:
# Build the task-2 classifier and restore its trained weights.
model_task2 = SexismClassifierTask2()

# map_location remaps the stored tensors onto `device`, so the checkpoint can be
# loaded on a CPU-only machine even if it was saved from a GPU.
state_dict_task2 = torch.load('models/sexism-classifier-task2.pt', map_location=device)
model_task2.load_state_dict(state_dict_task2)

model_task2.to(device)
model_task2.eval()  # evaluation mode: this notebook only runs inference

summary(model_task2)

Layer (type:depth-idx)                   Param #
├─BertModel: 1-1                         --
|    └─BertEmbeddings: 2-1               --
|    |    └─Embedding: 3-1               81,315,072
|    |    └─Embedding: 3-2               393,216
|    |    └─Embedding: 3-3               1,536
|    |    └─LayerNorm: 3-4               1,536
|    |    └─Dropout: 3-5                 --
|    └─BertEncoder: 2-2                  --
|    |    └─ModuleList: 3-6              85,054,464
|    └─BertPooler: 2-3                   --
|    |    └─Linear: 3-7                  590,592
|    |    └─Tanh: 3-8                    --
├─Dropout: 1-2                           --
├─Sequential: 1-3                        --
|    └─Linear: 2-4                       3,845
|    └─Softmax: 2-5                      --
Total params: 167,360,261
Trainable params: 167,360,261
Non-trainable params: 0

## Datos

In [6]:
# Load the EXIST2021 test split (tab-separated file).
test_df = pd.read_csv('../../data/EXIST2021_test.tsv', sep='\t')

# Light text clean-up using the project's preprocessing helper.
test_df['text'] = test_df['text'].apply(preprocessing.preprocess)

# Integer id assigned to each task-2 category.
labels_dict = {
    'ideological-inequality': 0,
    'misogyny-non-sexual-violence': 1,
    'objectification': 2,
    'sexual-violence': 3,
    'stereotyping-dominance': 4,
    'non-sexist': 5,
}

# Encode the task-2 annotation as an integer label; an unknown category raises KeyError.
test_df['label'] = test_df['task2'].apply(lambda category: labels_dict[category])

# Per-language views of the encoded test set.
is_english = test_df['language'] == 'en'
is_spanish = test_df['language'] == 'es'
test_df_en = test_df[is_english]
test_df_es = test_df[is_spanish]

test_df.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2,label
0,EXIST2021,6978,gab,en,pennsylvania state rep horrifies with opening ...,non-sexist,non-sexist,5
1,EXIST2021,6979,twitter,en,"he sounds like as ass , and very condescending .",non-sexist,non-sexist,5
2,EXIST2021,6980,twitter,en,"lol ! "" this behavior of not letting men tell ...",sexist,ideological-inequality,0
3,EXIST2021,6981,twitter,en,rights ? i mean yeah most women especially the...,sexist,ideological-inequality,0
4,EXIST2021,6982,twitter,en,the jack manifold appreciation i ’ m seeing is...,non-sexist,non-sexist,5


In [7]:
# Wrap the full test frame and each language subset in the project's dataset class,
# in the same order as before: all, English, Spanish.
ds_text_test, ds_text_test_en, ds_text_test_es = (
    DataSetText(frame) for frame in (test_df, test_df_en, test_df_es)
)

# Report how many examples each dataset holds.
print(f'Test: {len(ds_text_test)}')
print(f'Test en: {len(ds_text_test_en)}')
print(f'Test es: {len(ds_text_test_es)}')

Test: 4368
Test en: 2208
Test es: 2160


In [8]:
BATCH_SIZE = 8   # examples per inference batch
NUM_WORKERS = 4  # worker processes used by each DataLoader


def make_test_loader(dataset):
    """Build an evaluation DataLoader over ``dataset``.

    Shuffling is disabled: it is not needed for evaluation, and a fixed order
    keeps runs reproducible.

    Args:
        dataset: dataset to iterate over (here, a ``DataSetText`` instance).

    Returns:
        A ``DataLoader`` yielding batches of ``BATCH_SIZE`` items in dataset order.
    """
    return DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS)


# One loader for the whole test set and one per language.
test_dl = make_test_loader(ds_text_test)
test_en_dl = make_test_loader(ds_text_test_en)
test_es_dl = make_test_loader(ds_text_test_es)

## Rendimiento

In [9]:
# Run task-2 inference with both trained models on each test loader: full test set,
# English subset, Spanish subset. Each call returns a pair that is used below as
# (true labels, predicted labels); %time reports the CPU and wall time of each call.
# NOTE(review): how infer_task2 combines the two models is defined in
# utils.evaluation and is not visible from this notebook.
%time y_test, y_pred = infer_task2(model_task1, model_task2, test_dl)
%time y_test_en, y_pred_en = infer_task2(model_task1, model_task2, test_en_dl)
%time y_test_es, y_pred_es = infer_task2(model_task1, model_task2, test_es_dl)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 546/546 [00:39<00:00, 13.87it/s]
  0%|                                                                                                                                            | 0/276 [00:00<?, ?it/s]

CPU times: user 39 s, sys: 281 ms, total: 39.3 s
Wall time: 39.4 s


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 276/276 [00:20<00:00, 13.61it/s]
  0%|                                                                                                                                            | 0/270 [00:00<?, ?it/s]

CPU times: user 20 s, sys: 231 ms, total: 20.2 s
Wall time: 20.3 s


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 270/270 [00:19<00:00, 13.66it/s]

CPU times: user 19.4 s, sys: 255 ms, total: 19.7 s
Wall time: 19.8 s





Resultados sobre el conjunto de prueba completo (inglés y español):

In [10]:
# Derive class names and ids from labels_dict (ordered by id) instead of repeating
# the list by hand, so the report cannot drift from the label encoding.
class_names = sorted(labels_dict, key=labels_dict.get)
class_ids = [labels_dict[name] for name in class_names]

# Passing `labels` explicitly keeps names and ids aligned even if some class does not
# occur in y_test/y_pred; without it, a missing class makes the report raise.
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

# Overall accuracy and macro-averaged F1, as percentages.
print(f'Accuracy: {round(100*accuracy_score(y_test, y_pred), 4)}')
print(f'F1 score: {round(100*f1_score(y_test, y_pred, average="macro"), 4)}')

                              precision    recall  f1-score   support

      ideological-inequality       0.61      0.67      0.64       621
misogyny-non-sexual-violence       0.43      0.35      0.39       472
             objectification       0.42      0.41      0.42       324
             sexual-violence       0.57      0.45      0.50       400
      stereotyping-dominance       0.43      0.58      0.49       464
                  non-sexist       0.75      0.73      0.74      2087

                    accuracy                           0.62      4368
                   macro avg       0.54      0.53      0.53      4368
                weighted avg       0.62      0.62      0.62      4368

Accuracy: 61.6529
F1 score: 53.0829


Resultados sobre los ejemplos en inglés:

In [11]:
# Derive class names and ids from labels_dict (ordered by id) instead of repeating
# the list by hand, so the report cannot drift from the label encoding.
class_names = sorted(labels_dict, key=labels_dict.get)
class_ids = [labels_dict[name] for name in class_names]

# Passing `labels` explicitly keeps names and ids aligned even if some class does not
# occur in the English subset; without it, a missing class makes the report raise.
print(classification_report(y_test_en, y_pred_en, labels=class_ids, target_names=class_names))

# Accuracy and macro-averaged F1 on the English subset, as percentages.
print(f'Accuracy: {round(100*accuracy_score(y_test_en, y_pred_en), 4)}')
print(f'F1 score: {round(100*f1_score(y_test_en, y_pred_en, average="macro"), 4)}')

                              precision    recall  f1-score   support

      ideological-inequality       0.59      0.62      0.61       333
misogyny-non-sexual-violence       0.40      0.30      0.34       215
             objectification       0.39      0.45      0.41       150
             sexual-violence       0.45      0.48      0.46       198
      stereotyping-dominance       0.42      0.61      0.50       262
                  non-sexist       0.77      0.68      0.72      1050

                    accuracy                           0.59      2208
                   macro avg       0.50      0.52      0.51      2208
                weighted avg       0.61      0.59      0.60      2208

Accuracy: 59.2391
F1 score: 50.8623


Resultados sobre los ejemplos en español:

In [12]:
# Derive class names and ids from labels_dict (ordered by id) instead of repeating
# the list by hand, so the report cannot drift from the label encoding.
class_names = sorted(labels_dict, key=labels_dict.get)
class_ids = [labels_dict[name] for name in class_names]

# Passing `labels` explicitly keeps names and ids aligned even if some class does not
# occur in the Spanish subset; without it, a missing class makes the report raise.
print(classification_report(y_test_es, y_pred_es, labels=class_ids, target_names=class_names))

# Accuracy and macro-averaged F1 on the Spanish subset, as percentages.
print(f'Accuracy: {round(100*accuracy_score(y_test_es, y_pred_es), 4)}')
print(f'F1 score: {round(100*f1_score(y_test_es, y_pred_es, average="macro"), 4)}')

                              precision    recall  f1-score   support

      ideological-inequality       0.64      0.73      0.68       288
misogyny-non-sexual-violence       0.46      0.40      0.43       257
             objectification       0.47      0.38      0.42       174
             sexual-violence       0.82      0.42      0.56       202
      stereotyping-dominance       0.43      0.54      0.48       202
                  non-sexist       0.73      0.78      0.76      1037

                    accuracy                           0.64      2160
                   macro avg       0.59      0.54      0.55      2160
                weighted avg       0.65      0.64      0.64      2160

Accuracy: 64.1204
F1 score: 55.339
