In [1]:
!pip install transformers
!pip install wordsegment
!pip install wandb

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.5 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 33.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
import os
from tqdm.notebook import tqdm

import torch


In [5]:
import time
import transformers
from transformers import BertModel, BertweetTokenizer, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader

from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

from wordsegment import load, segment
import warnings

In [6]:
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback

In [7]:
from sklearn import (
    decomposition,
    ensemble,
    feature_extraction,
    linear_model,
    metrics,
    model_selection,
    naive_bayes,
    pipeline,
)
from tqdm.notebook import tqdm

In [8]:
warnings.filterwarnings('ignore')

sns.set_style("darkgrid")
sns.set_context("notebook")

tqdm.pandas()

%config InlineBackend.figure_format="retina"  # For high DPI display
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)

plt.rcParams['figure.dpi'] = 90
plt.rcParams['figure.figsize'] = (11, 7)

%config InlineBackend.figure_format="retina"  # For high DPI display

load()

cuda:0


# Load dataset

In [9]:
PATH_TO_PROJECT = './drive/MyDrive/TER at INRIA/'

In [10]:
df_det = pd.read_csv(PATH_TO_PROJECT + 'datasets para colab/dataset_deteccion_misoginia.csv')
df_iber = pd.read_csv(PATH_TO_PROJECT + 'datasets para colab/dataset_ibereval.csv')
df_mis = pd.read_csv(PATH_TO_PROJECT + 'datasets para colab/dataset_miscorpus.csv')

df = pd.concat([df_det, df_iber, df_mis], axis=0)
df.reset_index(drop=True, inplace=True)

In [11]:
df

Unnamed: 0,text,dataset,label
0,tu mamá la chismosa pinché vieja de Torreón @...,det_misoginia,1
1,"""Presidenta del sindicato de mamás luchonas"" E...",det_misoginia,1
2,"""Weeey nosotros somos el virus ??"" pues órale ...",det_misoginia,1
3,"""ya sé mucho de ese tema y tengo clarito todo ...",det_misoginia,1
4,"#JuegoSurvivor Todos los Alcones me cagan, bol...",det_misoginia,1
...,...,...,...
12877,@raquelmad16_97 Porque el sexo como categoría ...,miscorpus,0
12878,@InesArrimadas Y este hijo de puta le estamos ...,miscorpus,0
12879,"Acaso me falta información, criterio o sentido...",miscorpus,0
12880,"La #CumbredelClima en cuatro claves: Greta, lo...",miscorpus,0


In [12]:
df.label.value_counts()

0    6504
1    6378
Name: label, dtype: int64

# Data preprocessing

In [13]:
def preprocess_text(text):
    c_text = text
    c_text = re.sub(' +', ' ', c_text)
    c_text = re.sub(r'https?\S+', 'url', c_text)
    c_text = re.sub(r'@\S+', '@user', c_text)
    htags = re.findall( r'#\w+\S+?', c_text)
    for tag in htags:
        c_text = c_text.replace(tag, ' '.join(segment(tag)))
    return c_text

In [14]:
df.shape

(12882, 3)

In [15]:
print(df.label.value_counts())

0    6504
1    6378
Name: label, dtype: int64


In [16]:
df['clean_text'] = df.text.apply(lambda x: preprocess_text(x))

In [17]:
df.head(10)

Unnamed: 0,text,dataset,label,clean_text
0,tu mamá la chismosa pinché vieja de Torreón @...,det_misoginia,1,tu mamá la chismosa pinché vieja de Torreón @...
1,"""Presidenta del sindicato de mamás luchonas"" E...",det_misoginia,1,"""Presidenta del sindicato de mamás luchonas"" E..."
2,"""Weeey nosotros somos el virus ??"" pues órale ...",det_misoginia,1,"""Weeey nosotros somos el virus ??"" pues órale ..."
3,"""ya sé mucho de ese tema y tengo clarito todo ...",det_misoginia,1,"""ya sé mucho de ese tema y tengo clarito todo ..."
4,"#JuegoSurvivor Todos los Alcones me cagan, bol...",det_misoginia,1,"juego survivor Todos los Alcones me cagan, bol..."
5,#JuegoSurvivore cagan las mujeres de halcones ...,det_misoginia,1,juego survivor e cagan las mujeres de halcones...
6,#LadyCinepolis alguien que tenga una hija de l...,det_misoginia,1,lady cine polis alguien que tenga una hija de ...
7,#LadyComeGratis pinche vieja abusona d mierda ...,det_misoginia,1,lady come gratis pinche vieja abusona d mierda...
8,#LadyComeGratis que poca madre! Eso no se vale...,det_misoginia,1,lady come gratis que poca madre! Eso no se val...
9,"#ladycomegratis, alias Brenda Pardo Lemus, es ...",det_misoginia,1,"lady come gratis alias Brenda Pardo Lemus, es ..."


# Modeling

### BETO

In [18]:
PRE_TRAINED_MODEL_NAME = 'dccuchile/bert-base-spanish-wwm-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/236k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/364 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/648 [00:00<?, ?B/s]

In [19]:
model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels=2)

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchi

In [20]:
train_df, test_df = model_selection.train_test_split(
    df, test_size=0.30, random_state=1
)

In [21]:
X_train, X_val, y_train, y_val = model_selection.train_test_split(train_df.clean_text, train_df.label, test_size=0.2, random_state=1)

In [22]:
sample_txt = X_train.iloc[5]

tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

 Sentence: Una pregunta tonta ¿esa patera-catamarán cuánto cuesta?
 url vía @user
   Tokens: ['Una', 'pregunta', 'tonta', '¿', 'esa', 'pate', '##ra', '-', 'cata', '##mar', '##án', 'cuánto', 'cuesta', '?', 'ur', '##l', 'vía', '@', 'use', '##r']
Token IDs: [1965, 3850, 13370, 1067, 1792, 17400, 1048, 1149, 25444, 3167, 1176, 6392, 11434, 1064, 2870, 30938, 5254, 968, 15796, 30936]


In [23]:
MAX_LENGTH = 512

In [24]:
X_train_tokenized = tokenizer(list(X_train), padding=True, truncation=True, max_length=MAX_LENGTH)
X_val_tokenized = tokenizer(list(X_val), padding=True, truncation=True, max_length=MAX_LENGTH)

In [25]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [26]:
train_dataset = Dataset(X_train_tokenized, list(y_train))
val_dataset = Dataset(X_val_tokenized, list(y_val))

In [27]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    f1 = metrics.f1_score(y_true=labels, y_pred=pred, average='macro')
    return {"f1_macro": f1}

In [28]:
# Define Trainer
args = TrainingArguments(
    output_dir=PATH_TO_PROJECT + "outputs_colab/beto",
    evaluation_strategy='epoch',
    num_train_epochs=5,
    seed=0,
    load_best_model_at_end=True,
    logging_strategy='epoch',
    save_strategy='epoch',
    metric_for_best_model='eval_f1_macro',
    greater_is_better=True,
    report_to="wandb",
    run_name='BETO Comparison'
)

In [29]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [30]:
# Train pre-trained model
train_results = trainer.train()

***** Running training *****
  Num examples = 7213
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4510
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.4917,0.401976,0.850235
2,0.3481,0.442918,0.84576
3,0.2179,0.637685,0.842539
4,0.1195,0.747708,0.856306
5,0.0558,0.840506,0.854096


***** Running Evaluation *****
  Num examples = 1804
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/TER at INRIA/outputs_colab/beto/checkpoint-902
Configuration saved in ./drive/MyDrive/TER at INRIA/outputs_colab/beto/checkpoint-902/config.json
Model weights saved in ./drive/MyDrive/TER at INRIA/outputs_colab/beto/checkpoint-902/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1804
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/TER at INRIA/outputs_colab/beto/checkpoint-1804
Configuration saved in ./drive/MyDrive/TER at INRIA/outputs_colab/beto/checkpoint-1804/config.json
Model weights saved in ./drive/MyDrive/TER at INRIA/outputs_colab/beto/checkpoint-1804/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1804
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/TER at INRIA/outputs_colab/beto/checkpoint-2706
Configuration saved in ./drive/MyDrive/TER at INRIA/outputs_colab/beto/checkpoint-2706/config.json
Model weights

In [31]:
eval_report=trainer.evaluate()
print(eval_report)

***** Running Evaluation *****
  Num examples = 1804
  Batch size = 8


{'eval_loss': 0.747707724571228, 'eval_f1_macro': 0.8563061280160857, 'eval_runtime': 28.429, 'eval_samples_per_second': 63.456, 'eval_steps_per_second': 7.95, 'epoch': 5.0}


### Predict on Test Data

In [32]:
X_test_tokenized = tokenizer(list(test_df.clean_text), padding=True, truncation=True, max_length=MAX_LENGTH)

# Create torch dataset
test_dataset = Dataset(X_test_tokenized)

# Make prediction
raw_pred, _, _ = trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

***** Running Prediction *****
  Num examples = 3865
  Batch size = 8


In [33]:
print(metrics.classification_report(test_df.label, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.863     0.854     0.859      1959
           1      0.852     0.861     0.856      1906

    accuracy                          0.857      3865
   macro avg      0.857     0.857     0.857      3865
weighted avg      0.858     0.857     0.857      3865



In [34]:
trainer.save_model(output_dir=PATH_TO_PROJECT + "modelos_finales/beto")

Saving model checkpoint to ./drive/MyDrive/TER at INRIA/modelos_finales/beto
Configuration saved in ./drive/MyDrive/TER at INRIA/modelos_finales/beto/config.json
Model weights saved in ./drive/MyDrive/TER at INRIA/modelos_finales/beto/pytorch_model.bin


## Multilingual BERT

In [35]:
PRE_TRAINED_MODEL_NAME = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpr45figvt


Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
creating metadata file for /root/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmppkybckg0


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
creating metadata file for /root/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://hug

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
creating metadata file for /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidde

In [36]:
model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels=2)

loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/0a3fd51713dcbb4def175c7f85bddc995d5976ce1dde327f99104e4d33069f17.aa7be4c79d76f4066d9b354496ea477c9ee39c5d889156dd1efb680643c2b052
creating metadata file for /root/.cache/huggingface/transformers/0a3fd51713dcbb4def175c7f85bddc995d5976ce1dde327f99104e4d33069f17.aa7be4c79d76f4066d9b354496ea477c9ee39c5d889156dd1efb680643c2b052
loading weights file https://huggingface.co/bert-base-multilingual-cased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/0a3fd51713dcbb4def175c7f85bddc995d5976ce1dde327f99104e4d33069f17.aa7be4c79d76f4066d9b354496ea477c9ee39c5d889156dd1efb680643c2b052
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bi

In [37]:
X_train_tokenized = tokenizer(list(X_train), padding=True, truncation=True, max_length=MAX_LENGTH)
X_val_tokenized = tokenizer(list(X_val), padding=True, truncation=True, max_length=MAX_LENGTH)

In [38]:
train_dataset = Dataset(X_train_tokenized, list(y_train))
val_dataset = Dataset(X_val_tokenized, list(y_val))

In [39]:
# Define Trainer
args = TrainingArguments(
    output_dir=PATH_TO_PROJECT + "outputs_colab/multilingual",
    evaluation_strategy='epoch',
    num_train_epochs=5,
    seed=0,
    load_best_model_at_end=True,
    logging_strategy='epoch',
    save_strategy='epoch',
    metric_for_best_model='eval_f1_macro',
    greater_is_better=True,
    report_to="wandb",
    run_name='Multilingual Comparison'
)

PyTorch: setting up devices


In [40]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [41]:
# Train pre-trained model
train_results = trainer.train()

***** Running training *****
  Num examples = 7213
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4510
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.6939,0.671845,0.600627
2,0.6659,0.650839,0.608467
3,0.6527,0.597809,0.691834
4,0.5842,0.579769,0.717645
5,0.5232,0.548988,0.739444


***** Running Evaluation *****
  Num examples = 1804
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/TER at INRIA/outputs_colab/multilingual/checkpoint-902
Configuration saved in ./drive/MyDrive/TER at INRIA/outputs_colab/multilingual/checkpoint-902/config.json
Model weights saved in ./drive/MyDrive/TER at INRIA/outputs_colab/multilingual/checkpoint-902/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1804
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/TER at INRIA/outputs_colab/multilingual/checkpoint-1804
Configuration saved in ./drive/MyDrive/TER at INRIA/outputs_colab/multilingual/checkpoint-1804/config.json
Model weights saved in ./drive/MyDrive/TER at INRIA/outputs_colab/multilingual/checkpoint-1804/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1804
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/TER at INRIA/outputs_colab/multilingual/checkpoint-2706
Configuration saved in ./drive/MyDrive/TER at INRIA/outp

In [42]:
eval_report=trainer.evaluate()
print(eval_report)

***** Running Evaluation *****
  Num examples = 1804
  Batch size = 8


{'eval_loss': 0.5489883422851562, 'eval_f1_macro': 0.7394437222818122, 'eval_runtime': 35.2403, 'eval_samples_per_second': 51.191, 'eval_steps_per_second': 6.413, 'epoch': 5.0}


In [43]:
X_test_tokenized = tokenizer(list(test_df.clean_text), padding=True, truncation=True, max_length=MAX_LENGTH)

# Create torch dataset
test_dataset = Dataset(X_test_tokenized)

# Make prediction
raw_pred, _, _ = trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

***** Running Prediction *****
  Num examples = 3865
  Batch size = 8


In [44]:
print(metrics.classification_report(test_df.label, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.754     0.747     0.751      1959
           1      0.743     0.749     0.746      1906

    accuracy                          0.748      3865
   macro avg      0.748     0.748     0.748      3865
weighted avg      0.748     0.748     0.748      3865



In [45]:
trainer.save_model(output_dir=PATH_TO_PROJECT + "modelos_finales/multilingual")

Saving model checkpoint to ./drive/MyDrive/TER at INRIA/modelos_finales/multilingual
Configuration saved in ./drive/MyDrive/TER at INRIA/modelos_finales/multilingual/config.json
Model weights saved in ./drive/MyDrive/TER at INRIA/modelos_finales/multilingual/pytorch_model.bin
