In [2]:
!nvidia-smi

Tue Mar 22 21:34:11 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install transformers
!pip install wordsegment
!pip install wandb

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 6.9 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 51.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 3.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 36.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 36.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
import os
from tqdm.notebook import tqdm

import torch


In [6]:
import time
import transformers
from transformers import BertModel, BertweetTokenizer, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader

from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

from wordsegment import load, segment
import warnings

In [7]:
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback

In [8]:
from sklearn import (
    decomposition,
    ensemble,
    feature_extraction,
    linear_model,
    metrics,
    model_selection,
    naive_bayes,
    pipeline,
)
from tqdm.notebook import tqdm

In [9]:
warnings.filterwarnings('ignore')

sns.set_style("darkgrid")
sns.set_context("notebook")

tqdm.pandas()

%config InlineBackend.figure_format="retina"  # For high DPI display
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)

plt.rcParams['figure.dpi'] = 90
plt.rcParams['figure.figsize'] = (11, 7)

%config InlineBackend.figure_format="retina"  # For high DPI display

load()

cuda:0


# Load dataset

In [10]:
PATH_TO_PROJECT = './drive/MyDrive/TER at INRIA/'

In [11]:
df_det = pd.read_csv(PATH_TO_PROJECT + 'datasets para colab/dataset_deteccion_misoginia.csv')
df_iber = pd.read_csv(PATH_TO_PROJECT + 'datasets para colab/dataset_ibereval.csv')
df_mis = pd.read_csv(PATH_TO_PROJECT + 'datasets para colab/dataset_miscorpus.csv')

df = pd.concat([df_det, df_iber, df_mis], axis=0)
df.reset_index(drop=True, inplace=True)

In [12]:
df

Unnamed: 0,text,dataset,label,variation
0,tu mamá la chismosa pinché vieja de Torreón @...,det_misoginia,1,latam
1,"""Presidenta del sindicato de mamás luchonas"" E...",det_misoginia,1,latam
2,"""Weeey nosotros somos el virus ??"" pues órale ...",det_misoginia,1,latam
3,"""ya sé mucho de ese tema y tengo clarito todo ...",det_misoginia,1,latam
4,"#JuegoSurvivor Todos los Alcones me cagan, bol...",det_misoginia,1,latam
...,...,...,...,...
12877,@raquelmad16_97 Porque el sexo como categoría ...,miscorpus,0,mixed
12878,@InesArrimadas Y este hijo de puta le estamos ...,miscorpus,0,europe
12879,"Acaso me falta información, criterio o sentido...",miscorpus,0,europe
12880,"La #CumbredelClima en cuatro claves: Greta, lo...",miscorpus,0,mixed


In [13]:
df.label.value_counts()

0    6504
1    6378
Name: label, dtype: int64

# Data preprocessing

In [14]:
def preprocess_text(text):
    c_text = text
    c_text = re.sub(' +', ' ', c_text)
    c_text = re.sub(r'https?\S+', 'url', c_text)
    c_text = re.sub(r'@\S+', '@user', c_text)
    htags = re.findall( r'#\w+\S+?', c_text)
    for tag in htags:
        c_text = c_text.replace(tag, ' '.join(segment(tag)))
    return c_text

In [15]:
df.shape

(12882, 4)

In [16]:
print(df.label.value_counts())

0    6504
1    6378
Name: label, dtype: int64


In [17]:
df['clean_text'] = df.text.apply(lambda x: preprocess_text(x))

In [18]:
df.head(10)

Unnamed: 0,text,dataset,label,variation,clean_text
0,tu mamá la chismosa pinché vieja de Torreón @...,det_misoginia,1,latam,tu mamá la chismosa pinché vieja de Torreón @...
1,"""Presidenta del sindicato de mamás luchonas"" E...",det_misoginia,1,latam,"""Presidenta del sindicato de mamás luchonas"" E..."
2,"""Weeey nosotros somos el virus ??"" pues órale ...",det_misoginia,1,latam,"""Weeey nosotros somos el virus ??"" pues órale ..."
3,"""ya sé mucho de ese tema y tengo clarito todo ...",det_misoginia,1,latam,"""ya sé mucho de ese tema y tengo clarito todo ..."
4,"#JuegoSurvivor Todos los Alcones me cagan, bol...",det_misoginia,1,latam,"juego survivor Todos los Alcones me cagan, bol..."
5,#JuegoSurvivore cagan las mujeres de halcones ...,det_misoginia,1,latam,juego survivor e cagan las mujeres de halcones...
6,#LadyCinepolis alguien que tenga una hija de l...,det_misoginia,1,latam,lady cine polis alguien que tenga una hija de ...
7,#LadyComeGratis pinche vieja abusona d mierda ...,det_misoginia,1,latam,lady come gratis pinche vieja abusona d mierda...
8,#LadyComeGratis que poca madre! Eso no se vale...,det_misoginia,1,latam,lady come gratis que poca madre! Eso no se val...
9,"#ladycomegratis, alias Brenda Pardo Lemus, es ...",det_misoginia,1,latam,"lady come gratis alias Brenda Pardo Lemus, es ..."


In [19]:
df['variation'].value_counts()

mixed     6800
latam     3596
europe    2486
Name: variation, dtype: int64

In [20]:
df_latam = df.loc[df['variation'] == 'latam']
df_europe = df.loc[df['variation'] == 'europe']

# Modeling

### BETO Trained on LatAm data

In [20]:
PRE_TRAINED_MODEL_NAME = 'dccuchile/bert-base-spanish-wwm-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/236k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/364 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/648 [00:00<?, ?B/s]

In [21]:
model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels=2)

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchi

In [22]:
train_df_latam, test_df_latam = model_selection.train_test_split(
    df_latam, test_size=0.30, random_state=1
)

train_df_europe, test_df_europe = model_selection.train_test_split(
    df_europe, test_size=0.30, random_state=1
)

In [32]:
df_europe.label.value_counts(normalize=True)

1    0.518504
0    0.481496
Name: label, dtype: float64

In [33]:
df_latam.label.value_counts(normalize=True)

1    0.616796
0    0.383204
Name: label, dtype: float64

In [None]:
print('Latam train:', train_df_latam.shape, '\tLatam test:', test_df_latam.shape)
print('Europe train:', train_df_europe.shape, '\tEurope test:', test_df_europe.shape)

Latam train: (2517, 5) 	Latam test: (1079, 5)
Europe train: (1740, 5) 	Europe test: (746, 5)


In [None]:
train_df_latam = train_df_latam.sample(n=train_df_europe.shape[0], replace=False, random_state=0)
test_df_latam = test_df_latam.sample(n=test_df_europe.shape[0], replace=False, random_state=0)

In [None]:
print('Latam train:', train_df_latam.shape, '\tLatam test:', test_df_latam.shape)
print('Europe train:', train_df_europe.shape, '\tEurope test:', test_df_europe.shape)

Latam train: (1740, 5) 	Latam test: (746, 5)
Europe train: (1740, 5) 	Europe test: (746, 5)


In [None]:
X_train_latam, X_val_latam, y_train_latam, y_val_latam = model_selection.train_test_split(train_df_latam.clean_text, train_df_latam.label, test_size=0.2, random_state=1)

In [None]:
sample_txt = X_train_latam.iloc[5]

tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

 Sentence: juego survivor e cagan las mujeres de halcones de todo se quejan así son los juegos mejor no hubieran ido cuando ellas hacen algo porque no se quejan solo lo que les conviene Cara aburrida
   Tokens: ['juego', 'sur', '##vi', '##vor', 'e', 'ca', '##gan', 'las', 'mujeres', 'de', 'hal', '##cones', 'de', 'todo', 'se', 'queja', '##n', 'así', 'son', 'los', 'juegos', 'mejor', 'no', 'hubieran', 'ido', 'cuando', 'ellas', 'hacen', 'algo', 'porque', 'no', 'se', 'queja', '##n', 'solo', 'lo', 'que', 'les', 'conviene', 'Cara', 'aburrida']
Token IDs: [3343, 2902, 1128, 11198, 1007, 1285, 1362, 1089, 2409, 1008, 5440, 19895, 1008, 1397, 1062, 15113, 30935, 1506, 1404, 1065, 6832, 1627, 1084, 10358, 4485, 1486, 3878, 3818, 1513, 1817, 1084, 1062, 15113, 30935, 1942, 1114, 1038, 2027, 12133, 6325, 24037]


In [None]:
MAX_LENGTH = 512

In [None]:
X_train_tokenized = tokenizer(list(X_train_latam), padding=True, truncation=True, max_length=MAX_LENGTH)
X_val_tokenized = tokenizer(list(X_val_latam), padding=True, truncation=True, max_length=MAX_LENGTH)

In [None]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [None]:
train_dataset = Dataset(X_train_tokenized, list(y_train_latam))
val_dataset = Dataset(X_val_tokenized, list(y_val_latam))

In [None]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    f1 = metrics.f1_score(y_true=labels, y_pred=pred, average='macro')
    return {"f1_macro": f1}

In [None]:
# Define Trainer
args = TrainingArguments(
    output_dir=PATH_TO_PROJECT + "outputs_colab/beto_latam",
    evaluation_strategy='epoch',
    num_train_epochs=5,
    seed=0,
    load_best_model_at_end=True,
    logging_strategy='epoch',
    save_strategy='epoch',
    metric_for_best_model='eval_f1_macro',
    greater_is_better=True,
    report_to="wandb",
    run_name='BETO LatAm Fixed'
)

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

API Key: 327c6e1124de6e2165aa7b8c4e195dae9689ad33

In [None]:
# Train pre-trained model
train_results = trainer.train()

***** Running training *****
  Num examples = 1392
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 870
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.5519,0.500911,0.753533
2,0.3519,0.603167,0.816808
3,0.1429,0.881501,0.785474
4,0.0516,1.090711,0.795859
5,0.015,1.088588,0.798093


***** Running Evaluation *****
  Num examples = 348
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/TER at INRIA/outputs_colab/beto_latam/checkpoint-174
Configuration saved in ./drive/MyDrive/TER at INRIA/outputs_colab/beto_latam/checkpoint-174/config.json
Model weights saved in ./drive/MyDrive/TER at INRIA/outputs_colab/beto_latam/checkpoint-174/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 348
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/TER at INRIA/outputs_colab/beto_latam/checkpoint-348
Configuration saved in ./drive/MyDrive/TER at INRIA/outputs_colab/beto_latam/checkpoint-348/config.json
Model weights saved in ./drive/MyDrive/TER at INRIA/outputs_colab/beto_latam/checkpoint-348/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 348
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/TER at INRIA/outputs_colab/beto_latam/checkpoint-522
Configuration saved in ./drive/MyDrive/TER at INRIA/outputs_colab/beto_latam/

In [None]:
eval_report=trainer.evaluate()
print(eval_report)

***** Running Evaluation *****
  Num examples = 348
  Batch size = 8


{'eval_loss': 0.6031667590141296, 'eval_f1_macro': 0.8168077873642698, 'eval_runtime': 4.6817, 'eval_samples_per_second': 74.332, 'eval_steps_per_second': 9.398, 'epoch': 5.0}


In [None]:
trainer.save_model(output_dir=PATH_TO_PROJECT + "modelos_finales/beto_latam")

Saving model checkpoint to ./drive/MyDrive/TER at INRIA/modelos_finales/beto_latam
Configuration saved in ./drive/MyDrive/TER at INRIA/modelos_finales/beto_latam/config.json
Model weights saved in ./drive/MyDrive/TER at INRIA/modelos_finales/beto_latam/pytorch_model.bin


### Predict on Test Data

#### LatAm data

In [None]:
X_test_tokenized = tokenizer(list(test_df_latam.clean_text), padding=True, truncation=True, max_length=MAX_LENGTH)

# Create torch dataset
test_dataset = Dataset(X_test_tokenized)

# Make prediction
raw_pred, _, _ = trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

***** Running Prediction *****
  Num examples = 746
  Batch size = 8


In [None]:
print(metrics.classification_report(test_df_latam.label, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.846     0.732     0.785       284
           1      0.848     0.918     0.881       462

    accuracy                          0.847       746
   macro avg      0.847     0.825     0.833       746
weighted avg      0.847     0.847     0.845       746



#### Europe data

In [None]:
X_test_tokenized = tokenizer(list(test_df_europe.clean_text), padding=True, truncation=True, max_length=MAX_LENGTH)

# Create torch dataset
test_dataset = Dataset(X_test_tokenized)

# Make prediction
raw_pred, _, _ = trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

***** Running Prediction *****
  Num examples = 746
  Batch size = 8


In [None]:
print(metrics.classification_report(test_df_europe.label, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.881     0.623     0.730       369
           1      0.713     0.918     0.803       377

    accuracy                          0.772       746
   macro avg      0.797     0.771     0.766       746
weighted avg      0.796     0.772     0.767       746



### BETO Trained on Europe data

- look for: Gradient accumulation
- check the default batch size
- change max_length

In [None]:
X_train_europe, X_val_europe, y_train_europe, y_val_europe = model_selection.train_test_split(train_df_europe.clean_text, train_df_europe.label, test_size=0.2, random_state=1)

In [None]:
X_train_tokenized = tokenizer(list(X_train_europe), padding=True, truncation=True, max_length=MAX_LENGTH)
X_val_tokenized = tokenizer(list(X_val_europe), padding=True, truncation=True, max_length=MAX_LENGTH)

In [None]:
train_dataset = Dataset(X_train_tokenized, list(y_train_europe))
val_dataset = Dataset(X_val_tokenized, list(y_val_europe))

In [None]:
# Define Trainer
args = TrainingArguments(
    output_dir=PATH_TO_PROJECT + "outputs_colab/beto_europe",
    evaluation_strategy='epoch',
    num_train_epochs=5,
    seed=0,
    load_best_model_at_end=True,
    logging_strategy='epoch',
    save_strategy='epoch',
    metric_for_best_model='eval_f1_macro',
    greater_is_better=True,
    report_to="wandb",
    run_name='BETO Europe Fixed'
)

PyTorch: setting up devices


In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# Train pre-trained model
train_results = trainer.train()

***** Running training *****
  Num examples = 1392
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 870
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.4385,0.543666,0.821732
2,0.2256,0.371315,0.913058
3,0.1084,0.499811,0.889031
4,0.0326,0.574289,0.892713
5,0.0053,0.591143,0.904598


***** Running Evaluation *****
  Num examples = 348
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/TER at INRIA/outputs_colab/beto_europe/checkpoint-174
Configuration saved in ./drive/MyDrive/TER at INRIA/outputs_colab/beto_europe/checkpoint-174/config.json
Model weights saved in ./drive/MyDrive/TER at INRIA/outputs_colab/beto_europe/checkpoint-174/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 348
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/TER at INRIA/outputs_colab/beto_europe/checkpoint-348
Configuration saved in ./drive/MyDrive/TER at INRIA/outputs_colab/beto_europe/checkpoint-348/config.json
Model weights saved in ./drive/MyDrive/TER at INRIA/outputs_colab/beto_europe/checkpoint-348/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 348
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/TER at INRIA/outputs_colab/beto_europe/checkpoint-522
Configuration saved in ./drive/MyDrive/TER at INRIA/outputs_colab/beto

In [None]:
eval_report=trainer.evaluate()
print(eval_report)

***** Running Evaluation *****
  Num examples = 348
  Batch size = 8


{'eval_loss': 0.37131479382514954, 'eval_f1_macro': 0.9130579613590939, 'eval_runtime': 5.3447, 'eval_samples_per_second': 65.111, 'eval_steps_per_second': 8.232, 'epoch': 5.0}


In [None]:
trainer.save_model(output_dir=PATH_TO_PROJECT + "modelos_finales/beto_europe")

Saving model checkpoint to ./drive/MyDrive/TER at INRIA/modelos_finales/beto_europe
Configuration saved in ./drive/MyDrive/TER at INRIA/modelos_finales/beto_europe/config.json
Model weights saved in ./drive/MyDrive/TER at INRIA/modelos_finales/beto_europe/pytorch_model.bin


### Predict on Test Data

#### Europe Data

In [None]:
X_test_tokenized = tokenizer(list(test_df_europe.clean_text), padding=True, truncation=True, max_length=MAX_LENGTH)

# Create torch dataset
test_dataset = Dataset(X_test_tokenized)

# Make prediction
raw_pred, _, _ = trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

***** Running Prediction *****
  Num examples = 746
  Batch size = 8


In [None]:
print(metrics.classification_report(test_df_europe.label, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.918     0.905     0.911       369
           1      0.908     0.920     0.914       377

    accuracy                          0.913       746
   macro avg      0.913     0.913     0.913       746
weighted avg      0.913     0.913     0.913       746



#### LatAm Data

In [None]:
X_test_tokenized = tokenizer(list(test_df_latam.clean_text), padding=True, truncation=True, max_length=MAX_LENGTH)

# Create torch dataset
test_dataset = Dataset(X_test_tokenized)

# Make prediction
raw_pred, _, _ = trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

***** Running Prediction *****
  Num examples = 746
  Batch size = 8


In [None]:
print(metrics.classification_report(test_df_latam.label, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.686     0.739     0.712       284
           1      0.832     0.792     0.812       462

    accuracy                          0.772       746
   macro avg      0.759     0.766     0.762       746
weighted avg      0.776     0.772     0.774       746

