In [None]:
!pip install datasets transformers torch

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
from datasets import load_dataset
import warnings
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import gc
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os

# ignore warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# load dataset
dataset = load_dataset("samzirbo/europarl.en-es.gendered")

print("Libraries and datasets loaded!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/144 [00:00<?, ?B/s]

europarl.en-es.simple.json:   0%|          | 0.00/526M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1419507 [00:00<?, ? examples/s]

Libraries and datasets loaded!


In [None]:
# convert to pandas
df = dataset['train'].to_pandas()

# remove neutral samples
df = df[df['gender'] != 'neutral']

# balance classes by random undersampling
min_samples = df['gender'].value_counts().min()
df_balanced = df.groupby('gender').apply(lambda x: x.sample(min_samples, random_state=42)).reset_index(drop=True)

# only keep English text
df_balanced = df_balanced[['en', 'gender']]

df_balanced

  df_balanced = df.groupby('gender').apply(lambda x: x.sample(min_samples, random_state=42)).reset_index(drop=True)


Unnamed: 0,en,gender
0,This creativity and innovation will indeed be ...,female
1,"Much to the delight of Parliament , the Counci...",female
2,As far as the rights of artists are concerned ...,female
3,Special tax levels for employers who employ Ro...,female
4,This shows that there has to be coordination .,female
...,...,...
742303,"Madam President , I would like first of all to...",male
742304,"For the United States and Canada , for example...",male
742305,Mr Deß said that money is going into shady pro...,male
742306,I cannot imagine a responsible development str...,male


In [None]:
# clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9.,!?\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_balanced['cleaned_text'] = df_balanced['en'].apply(clean_text)

# encode labels
label_encoder = LabelEncoder()
df_balanced['label'] = label_encoder.fit_transform(df_balanced['gender'])

df_balanced

Unnamed: 0,en,gender,cleaned_text,label
0,This creativity and innovation will indeed be ...,female,this creativity and innovation will indeed be ...,0
1,"Much to the delight of Parliament , the Counci...",female,"much to the delight of parliament , the counci...",0
2,As far as the rights of artists are concerned ...,female,as far as the rights of artists are concerned ...,0
3,Special tax levels for employers who employ Ro...,female,special tax levels for employers who employ ro...,0
4,This shows that there has to be coordination .,female,this shows that there has to be coordination .,0
...,...,...,...,...
742303,"Madam President , I would like first of all to...",male,"madam president , i would like first of all to...",1
742304,"For the United States and Canada , for example...",male,"for the united states and canada , for example...",1
742305,Mr Deß said that money is going into shady pro...,male,mr de said that money is going into shady proj...,1
742306,I cannot imagine a responsible development str...,male,i cannot imagine a responsible development str...,1


In [5]:
# split dataset
X_train, X_test, y_train, y_test = train_test_split(df_balanced['cleaned_text'], df_balanced['label'], test_size=0.25, random_state=63)

# load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-large", num_labels=2)

# define dataset class
class GenderDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# create datasets
train_dataset = GenderDataset(X_train, y_train, tokenizer)
test_dataset = GenderDataset(X_test, y_test, tokenizer)

# define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to="none",
    gradient_accumulation_steps=2,
    warmup_steps=500,
)

# define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# training model
trainer.train()

# saving first epoch
trainer.save_model("./gender_classification_checkpoint")
tokenizer.save_pretrained("./gender_classification_checkpoint")

# clean VRAM
del model
del tokenizer
torch.cuda.empty_cache()
gc.collect()

print("First epoch completed. Model saved. VRAM cleaned.")

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,1.3101,0.657243


First epoch completed. Model saved. VRAM cleaned.
