# **Proyecto Equipo 8. Detección temprana de riesgo de suicidio**

In [None]:
# Importar e instalar dependencias
import pandas as pd
import numpy as np
import re
from collections import Counter

# CARGA DE DATOS

In [None]:
from sklearn.model_selection import train_test_split
!pip install gdown
# Descarga el archivo del dataset en inglés de drive usando gdown
url = 'https://drive.google.com/file/d/1pBs7QN2yZa8aV_hE00oLP6s0cyHm9QZI/view?usp=drive_link'
file_id = url.split('/')[-2]
!gdown --id $file_id

# Carga el dataset en inglés usando pandas
sd_E = pd.read_csv('/content/Suicide_Detection_clean_bilingual.csv')

# Descarga el archivo del dataset en español de drive usando gdown
url = 'https://drive.google.com/file/d/15MnzzWYVz7R0KFPgldUi82SWbEjq1W5w/view?usp=drive_link'
file_id = url.split('/')[-2]
!gdown --id $file_id

# Carga el dataset en español usando pandas
sd_S = pd.read_csv('/content/Suicide_Detection_clean_translated.csv')

# Combina los dos datasets
combined_data = pd.concat([sd_E, sd_S], ignore_index=True)

# Shuffle the combined dataset
combined_data = combined_data.sample(frac=1).reset_index(drop=True)

# Split the combined dataset into training and testing sets
train_data, test_data = train_test_split(combined_data, test_size=0.2, random_state=42)

Downloading...
From: https://drive.google.com/uc?id=1pBs7QN2yZa8aV_hE00oLP6s0cyHm9QZI
To: /content/Suicide_Detection_clean_bilingual.csv
100% 9.91M/9.91M [00:00<00:00, 135MB/s]
Downloading...
From: https://drive.google.com/uc?id=15MnzzWYVz7R0KFPgldUi82SWbEjq1W5w
To: /content/Suicide_Detection_clean_translated.csv
100% 7.27M/7.27M [00:00<00:00, 66.5MB/s]


# MODELADO, ENTRENAMIENTO Y EVALUACIÓN

## Modelo DistilBERT

In [None]:
!pip install transformers torch evaluate datasets optuna
import optuna
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, DistilBertConfig, EarlyStoppingCallback
import torch
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Work on a copy of the combined dataframe built in the data-loading cell so the
# original frame is left untouched.
# To speed up experiments, subsample here instead, e.g.:
#   pre_processed = combined_data.sample(n=20000, random_state=42)
pre_processed = combined_data.copy()

# Split into training (80%) and testing (20%) sets.
train_data, test_data = train_test_split(pre_processed, test_size=0.2, random_state=42)

# Take explicit copies: the frames returned by train_test_split are selections of
# `pre_processed`, and assigning a column on them can raise pandas'
# SettingWithCopyWarning.
train_data, test_data = train_data.copy(), test_data.copy()

# Encode the string labels in 'class' as integers.
label_encoder = LabelEncoder()

# Fit the encoder on the training labels only, then reuse the same mapping for test.
train_data['class'] = label_encoder.fit_transform(train_data['class'])
test_data['class'] = label_encoder.transform(test_data['class'])

# Multilingual DistilBERT checkpoint; the same name is reused when loading the model.
model_name = 'distilbert-base-multilingual-cased'
tokenizer_bert = DistilBertTokenizer.from_pretrained(
    model_name,
    clean_up_tokenization_spaces=True,
)


def encode_data(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: mapping with a 'text' entry, as passed by `Dataset.map`.

    Returns:
        The tokenizer output for the given texts.
    """
    tokenizer_kwargs = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 128,
    }
    return tokenizer_bert(examples['text'], **tokenizer_kwargs)

# Build the Hugging Face datasets: convert each pandas frame, tokenize it in
# batches, and rename 'class' to 'labels' (the column name used for the targets
# in the set_format call below).
train_dataset = (
    Dataset.from_pandas(train_data)
    .map(encode_data, batched=True)
    .rename_column('class', 'labels')
)
test_dataset = (
    Dataset.from_pandas(test_data)
    .map(encode_data, batched=True)
    .rename_column('class', 'labels')
)

# Expose only the model inputs and the labels, returned as torch tensors.
_model_columns = ['input_ids', 'attention_mask', 'labels']
for _split in (train_dataset, test_dataset):
    _split.set_format('torch', columns=_model_columns)

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    Args:
        pred: object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    weighted = {'average': 'weighted'}
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, **weighted),
        'precision': precision_score(y_true, y_pred, **weighted),
        'recall': recall_score(y_true, y_pred, **weighted),
    }

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Map:   0%|          | 0/15998 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [None]:
def objective(trial):
    """Optuna objective: fine-tune DistilBERT with sampled hyperparameters.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The weighted F1 score ("eval_f1") of the best checkpoint, measured on a
        validation split held out from the training data.
    """
    # Hyperparameter search space
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 3, 5)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64])
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.0, 0.2)
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.1)
    # The Optuna parameter names are kept unchanged so that previously stored
    # studies and any code reading `study.best_params` keep working.
    hidden_dropout_prob = trial.suggest_float("hidden_dropout_prob", 0.1, 0.5)
    attention_probs_dropout_prob = trial.suggest_float("attention_probs_dropout_prob", 0.1, 0.5)

    # Custom DistilBERT configuration.
    # DistilBertConfig names its dropout fields `dropout` and `attention_dropout`;
    # the BERT-style `hidden_dropout_prob` / `attention_probs_dropout_prob`
    # attributes are not read by the DistilBERT model, so setting them had no effect.
    custom_config = DistilBertConfig.from_pretrained(model_name)
    custom_config.dropout = hidden_dropout_prob
    custom_config.attention_dropout = attention_probs_dropout_prob
    custom_config.num_labels = 2

    # Load DistilBERT with the custom configuration
    model = DistilBertForSequenceClassification.from_pretrained(model_name, config=custom_config)

    # Hold out 10% of the training data for validation so that hyperparameters are
    # not selected on the test set. The fixed seed gives every trial the same split.
    tuning_split = train_dataset.train_test_split(test_size=0.1, seed=42)

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=64,
        learning_rate=learning_rate,
        warmup_ratio=warmup_ratio,
        weight_decay=weight_decay,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,  # reload the checkpoint with the best F1 after training
        metric_for_best_model="f1",
        report_to='none'
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tuning_split["train"],
        eval_dataset=tuning_split["test"],
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train the model
    trainer.train()

    # Evaluate the (best) model on the validation split
    eval_results = trainer.evaluate()

    return eval_results["eval_f1"]

# Create the Optuna study, or load it if it already exists. The study is stored
# in a local SQLite database named after the study.
study_name = "my_study"
storage_name = f"sqlite:///{study_name}.db"
study = optuna.create_study(
    direction="maximize",
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
)

# Run the search; adjust n_trials as needed.
study.optimize(objective, n_trials=5)

# Report the best trial and its hyperparameters.
print("Mejor trial:")
trial = study.best_trial

print("Valor:", trial.value)
print("Mejores hiperparámetros:")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")





[I 2024-10-29 22:07:07,062] A new study created in RDB with name: my_study


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2263,0.191442,0.9275,0.927502,0.927511,0.9275
2,0.137,0.193366,0.93525,0.935227,0.936699,0.93525
3,0.0661,0.20436,0.941,0.941002,0.941346,0.941
4,0.0621,0.232134,0.944,0.943997,0.944607,0.944
5,0.0632,0.235423,0.946,0.946002,0.946255,0.946


[I 2024-10-29 22:26:40,041] Trial 0 finished with value: 0.9460024840397445 and parameters: {'learning_rate': 1.6428538918460286e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'warmup_ratio': 0.04642184183972891, 'weight_decay': 0.05506492008170195, 'hidden_dropout_prob': 0.3066579240136432, 'attention_probs_dropout_prob': 0.13761244098712752}. Best is trial 0 with value: 0.9460024840397445.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.244,0.20218,0.931,0.930994,0.931733,0.931
2,0.1197,0.203771,0.9325,0.932458,0.934577,0.9325
3,0.064,0.230465,0.94775,0.947752,0.947995,0.94775
4,0.0758,0.258927,0.94625,0.946252,0.946561,0.94625
5,0.0104,0.271305,0.9475,0.947503,0.947563,0.9475


[I 2024-10-29 22:46:55,029] Trial 1 finished with value: 0.9477524982482496 and parameters: {'learning_rate': 3.535586516864203e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'warmup_ratio': 0.12167837882171534, 'weight_decay': 0.08554228179698739, 'hidden_dropout_prob': 0.11404327046814006, 'attention_probs_dropout_prob': 0.10759979921182015}. Best is trial 1 with value: 0.9477524982482496.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2249,0.215189,0.9255,0.925422,0.928469,0.9255
2,0.0951,0.194668,0.944,0.943994,0.944044,0.944
3,0.0204,0.223169,0.946,0.946001,0.946373,0.946
4,0.1086,0.265948,0.9475,0.947499,0.948045,0.9475
5,0.0241,0.286275,0.9515,0.951502,0.951521,0.9515


[I 2024-10-29 23:09:15,120] Trial 2 finished with value: 0.9515018190342241 and parameters: {'learning_rate': 2.891291834286245e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.07898706745087024, 'weight_decay': 0.061503502385077645, 'hidden_dropout_prob': 0.29757922670909154, 'attention_probs_dropout_prob': 0.41663359644105935}. Best is trial 2 with value: 0.9515018190342241.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.235,0.229873,0.92025,0.920157,0.923443,0.92025
2,0.1058,0.193597,0.9415,0.941503,0.941563,0.9415
3,0.0963,0.23641,0.942,0.941998,0.942573,0.942
4,0.0638,0.272616,0.942,0.941981,0.943389,0.942
5,0.0146,0.289751,0.94475,0.944753,0.944918,0.94475


[I 2024-10-29 23:35:24,954] Trial 3 finished with value: 0.9447531942654009 and parameters: {'learning_rate': 2.002525934928923e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.15257757872522446, 'weight_decay': 0.09326768201680648, 'hidden_dropout_prob': 0.25474132148728273, 'attention_probs_dropout_prob': 0.16913923016636156}. Best is trial 2 with value: 0.9515018190342241.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1925,0.192342,0.9285,0.928462,0.93033,0.9285
2,0.1331,0.197638,0.946,0.945983,0.946217,0.946
3,0.0025,0.241672,0.9485,0.948503,0.948553,0.9485
4,0.0997,0.276054,0.94825,0.948252,0.948281,0.94825


[I 2024-10-29 23:54:20,311] Trial 4 finished with value: 0.9485027040481213 and parameters: {'learning_rate': 3.598334057756973e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.09310920150745013, 'weight_decay': 0.08788072312651653, 'hidden_dropout_prob': 0.22445638068934493, 'attention_probs_dropout_prob': 0.13013150807840992}. Best is trial 2 with value: 0.9515018190342241.


Mejor trial:
Valor: 0.9515018190342241
Mejores hiperparámetros:
    learning_rate: 2.891291834286245e-05
    num_train_epochs: 5
    per_device_train_batch_size: 16
    warmup_ratio: 0.07898706745087024
    weight_decay: 0.061503502385077645
    hidden_dropout_prob: 0.29757922670909154
    attention_probs_dropout_prob: 0.41663359644105935


OSError: istilbert-base-multilingual-cased is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
# Train the final model with the best hyperparameters found by Optuna
best_params = study.best_params

# Final model configuration.
# DistilBertConfig names its dropout fields `dropout` and `attention_dropout`;
# the BERT-style `hidden_dropout_prob` / `attention_probs_dropout_prob`
# attributes are not read by the DistilBERT model, so setting them had no effect.
final_config = DistilBertConfig.from_pretrained(model_name)
final_config.dropout = best_params["hidden_dropout_prob"]
final_config.attention_dropout = best_params["attention_probs_dropout_prob"]
final_config.num_labels = 2

final_model = DistilBertForSequenceClassification.from_pretrained(model_name, config=final_config)

final_training_args = TrainingArguments(
    output_dir='./final_results',
    num_train_epochs=best_params["num_train_epochs"],
    per_device_train_batch_size=best_params["per_device_train_batch_size"],
    per_device_eval_batch_size=64,
    learning_rate=best_params["learning_rate"],
    warmup_ratio=best_params["warmup_ratio"],
    weight_decay=best_params["weight_decay"],
    logging_dir='./final_logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,  # reload the checkpoint with the best F1 after training
    metric_for_best_model="f1",
    report_to='none'
)

# NOTE(review): the best checkpoint is selected on `test_dataset`, so the metrics
# reported below are optimistic; consider selecting on a separate validation split.
final_trainer = Trainer(
    model=final_model,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the final model
final_trainer.train()

# Evaluate the final model
final_eval_results = final_trainer.evaluate()

print("Resultados finales de evaluación:")
print(final_eval_results)

# Predict on the test set. `label_ids` holds the true labels aligned with the
# predictions as a NumPy array, so the torch-formatted dataset column does not
# need to be read again.
predictions = final_trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2045,0.224379,0.92425,0.924145,0.927931,0.92425
2,0.1044,0.19144,0.94375,0.943751,0.94411,0.94375
3,0.0551,0.26148,0.94375,0.943745,0.944514,0.94375
4,0.0458,0.2695,0.94575,0.945752,0.945763,0.94575
5,0.0232,0.305091,0.94775,0.947752,0.947789,0.94775


Resultados finales de evaluación:
{'eval_loss': 0.3050908148288727, 'eval_accuracy': 0.94775, 'eval_f1': 0.9477524985193438, 'eval_precision': 0.947789235797554, 'eval_recall': 0.94775, 'eval_runtime': 16.9514, 'eval_samples_per_second': 235.969, 'eval_steps_per_second': 3.717, 'epoch': 5.0}
Accuracy: 0.94775
Precision: 0.947789235797554
Recall: 0.94775
F1-Score: 0.9477524985193438


In [None]:
# Save only the weights (state dict) of the trained final model.
# `model` exists only as a local variable inside `objective`; the model trained
# with the best hyperparameters is `final_model`.
torch.save(final_model.state_dict(), 'pesos_modelo_suc.pth')

In [None]:
!zip -r /content/pesos_modelo_suc.pth.zip /content/pesos_modelo.pth

  adding: content/pesos_modelo.pth (deflated 7%)


In [None]:
from google.colab import files
# Run this after the weights have been saved and zipped: it passes the archive
# name to Colab's files.download.
files.download('pesos_modelo_suc.pth.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>