# **Proyecto Equipo 8. Detección temprana de riesgo de suicidio**

In [None]:
# Importar e instalar dependencias
import pandas as pd
import numpy as np
import re
from collections import Counter

# CARGA DE DATOS

In [None]:
!pip install gdown
# Descarga el archivo del dataset de drive usando gdown
url = 'https://drive.google.com/file/d/1z3fmc6QKE71_OQ3Poy-fTq1DpJcoK-Z_/view?usp=drive_link'
file_id = url.split('/')[-2]
!gdown --id $file_id

# Carga el dataset usando pandas
sd = pd.read_csv('/content/Suicide_Detection_clean_2.csv')

Downloading...
From (original): https://drive.google.com/uc?id=1z3fmc6QKE71_OQ3Poy-fTq1DpJcoK-Z_
From (redirected): https://drive.google.com/uc?id=1z3fmc6QKE71_OQ3Poy-fTq1DpJcoK-Z_&confirm=t&uuid=7f0bac94-f4ff-4a20-8ce8-f6401f0897d4
To: /content/Suicide_Detection_clean_2.csv
100% 227M/227M [00:01<00:00, 143MB/s]


# MODELADO, ENTRENAMIENTO Y EVALUACIÓN

## Modelo DistilBERT

In [None]:
!pip install transformers torch evaluate datasets optuna
import optuna
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, DistilBertConfig, EarlyStoppingCallback
import torch
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Expects the raw DataFrame `sd` (with 'text' and 'class' columns) loaded by an earlier cell.
# Subsample to speed up experimentation; the size is capped at the frame length
# so a dataset with fewer than 20000 rows does not make `sample` raise.
sd = sd.sample(n=min(20000, len(sd)), random_state=42)
pre_processed = sd.copy()

# Train/test split, stratified on the label so both splits keep the same class ratio
train_data, test_data = train_test_split(
    pre_processed,
    test_size=0.2,
    random_state=42,
    stratify=pre_processed['class'],
)

# Encoder that maps the raw class labels to integer ids
label_encoder = LabelEncoder()

# Fit on the training labels only, then reuse the same mapping for the test labels.
# `assign` builds new frames instead of writing into the frames returned by
# train_test_split, which can otherwise trigger pandas' SettingWithCopyWarning.
train_data = train_data.assign(**{'class': label_encoder.fit_transform(train_data['class'])})
test_data = test_data.assign(**{'class': label_encoder.transform(test_data['class'])})

# Load the DistilBERT tokenizer
model_name = 'distilbert-base-uncased'
tokenizer_bert = DistilBertTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)

def encode_data(examples):
    """Tokenize a batch of examples with the module-level DistilBERT tokenizer.

    Args:
        examples: Mapping whose 'text' entry holds the raw texts to encode.

    Returns:
        The result of calling ``tokenizer_bert`` on those texts with
        truncation enabled and padding to a fixed length of 128 tokens.
    """
    tokenizer_options = {'truncation': True, 'padding': 'max_length', 'max_length': 128}
    return tokenizer_bert(examples['text'], **tokenizer_options)

# Build the Hugging Face datasets from the pandas frames: tokenize in batches
# with encode_data, then rename the 'class' column to 'labels'
train_dataset = (
    Dataset.from_pandas(train_data)
    .map(encode_data, batched=True)
    .rename_column('class', 'labels')
)
test_dataset = (
    Dataset.from_pandas(test_data)
    .map(encode_data, batched=True)
    .rename_column('class', 'labels')
)

# Serve the three listed columns in torch format
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Metric computation for the Trainer
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Precision, recall and F1 all use the same 'weighted' averaging
    weighted_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    weighted = {
        metric: scorer(y_true, y_pred, average='weighted')
        for metric, scorer in weighted_scorers
    }

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': weighted['f1'],
        'precision': weighted['precision'],
        'recall': weighted['recall'],
    }

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
INFO: pip is loo

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [None]:
def objective(trial):
    """Optuna objective: fine-tune DistilBERT with hyperparameters sampled from ``trial``.

    Uses the module-level ``model_name``, ``train_dataset`` and
    ``compute_metrics``. The model is trained and scored on a train/validation
    split carved out of ``train_dataset``; the test set is not touched here so
    that it is never used for hyperparameter selection.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ``eval_f1`` value reported by ``Trainer.evaluate()`` on the
        validation split (with ``load_best_model_at_end=True`` and
        ``metric_for_best_model="f1"``).
    """
    # Hyperparameter search space. The trial keys are kept unchanged so that
    # code reading ``study.best_params`` keeps working.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 3, 8)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64])
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.0, 0.2)
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.1)
    hidden_dropout_prob = trial.suggest_float("hidden_dropout_prob", 0.1, 0.5)
    attention_probs_dropout_prob = trial.suggest_float("attention_probs_dropout_prob", 0.1, 0.5)

    # Custom DistilBERT configuration. DistilBertConfig names its dropout
    # fields `dropout` and `attention_dropout`; the BERT-style names
    # (`hidden_dropout_prob`, `attention_probs_dropout_prob`) are not
    # DistilBERT fields, so assigning to them left the model's dropout unchanged.
    custom_config = DistilBertConfig.from_pretrained(model_name)
    custom_config.dropout = hidden_dropout_prob
    custom_config.attention_dropout = attention_probs_dropout_prob
    custom_config.num_labels = 2

    # Load DistilBERT with the custom configuration
    model = DistilBertForSequenceClassification.from_pretrained(model_name, config=custom_config)

    # Validation split for tuning; the fixed seed gives every trial the same split
    tuning_split = train_dataset.train_test_split(test_size=0.1, seed=42)

    # Training arguments: evaluate and checkpoint once per epoch so the best
    # epoch (by F1) can be restored at the end of training
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=64,
        learning_rate=learning_rate,
        warmup_ratio=warmup_ratio,
        weight_decay=weight_decay,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
    )

    # Trainer with early stopping after 3 evaluations without improvement
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tuning_split['train'],
        eval_dataset=tuning_split['test'],
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train the model
    trainer.train()

    # Score the model on the validation split
    eval_results = trainer.evaluate()

    return eval_results["eval_f1"]

# Optuna study backed by persistent storage (a local SQLite database), so an
# existing study with the same name is loaded instead of being recreated
study_name = "my_study"  # name under which the study is stored
storage_name = f"sqlite:///{study_name}.db"  # storage URL derived from the study name
study = optuna.create_study(
    direction="maximize",
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
)

# Run the search; adjust n_trials as needed
study.optimize(objective, n_trials=5)

# Report the best trial: its objective value and the hyperparameters behind it
print("Mejor trial:")
trial = study.best_trial

print("Valor:", trial.value)
print("Mejores hiperparámetros:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Train the final model with the best hyperparameters found by the study
best_params = study.best_params

# Final model configuration. DistilBertConfig names its dropout fields
# `dropout` and `attention_dropout`; the BERT-style attribute names used
# before are not DistilBERT fields, so the tuned dropout values were never applied.
final_config = DistilBertConfig.from_pretrained(model_name)
final_config.dropout = best_params["hidden_dropout_prob"]
final_config.attention_dropout = best_params["attention_probs_dropout_prob"]
final_config.num_labels = 2

final_model = DistilBertForSequenceClassification.from_pretrained(model_name, config=final_config)

# Hold out part of the training data for early stopping and best-checkpoint
# selection, so the test set is only used for the final report below
final_split = train_dataset.train_test_split(test_size=0.1, seed=42)

final_training_args = TrainingArguments(
    output_dir='./final_results',
    num_train_epochs=best_params["num_train_epochs"],
    per_device_train_batch_size=best_params["per_device_train_batch_size"],
    per_device_eval_batch_size=64,
    learning_rate=best_params["learning_rate"],
    warmup_ratio=best_params["warmup_ratio"],
    weight_decay=best_params["weight_decay"],
    logging_dir='./final_logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

final_trainer = Trainer(
    model=final_model,
    args=final_training_args,
    train_dataset=final_split['train'],
    eval_dataset=final_split['test'],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the final model
final_trainer.train()

# Evaluate the final model on the untouched test set
final_eval_results = final_trainer.evaluate(eval_dataset=test_dataset)

print("Resultados finales de evaluación:")
print(final_eval_results)

# Predict on the test set
predictions = final_trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=-1)
# Use the labels returned alongside the predictions so both arrays come from
# the same prediction pass
true_labels = predictions.label_ids

accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")



[I 2024-10-17 14:25:56,878] A new study created in RDB with name: my_study


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1395,0.132626,0.95425,0.954245,0.954304,0.95425
2,0.1345,0.116149,0.9615,0.961499,0.961509,0.9615
3,0.0855,0.130674,0.961,0.960994,0.96111,0.961
4,0.0623,0.145272,0.96125,0.961245,0.961328,0.96125
5,0.0394,0.158712,0.96125,0.961249,0.961254,0.96125


[I 2024-10-17 14:41:30,858] Trial 0 finished with value: 0.9614985559250832 and parameters: {'learning_rate': 1.0746566580811717e-05, 'num_train_epochs': 6, 'per_device_train_batch_size': 32, 'warmup_ratio': 0.1523165249347074, 'weight_decay': 0.05388053983243142, 'hidden_dropout_prob': 0.16050991134845016, 'attention_probs_dropout_prob': 0.3727671631916408}. Best is trial 0 with value: 0.9614985559250832.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1284,0.116318,0.95825,0.958237,0.958513,0.95825
2,0.0518,0.117459,0.95675,0.956735,0.957057,0.95675
3,0.0362,0.144484,0.961,0.960997,0.961041,0.961
4,0.0212,0.17754,0.9645,0.964498,0.964514,0.9645
5,0.0023,0.185977,0.963,0.962999,0.963003,0.963
6,0.0046,0.209428,0.96375,0.963751,0.963779,0.96375
7,0.0011,0.213421,0.963,0.963,0.963,0.963


[I 2024-10-17 15:02:11,097] Trial 1 finished with value: 0.964498348853312 and parameters: {'learning_rate': 4.4436581037951944e-05, 'num_train_epochs': 7, 'per_device_train_batch_size': 64, 'warmup_ratio': 0.04726479107333825, 'weight_decay': 0.06278275708063547, 'hidden_dropout_prob': 0.38781322868721324, 'attention_probs_dropout_prob': 0.29699285752327564}. Best is trial 1 with value: 0.964498348853312.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1633,0.145708,0.94875,0.948709,0.949653,0.94875
2,0.083,0.119005,0.9565,0.956469,0.957285,0.9565
3,0.0645,0.161144,0.95175,0.951691,0.953268,0.95175
4,0.0239,0.195353,0.955,0.95497,0.955744,0.955
5,0.0072,0.181515,0.96475,0.964751,0.964841,0.96475
6,0.002,0.202846,0.96325,0.963251,0.963267,0.96325
7,0.0004,0.204974,0.9625,0.962499,0.962501,0.9625


[I 2024-10-17 15:22:50,867] Trial 2 finished with value: 0.9647513681858606 and parameters: {'learning_rate': 3.242765985747258e-05, 'num_train_epochs': 7, 'per_device_train_batch_size': 64, 'warmup_ratio': 0.15388053592928025, 'weight_decay': 0.0512759710884516, 'hidden_dropout_prob': 0.4812207762931957, 'attention_probs_dropout_prob': 0.3955136133368543}. Best is trial 2 with value: 0.9647513681858606.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1565,0.15107,0.95425,0.954211,0.955251,0.95425
2,0.0687,0.127139,0.96275,0.962739,0.962995,0.96275
3,0.0093,0.178044,0.95975,0.95973,0.96025,0.95975
4,0.0283,0.202301,0.96325,0.96325,0.96325,0.96325
5,0.0538,0.207633,0.9615,0.961498,0.961514,0.9615


[I 2024-10-17 15:39:53,904] Trial 3 finished with value: 0.9632501125637666 and parameters: {'learning_rate': 1.9620272496607295e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.04329566469747129, 'weight_decay': 0.06828769888705809, 'hidden_dropout_prob': 0.3162826509014548, 'attention_probs_dropout_prob': 0.3310290203935703}. Best is trial 2 with value: 0.9647513681858606.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1421,0.134172,0.95625,0.956233,0.956606,0.95625
2,0.0722,0.134366,0.962,0.961993,0.962141,0.962
3,0.0555,0.185309,0.9605,0.960476,0.961148,0.9605
4,0.0368,0.194369,0.9625,0.962498,0.962519,0.9625
