In [None]:
# Importar e instalar dependencias
import pandas as pd
import numpy as np
import re
from collections import Counter

# CARGA DE DATOS

In [None]:
from sklearn.model_selection import train_test_split
!pip install gdown
# Descarga el archivo del dataset en inglés de drive usando gdown
url = 'https://drive.google.com/file/d/1pBs7QN2yZa8aV_hE00oLP6s0cyHm9QZI/view?usp=drive_link'
file_id = url.split('/')[-2]
!gdown --id $file_id

# Carga el dataset en inglés usando pandas
sd_E = pd.read_csv('/content/Suicide_Detection_clean_bilingual.csv')

# Descarga el archivo del dataset en español de drive usando gdown
url = 'https://drive.google.com/file/d/15MnzzWYVz7R0KFPgldUi82SWbEjq1W5w/view?usp=drive_link'
file_id = url.split('/')[-2]
!gdown --id $file_id

# Carga el dataset en español usando pandas
sd_S = pd.read_csv('/content/Suicide_Detection_clean_translated.csv')

# Combina los dos datasets
combined_data = pd.concat([sd_E, sd_S], ignore_index=True)

# Shuffle the combined dataset
combined_data = combined_data.sample(frac=1).reset_index(drop=True)

# Split the combined dataset into training and testing sets
train_data, test_data = train_test_split(combined_data, test_size=0.2, random_state=42)

# MODELADO, ENTRENAMIENTO Y EVALUACIÓN

## Modelo DistilBERT

In [None]:
!pip install transformers torch evaluate datasets optuna
import optuna
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, DistilBertConfig, EarlyStoppingCallback
import torch
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Requires `combined_data` from the data-loading cell.
# Rows without text or label are dropped up front: the tokenizer only accepts
# strings and the label encoder cannot be fitted on missing values.
# dropna returns a new frame; the extra copy keeps `combined_data` untouched.
pre_processed = combined_data.dropna(subset=['text', 'class']).copy()

# To speed up test runs, subsample here, e.g.:
#   pre_processed = pre_processed.sample(n=20000, random_state=42)

# Split the dataset into training and testing sets
train_data, test_data = train_test_split(pre_processed, test_size=0.2, random_state=42)

# Take explicit copies: the label column is overwritten below, and assigning a
# column on the frames returned by the split can otherwise trigger pandas'
# SettingWithCopyWarning.
train_data = train_data.copy()
test_data = test_data.copy()

# Encoder that maps the class labels to integer ids
label_encoder = LabelEncoder()

# Fit the encoder on the training labels only, then reuse the same mapping
# for the test labels
train_data['class'] = label_encoder.fit_transform(train_data['class'])
test_data['class'] = label_encoder.transform(test_data['class'])

# Load the multilingual DistilBERT tokenizer
model_name = 'distilbert-base-multilingual-cased'
tokenizer_bert = DistilBertTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)

def encode_data(examples):
    """Tokenize the ``'text'`` field of a batch with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.

    Args:
        examples: Mapping with a ``'text'`` entry holding the texts to encode.

    Returns:
        Whatever ``tokenizer_bert`` returns for the given texts.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 128,
    }
    batch_texts = examples['text']
    return tokenizer_bert(batch_texts, **tokenizer_options)

def _to_torch_dataset(frame):
    """Turn a pandas split into a tokenized Hugging Face dataset in torch format.

    Steps: convert the frame, tokenize it in batches with ``encode_data``,
    rename the ``'class'`` column to ``'labels'`` and expose only
    ``input_ids``, ``attention_mask`` and ``labels`` as torch tensors.
    """
    dataset = Dataset.from_pandas(frame).map(encode_data, batched=True)
    dataset = dataset.rename_column('class', 'labels')
    dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    return dataset


# Build the training and testing datasets with the same pipeline
train_dataset = _to_torch_dataset(train_data)
test_dataset = _to_torch_dataset(test_data)

# Metric function handed to the Trainer
def compute_metrics(pred):
    """Compute accuracy plus weighted precision, recall and F1.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores; the argmax over the last axis is taken
            as the predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The three class-averaged scores share the same call signature
    weighted_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    weighted = {
        metric_name: scorer(y_true, y_pred, average='weighted')
        for metric_name, scorer in weighted_scorers
    }

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': weighted['f1'],
        'precision': weighted['precision'],
        'recall': weighted['recall'],
    }

In [None]:
# Cache for the train/validation split used by the hyperparameter search, so
# that every trial is trained and scored on exactly the same rows.
_tuning_split_cache = {}


def _get_tuning_datasets():
    """Return ``(train, validation)`` datasets for hyperparameter tuning.

    The validation part is carved out of ``train_dataset`` (10%, fixed seed)
    so that ``test_dataset`` is never used to choose hyperparameters.
    """
    if 'split' not in _tuning_split_cache:
        _tuning_split_cache['split'] = train_dataset.train_test_split(test_size=0.1, seed=42)
    split = _tuning_split_cache['split']
    return split['train'], split['test']


def objective(trial):
    """Optuna objective: fine-tune DistilBERT once and return its validation F1.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ``eval_f1`` value reported by the Trainer on the validation split.
    """
    # Hyperparameters to optimise. The parameter names are kept as they are
    # because the cell that trains the final model reads them from
    # ``study.best_params``.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 3, 5)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64])
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.0, 0.2)
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.1)
    hidden_dropout_prob = trial.suggest_float("hidden_dropout_prob", 0.1, 0.5)
    attention_probs_dropout_prob = trial.suggest_float("attention_probs_dropout_prob", 0.1, 0.5)

    # Custom DistilBERT configuration.
    # DistilBertConfig names its dropout fields `dropout` and
    # `attention_dropout`; `hidden_dropout_prob` / `attention_probs_dropout_prob`
    # are the BERT names and would be stored on the config without ever being
    # read by the DistilBERT layers.
    custom_config = DistilBertConfig.from_pretrained(model_name)
    custom_config.dropout = hidden_dropout_prob
    custom_config.attention_dropout = attention_probs_dropout_prob
    custom_config.num_labels = 2

    # Load DistilBERT with the custom configuration
    model = DistilBertForSequenceClassification.from_pretrained(model_name, config=custom_config)

    # Training arguments.
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    training_args = TrainingArguments(
        # One directory per trial so checkpoints of different trials never mix
        output_dir=f'./results/trial_{trial.number}',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=64,
        learning_rate=learning_rate,
        warmup_ratio=warmup_ratio,
        weight_decay=weight_decay,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Limit the checkpoints kept on disk; the best one is still retained
        # because load_best_model_at_end is enabled.
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to='none'
    )

    tuning_train_dataset, tuning_val_dataset = _get_tuning_datasets()

    # Trainer: model selection happens on the validation split, not on the
    # test set, so the final test metrics stay unbiased.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tuning_train_dataset,
        eval_dataset=tuning_val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train the model
    trainer.train()

    # Evaluate the (best) model on the validation split
    eval_results = trainer.evaluate()

    return eval_results["eval_f1"]

# Create the Optuna study, or load it if it already exists, using a persistent
# SQLite storage so results survive across runs.
study_name = "my_study"  # name under which the study is stored
storage_name = f"sqlite:///{study_name}.db"  # storage location
study_settings = dict(
    direction="maximize",
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,  # reuse the stored study when it already exists
)
study = optuna.create_study(**study_settings)

# Run the search; adjust n_trials to your needs
study.optimize(objective, n_trials=5)

# Report the best trial found so far
print("Mejor trial:")
trial = study.best_trial

print("Valor:", trial.value)
print("Mejores hiperparámetros:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")





In [None]:
# Train the final model with the best hyperparameters found by the study
best_params = study.best_params

# Final model configuration.
# DistilBertConfig names its dropout fields `dropout` and `attention_dropout`;
# the BERT-style names `hidden_dropout_prob` / `attention_probs_dropout_prob`
# would be stored on the config without being used by the DistilBERT layers.
# The Optuna parameter keys keep their original names.
final_config = DistilBertConfig.from_pretrained(model_name)
final_config.dropout = best_params["hidden_dropout_prob"]
final_config.attention_dropout = best_params["attention_probs_dropout_prob"]
final_config.num_labels = 2

final_model = DistilBertForSequenceClassification.from_pretrained(model_name, config=final_config)

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
final_training_args = TrainingArguments(
    output_dir='./final_results',
    num_train_epochs=best_params["num_train_epochs"],
    per_device_train_batch_size=best_params["per_device_train_batch_size"],
    per_device_eval_batch_size=64,
    learning_rate=best_params["learning_rate"],
    warmup_ratio=best_params["warmup_ratio"],
    weight_decay=best_params["weight_decay"],
    logging_dir='./final_logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to='none'
)

# NOTE(review): the test set is used here as eval_dataset, so early stopping
# and best-checkpoint selection see the same data the final metrics are
# reported on. Consider a separate validation split for unbiased test numbers.
final_trainer = Trainer(
    model=final_model,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the final model
final_trainer.train()

# Evaluate the final model
final_eval_results = final_trainer.evaluate()

print("Resultados finales de evaluación:")
print(final_eval_results)

# Predict on the test set
predictions = final_trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=-1)

# Use the labels returned alongside the predictions: they are plain arrays
# aligned with `predictions.predictions`, which avoids indexing the
# torch-formatted dataset column and handing tensors to scikit-learn.
true_labels = predictions.label_ids

accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

In [None]:
# Save only the weights (state dict) of the final trained model.
# `final_model` is the model trained in the previous cell; the name `model`
# only exists inside `objective` and is not defined at notebook scope.
torch.save(final_model.state_dict(), 'pesos_modelo_suc.pth')

In [None]:
!zip -r /content/pesos_modelo_suc.pth.zip /content/pesos_modelo.pth

In [None]:
from google.colab import files

# Run this once the model has been saved and zipped: it sends the archive
# to the browser as a download.
zip_filename = 'pesos_modelo_suc.pth.zip'
files.download(zip_filename)