# **Proyecto Equipo 8. Detección temprana de riesgo de suicidio**

In [None]:
# Importar e instalar dependencias
import pandas as pd
import numpy as np
import re
from collections import Counter

# CARGA DE DATOS

In [None]:
!pip install gdown
# Descarga el archivo del dataset de drive usando gdown
url = 'https://drive.google.com/file/d/1z3fmc6QKE71_OQ3Poy-fTq1DpJcoK-Z_/view?usp=drive_link'
file_id = url.split('/')[-2]
!gdown --id $file_id

# Carga el dataset usando pandas
sd = pd.read_csv('/content/Suicide_Detection_clean_2.csv')

Downloading...
From (original): https://drive.google.com/uc?id=1z3fmc6QKE71_OQ3Poy-fTq1DpJcoK-Z_
From (redirected): https://drive.google.com/uc?id=1z3fmc6QKE71_OQ3Poy-fTq1DpJcoK-Z_&confirm=t&uuid=a2f09dcb-94fc-4701-80fb-945b56da195b
To: /content/Suicide_Detection_clean_2.csv
100% 227M/227M [00:06<00:00, 36.5MB/s]


In [None]:
# Sampleo de data para agilizar el testeo del código
# sd = sd.sample(n=50000, random_state=42)

# MODELADO, ENTRENAMIENTO Y EVALUACIÓN

In [None]:
# Work on an independent (deep) copy of the loaded frame, so later changes to
# `pre_processed` do not affect `sd`.
pre_processed = sd.copy(deep=True)

## Modelo DistilBERT

In [None]:
!pip install transformers torch evaluate datasets
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, DistilBertConfig, EarlyStoppingCallback
import torch
from sklearn.preprocessing import LabelEncoder
import evaluate
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
train_data, test_data = train_test_split(
    pre_processed,
    test_size=0.2,
    random_state=42,
)



Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m9.2 M

In [None]:
# Checkpoint shared by the configuration, the tokenizer and the model
model_name = 'distilbert-base-uncased'

# Custom DistilBERT configuration.
# NOTE: DistilBertConfig names its dropout hyperparameters `dropout` and
# `attention_dropout`. The BERT-style names `hidden_dropout_prob` and
# `attention_probs_dropout_prob` would only be stored as extra attributes that
# the DistilBERT layers never read, so they must not be used here.
custom_config = DistilBertConfig.from_pretrained(model_name)
custom_config.dropout = 0.36
custom_config.attention_dropout = 0.41
# Two output labels for the classification head
custom_config.num_labels = 2

# Load the tokenizer that matches the checkpoint
tokenizer_bert = DistilBertTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)


# Encoder that maps the class labels to integer ids
label_encoder = LabelEncoder()

# Fit the mapping on the training labels only, then apply the same mapping to
# the test labels. `assign` builds new frames instead of writing into the
# frames returned by train_test_split, which avoids pandas'
# SettingWithCopyWarning and leaves the original split objects unmodified.
train_data = train_data.assign(**{'class': label_encoder.fit_transform(train_data['class'])})
test_data = test_data.assign(**{'class': label_encoder.transform(test_data['class'])})

# The classifier is configured with num_labels = 2; fail early with a clear
# message if the data does not contain exactly two classes.
assert len(label_encoder.classes_) == 2, (
    f"Expected 2 classes, found {len(label_encoder.classes_)}: {list(label_encoder.classes_)}"
)

def encode_data(examples):
    """Tokenize the 'text' entry of a batch with the DistilBERT tokenizer.

    The texts are truncated and padded with ``padding='max_length'`` and
    ``max_length=128``.

    Args:
        examples: Mapping with a 'text' entry holding the raw texts (a batch
            of rows when called through ``Dataset.map(..., batched=True)``).

    Returns:
        The object returned by ``tokenizer_bert`` for those texts.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 128,
    }
    texts = examples['text']
    return tokenizer_bert(texts, **tokenizer_options)

def _to_torch_dataset(frame):
    """Convert a pandas frame into a tokenized Hugging Face dataset in torch format.

    The frame is wrapped in a ``Dataset``, tokenized in batches with
    ``encode_data``, its 'class' column is renamed to 'labels', and the
    output format is set to torch for the three columns listed below.
    """
    dataset = (
        Dataset.from_pandas(frame)
        .map(encode_data, batched=True)
        .rename_column('class', 'labels')
    )
    # set_format works in place: only these columns are returned, as torch tensors
    dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    return dataset


# Same pipeline for both splits
train_dataset = _to_torch_dataset(train_data)
test_dataset = _to_torch_dataset(test_data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/185659 [00:00<?, ? examples/s]

Map:   0%|          | 0/46415 [00:00<?, ? examples/s]

In [None]:
# Load DistilBERT with a sequence-classification head using the custom config
model = DistilBertForSequenceClassification.from_pretrained(model_name, config=custom_config)

# Training hyperparameters.
# Evaluation and checkpointing both happen once per epoch, and the checkpoint
# with the best "f1" is reloaded when training ends.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=8,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=4e-05,
    warmup_ratio=0.1,
    weight_decay=0.05,
    logging_dir='./logs',
    logging_steps=10,
    # Mixed-precision training requires a CUDA device; enable it only when one
    # is available so the cell also runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none",
)


# Metric function handed to the Trainer
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a prediction batch.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted label).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Precision, recall and F1 are all averaged with the 'weighted' scheme
    weighted = {'average': 'weighted'}

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, **weighted),
        'precision': precision_score(y_true, y_pred, **weighted),
        'recall': recall_score(y_true, y_pred, **weighted),
    }

# Hold out 10% of the training data as a validation split. Early stopping and
# best-checkpoint selection are driven by the Trainer's eval_dataset, so using
# the test set there would leak test information into model selection and make
# the final test metrics optimistic.
train_val_split = train_dataset.train_test_split(test_size=0.1, seed=42)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_val_split['train'],
    eval_dataset=train_val_split['test'],
    compute_metrics=compute_metrics,
    # Stop when the monitored metric has not improved for 3 consecutive evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Fine-tune the model
trainer.train()

# Metrics on the dataset the trainer was given as eval_dataset
eval_results = trainer.evaluate()
print(eval_results)

# Predict on the test set
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=-1)

# Use the reference labels returned together with the predictions (a NumPy
# array aligned with `predictions.predictions`) instead of re-reading the
# torch-formatted 'labels' column of the dataset for every metric.
true_labels = predictions.label_ids

accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0794,0.083263,0.971561,0.97156,0.971583,0.971561
2,0.0719,0.07594,0.972724,0.972719,0.973,0.972724
3,0.0415,0.080629,0.974405,0.974405,0.974426,0.974405
4,0.007,0.127815,0.972487,0.972487,0.972551,0.972487
5,0.0049,0.157426,0.973672,0.973672,0.973675,0.973672
6,0.0084,0.165379,0.972315,0.972315,0.972382,0.972315


{'eval_loss': 0.0806293711066246, 'eval_accuracy': 0.9744048260260691, 'eval_f1': 0.9744048351741944, 'eval_precision': 0.9744262978459671, 'eval_recall': 0.9744048260260691, 'eval_runtime': 48.773, 'eval_samples_per_second': 951.653, 'eval_steps_per_second': 14.885, 'epoch': 6.0}
Accuracy: 0.9744048260260691
Precision: 0.9744262978459671
Recall: 0.9744048260260691
F1-Score: 0.9744048351741944


In [None]:
# Serialize only the model's state dict (not the config or tokenizer) to a .bin
# file in the current working directory.
model_weights = model.state_dict()
torch.save(model_weights, 'DistilBERT.bin')

In [None]:
# Compress the saved weights file into a zip archive under /content.
# NOTE(review): this expects DistilBERT.bin to be in /content, i.e. the
# notebook's working directory must be /content when the weights are saved.
!zip -r /content/DistilBERT.zip /content/DistilBERT.bin



  adding: content/DistilBERT.bin (deflated 8%)


In [None]:
# Colab-only helper module for moving files between the runtime and the browser
from google.colab import files

# Send the zipped weights to the user's browser
archive_path = '/content/DistilBERT.zip'
files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>