In [None]:
# Importar e instalar dependencias
import pandas as pd
import numpy as np
import re
from collections import Counter

# CARGA DE DATOS

In [None]:
from sklearn.model_selection import train_test_split
!pip install gdown
# Descarga el archivo del dataset de drive usando gdown
url = 'https://drive.google.com/file/d/1pBs7QN2yZa8aV_hE00oLP6s0cyHm9QZI/view?usp=drive_link'
file_id = url.split('/')[-2]
!gdown --id $file_id

# Carga el dataset en inglés usando pandas
sd_E = pd.read_csv('/content/Suicide_Detection_clean_bilingual.csv')

# Descarga el archivo del dataset en español de drive usando gdown
url = 'https://drive.google.com/file/d/19Nd5ulojA08fjSr96lHsbQRuokFiF4Sl/view?usp=drive_link'
file_id = url.split('/')[-2]
!gdown --id $file_id

# Carga el dataset en español usando pandas
sd_S = pd.read_csv('/content/Suicide_Detection_clean_translated_102224.csv')

# Combina los dos datasets
combined_data = pd.concat([sd_E, sd_S], ignore_index=True)

# Shuffle the combined dataset
combined_data = combined_data.sample(frac=1).reset_index(drop=True)

# Split the combined dataset into training and testing sets
train_data, test_data = train_test_split(combined_data, test_size=0.2, random_state=42)

Downloading...
From: https://drive.google.com/uc?id=1pBs7QN2yZa8aV_hE00oLP6s0cyHm9QZI
To: /content/Suicide_Detection_clean_bilingual.csv
100% 9.91M/9.91M [00:00<00:00, 75.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=19Nd5ulojA08fjSr96lHsbQRuokFiF4Sl
To: /content/Suicide_Detection_clean_translated_102224.csv
100% 7.39M/7.39M [00:00<00:00, 90.4MB/s]


In [None]:
# Print column names, non-null counts and dtypes of the combined frame.
combined_data.info()
# Last expression of the cell: renders the first five rows as the cell output.
combined_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19998 entries, 0 to 19997
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   text          19998 non-null  object
 1   class         19998 non-null  object
 2   cleaned_text  19943 non-null  object
dtypes: object(3)
memory usage: 468.8+ KB


Unnamed: 0,text,class,cleaned_text
0,any fortune tellers here?😳 it doesn't matter w...,non-suicide,fortune teller matter kind curious fortune
1,"¿Por qué la lluvia es tan reconfortante? No, e...",non-suicide,lluvia tanto reconfortante serio pasar cada ve...
2,No he sido obligado a salir de mi vida después...,suicide,ser obligar salir vida después apenas empezar ...
3,"Man,I was a weird type of bully You see,everyo...",non-suicide,mani weird type bully image bully crony dunk t...
4,Voy a hacer una nueva charla en grupo de Reddi...,non-suicide,ir hacer nuevo charla grupo reddit sólo querer...


# MODELADO, ENTRENAMIENTO Y EVALUACIÓN

In [None]:
!pip install transformers torch evaluate datasets
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, DistilBertConfig, EarlyStoppingCallback
import torch
from sklearn.preprocessing import LabelEncoder
import evaluate
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dataset

In [None]:
# Custom configuration for the multilingual DistilBERT checkpoint.
model_name = 'distilbert-base-multilingual-cased'
custom_config = DistilBertConfig.from_pretrained(model_name)
# DistilBertConfig exposes its dropout settings as `dropout` and
# `attention_dropout`. The BERT-style names `hidden_dropout_prob` and
# `attention_probs_dropout_prob` are not read by DistilBERT, so assigning them
# only adds unused attributes and leaves the checkpoint's dropout in effect.
# NOTE(review): metrics recorded from runs that used the BERT-style names
# were therefore obtained with the checkpoint's own dropout values.
custom_config.dropout = 0.30
custom_config.attention_dropout = 0.42
custom_config.num_labels = 2

# Load the tokenizer and the DistilBERT classifier with the custom configuration.
tokenizer_bert = DistilBertTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
model = DistilBertForSequenceClassification.from_pretrained(model_name, config=custom_config)

# Encoder that maps the string class labels to integer ids.
label_encoder = LabelEncoder()

# Fit the encoder on the training labels only, then reuse the same mapping
# for the test labels so both splits share identical ids.
train_data['class'] = label_encoder.fit_transform(train_data['class'])
test_data['class'] = label_encoder.transform(test_data['class'])

def encode_data(examples):
    """Tokenize the 'text' field of a batch with the module-level tokenizer.

    `examples['text']` may be a single string, a pandas Series, a NumPy
    array, or any other value (passed to the tokenizer unchanged). Sequences
    are truncated and padded to a fixed length of 128 tokens.

    Returns whatever `tokenizer_bert` returns for the batch.
    """
    raw_text = examples['text']
    if isinstance(raw_text, str):
        # A lone string becomes a one-element batch.
        batch = [raw_text]
    elif isinstance(raw_text, (pd.Series, np.ndarray)):
        # Series / arrays are converted to a plain Python list.
        batch = raw_text.tolist()
    else:
        batch = raw_text
    return tokenizer_bert(batch, truncation=True, padding='max_length', max_length=128)

# Build each Hugging Face dataset in one pipeline: convert the pandas frame,
# tokenize it in batches, then rename 'class' to 'labels'.
train_dataset = (
    Dataset.from_pandas(train_data)
    .map(encode_data, batched=True)
    .rename_column('class', 'labels')
)
test_dataset = (
    Dataset.from_pandas(test_data)
    .map(encode_data, batched=True)
    .rename_column('class', 'labels')
)

# Expose only the tensors the model consumes, as PyTorch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/15998 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [None]:
# Seed the RNGs *before* instantiating the model. The classification head
# ('pre_classifier' / 'classifier') is newly initialized when the checkpoint
# is loaded, so without a seed set at this point its starting weights, and
# therefore the training result, differ from run to run.
from transformers import set_seed

set_seed(42)

# Reload a fresh model so re-running this cell always starts from the
# pretrained checkpoint rather than from previously fine-tuned weights.
model = DistilBertForSequenceClassification.from_pretrained(model_name, config=custom_config)

# Training arguments with the chosen hyperparameters.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2.9e-05,
    warmup_ratio=0.08,
    weight_decay=0.06,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Same seed as above, stated explicitly for the Trainer.
    seed=42,
    report_to = 'none'
)


# Función para calcular métricas
def compute_metrics(pred):
    """Compute accuracy and weighted F1 / precision / recall for a prediction object.

    `pred` must expose `label_ids` (true labels) and `predictions` (scores
    whose argmax over the last axis is the predicted class).

    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
        'precision': precision_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
    }

# Hold out 10% of the training examples as a validation set. The Trainer's
# per-epoch evaluation drives early stopping and best-checkpoint selection;
# pointing it at the test set would let the test data choose the checkpoint
# and make the final reported scores optimistic.
validation_split = train_dataset.train_test_split(test_size=0.1, seed=42)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=validation_split['train'],
    eval_dataset=validation_split['test'],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
trainer.train()

# Evaluate on the untouched test set (passed explicitly, since the Trainer's
# default eval_dataset is now the validation split).
eval_results = trainer.evaluate(eval_dataset=test_dataset)

# Print the evaluation results
print(eval_results)

# Run predictions on the test set
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=-1)
# Use the labels returned together with the predictions: they are a NumPy
# array aligned row-for-row with `predictions.predictions`, so the metrics do
# not depend on how the dataset column is formatted.
true_labels = predictions.label_ids

accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')


print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1712,0.172163,0.9365,0.936493,0.936733,0.9365
2,0.0797,0.171806,0.9445,0.9445,0.944512,0.9445
3,0.1017,0.205653,0.94675,0.94674,0.947141,0.94675
4,0.0623,0.228647,0.94825,0.948248,0.948321,0.94825
5,0.0217,0.257578,0.9475,0.9475,0.947512,0.9475


{'eval_loss': 0.2286469042301178, 'eval_accuracy': 0.94825, 'eval_f1': 0.9482484636372108, 'eval_precision': 0.9483210179681741, 'eval_recall': 0.94825, 'eval_runtime': 3.4927, 'eval_samples_per_second': 1145.248, 'eval_steps_per_second': 35.789, 'epoch': 5.0}
Accuracy: 0.94825
Precision: 0.9483210179681741
Recall: 0.94825
F1-Score: 0.9482484636372108


In [None]:
# Persist only the model parameters (its state_dict), not the full model object.
torch.save(obj=model.state_dict(), f='pesos_modelo_ult.pth')

In [None]:
# Shell command: compress the saved weights file into a zip archive under /content.
!zip -r /content/pesos_modelo_ult.pth.zip /content/pesos_modelo_ult.pth

  adding: content/pesos_modelo_ult.pth (deflated 7%)


In [None]:
from google.colab import files
# Run after the weights have been saved and zipped: starts a browser download of the archive.
# NOTE(review): the relative path assumes the working directory is /content, where the zip was written - confirm.
files.download('pesos_modelo_ult.pth.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>