In [None]:
# Importar e instalar dependencias
import pandas as pd
import numpy as np
import re
from collections import Counter

# CARGA DE DATOS

In [None]:
!pip install gdown
# Descarga el archivo del dataset de drive usando gdown
url = 'https://drive.google.com/file/d/1LkEJ3rstkdyhUGWi9O2YKQMe0wC_ZyDd/view?usp=sharing'
file_id = url.split('/')[-2]
!gdown --id $file_id

# Carga el dataset usando pandas
sd = pd.read_csv('/content/Suicide_Detection.csv')

Downloading...
From (original): https://drive.google.com/uc?id=1LkEJ3rstkdyhUGWi9O2YKQMe0wC_ZyDd
From (redirected): https://drive.google.com/uc?id=1LkEJ3rstkdyhUGWi9O2YKQMe0wC_ZyDd&confirm=t&uuid=797c9d85-4a43-4911-b127-a7be73b369ec
To: /content/Suicide_Detection.csv
100% 167M/167M [00:02<00:00, 65.2MB/s]


In [None]:
# Subsample the data to speed up testing of the code.
# NOTE: `sd` is overwritten here, so running this cell again without reloading
# the CSV samples from the already-reduced frame rather than the full dataset.
sd = sd.sample(
    n=10000,          # number of rows kept
    random_state=42,  # fixed seed for the draw
)

# MODELADO, ENTRENAMIENTO Y EVALUACIÓN

In [None]:
# Independent copy of the dataset used for modelling (pre_processed).
# NOTE(review): the original comment calls this the "clean" dataset, but no
# cleaning step is visible in this part of the notebook — confirm.
pre_processed = sd.copy(deep=True)

In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing parts: 20% of the rows are held
# out for testing, and the seed is fixed so the split is the same on every run.
train_data, test_data = train_test_split(
    pre_processed,
    test_size=0.2,
    random_state=10,
)

# Report the size of each partition.
for label, subset in (('Training data: ', train_data), ('Testing data: ', test_data)):
    print(label, len(subset))


Training data:  8000
Testing data:  2000


### Modelo de regresión logística basado en TF-IDF

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# TF-IDF features over unigrams and bigrams. Terms that appear in more than
# 95% of the training documents, or in fewer than 2 of them, are excluded.
vectorizer_tfidf = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    ngram_range=(1, 2),
)

# The vocabulary and IDF weights are fitted on the training split only; the
# test split is merely transformed with the fitted vectorizer.
X_tfidf_train = vectorizer_tfidf.fit_transform(train_data['text'])
X_tfidf_test = vectorizer_tfidf.transform(test_data['text'])

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space, split by penalty so that only solver/penalty
# pairs supported by LogisticRegression are tried: 'l1' is only available with
# 'liblinear' and 'saga', while 'l2' works with every solver listed here.
# A single flat grid crossed 'l1' with 'newton-cg', 'lbfgs' and 'sag', which
# made 180 of the 600 cross-validation fits fail.
param_grid = [
    {'C': [0.1, 1, 10, 100],
     'penalty': ['l1'],
     'solver': ['liblinear', 'saga'],
     'max_iter': [100, 200, 300]},
    {'C': [0.1, 1, 10, 100],
     'penalty': ['l2'],
     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
     'max_iter': [100, 200, 300]}]

# 5-fold cross-validated grid search scored by weighted F1, using all cores.
# random_state is fixed because the 'liblinear', 'sag' and 'saga' solvers
# shuffle the data, which would otherwise make results vary between runs.
grid_search = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5,
                           scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_tfidf_train, train_data['class'])

best_params = grid_search.best_params_
print("Mejores hiperparámetros:", best_params)

# Model training: with the default refit=True, GridSearchCV has already refitted
# the best configuration on the whole training set, so that estimator is reused
# instead of training a throwaway hard-coded model and then fitting again.
model = grid_search.best_estimator_

# Model evaluation on the held-out test split (weighted averages across classes).
predictions_tfidf = model.predict(X_tfidf_test)
accuracy_tfidf = accuracy_score(test_data['class'], predictions_tfidf)
precision_tfidf = precision_score(test_data['class'], predictions_tfidf, average='weighted')
recall_tfidf = recall_score(test_data['class'], predictions_tfidf, average='weighted')
f1_tfidf = f1_score(test_data['class'], predictions_tfidf, average='weighted')
confusion_mat_tfidf = confusion_matrix(test_data['class'], predictions_tfidf)

print(classification_report(test_data['class'], predictions_tfidf))

print("Accuracy:", accuracy_tfidf)
print("Precision:", precision_tfidf)
print("Recall:", recall_tfidf)
print("F1-Score:", f1_tfidf)
print("Confusion Matrix:\n", confusion_mat_tfidf)

180 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 67, in _check_solver
  

Mejores hiperparámetros: {'C': 100, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
              precision    recall  f1-score   support

 non-suicide       0.92      0.93      0.92       978
     suicide       0.93      0.92      0.92      1022

    accuracy                           0.92      2000
   macro avg       0.92      0.92      0.92      2000
weighted avg       0.92      0.92      0.92      2000

Accuracy: 0.9235
Precision: 0.9236205384063819
Recall: 0.9235
F1-Score: 0.9235083211245164
Confusion Matrix:
 [[909  69]
 [ 84 938]]


In [None]:
# Number of columns in the TF-IDF training matrix, i.e. how many features
# the vectorizer produced.
n_features_tfidf = X_tfidf_train.shape[1]
print("Número de características en TF-IDF:", n_features_tfidf)

Número de características en TF-IDF: 90902
