In [1]:
# 1. Imports and configuration
import os
import sys

# The working directory must be on sys.path *before* the project-local
# `functions` module is imported; adding it afterwards cannot affect that import.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate entries in sys.path.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

import mlflow
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

from functions import load_data, preprocess_text, train_classifier, evaluate_model


# 2. Data exploration
# load_data() supplies the documents, their labels and the category names.
X, y, target_names = load_data()

# Each (label, value) pair is printed on its own line; the last entry is the
# first 500 characters of the first document, shown as a sample.
exploration_summary = (
    ("Número de documentos:", len(X)),
    ("Número de categorías:", len(target_names)),
    ("Categorías:", target_names),
    ("\nEjemplo de texto:\n", X[0][:500]),
)
for summary_label, summary_value in exploration_summary:
    print(summary_label, summary_value)

# 3. Text preprocessing
# Hold out 20% of the documents for testing; the fixed seed keeps the split
# reproducible across kernel restarts.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# preprocess_text returns the vectorized train/test matrices and the vectorizer.
vectorized_parts = preprocess_text(X_train, X_test)
X_train_vect, X_test_vect, vectorizer = vectorized_parts
print("Shape de la matriz TF-IDF (train):", X_train_vect.shape)

# 4. Model training
# The baseline hyperparameters are declared once so that the values used to
# train the classifier are guaranteed to be the values recorded in MLflow.
baseline_params = {"C": 1.0, "max_iter": 1000}
clf = train_classifier(X_train_vect, y_train, **baseline_params)

# 5. Model evaluation
evaluate_model(clf, X_test_vect, y_test, target_names)

# 6. Metric logging with MLflow
mlflow.set_experiment("text_classification_practica_final")
with mlflow.start_run():
    mlflow.log_params(baseline_params)

    # Metrics are computed on the held-out test split; precision and recall
    # are support-weighted averages over all categories.
    y_pred = clf.predict(X_test_vect)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')

    mlflow.log_metrics({"accuracy": acc, "precision": prec, "recall": rec})
    print("Métricas registradas en MLflow")

# 7. Insertar capturas de pantalla de MLflow

Total documentos: 18846
Número de categorías: 20
Primer texto:
 From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killin
Número de documentos: 18846
Número de categorías: 20
Categorías: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc'

2025/07/05 13:04:51 INFO mlflow.tracking.fluent: Experiment with name 'text_classification_practica_final' does not exist. Creating a new experiment.


Entrenamiento completado.
Informe de clasificación:
                           precision    recall  f1-score   support

             alt.atheism       0.87      0.91      0.89       151
           comp.graphics       0.77      0.83      0.80       202
 comp.os.ms-windows.misc       0.80      0.83      0.82       195
comp.sys.ibm.pc.hardware       0.68      0.73      0.71       183
   comp.sys.mac.hardware       0.87      0.84      0.86       205
          comp.windows.x       0.88      0.82      0.85       215
            misc.forsale       0.84      0.79      0.81       193
               rec.autos       0.91      0.92      0.92       196
         rec.motorcycles       0.96      0.95      0.95       168
      rec.sport.baseball       0.94      0.96      0.95       211
        rec.sport.hockey       0.94      0.97      0.96       198
               sci.crypt       0.98      0.94      0.96       201
         sci.electronics       0.84      0.86      0.85       202
                 sci.m

In [2]:
import mlflow

# Hyperparameter grid to explore: every combination of C and max_iter is
# trained, evaluated and logged as its own MLflow run.
C_values = [0.5, 1.0, 2.0]
max_iter_values = [500, 1000, 2000]

mlflow.set_experiment("text_classification_practica_final")

for C in C_values:
    for max_iter in max_iter_values:
        # The run is opened *before* training so that its duration covers the
        # fit and an exception during training/evaluation is recorded as a
        # failed run instead of leaving no trace. The explicit run name makes
        # each run identifiable by its hyperparameters in the MLflow UI.
        with mlflow.start_run(run_name=f"C={C}_max_iter={max_iter}"):
            mlflow.log_params({"C": C, "max_iter": max_iter})

            # Model training (rebinds `clf`; after the loop it holds the
            # model from the last combination in the grid).
            clf = train_classifier(X_train_vect, y_train, C=C, max_iter=max_iter)

            # Model evaluation; the returned mapping is read by metric name below.
            metrics = evaluate_model(clf, X_test_vect, y_test, target_names)

            # Metric logging in MLflow
            mlflow.log_metrics(
                {name: metrics[name] for name in ("accuracy", "precision", "recall")}
            )
            print(f"Run con C={C}, max_iter={max_iter} registrado en MLflow.")



Entrenamiento completado.
Informe de clasificación:
                           precision    recall  f1-score   support

             alt.atheism       0.85      0.88      0.86       151
           comp.graphics       0.76      0.83      0.79       202
 comp.os.ms-windows.misc       0.77      0.82      0.79       195
comp.sys.ibm.pc.hardware       0.68      0.72      0.69       183
   comp.sys.mac.hardware       0.86      0.83      0.85       205
          comp.windows.x       0.86      0.80      0.83       215
            misc.forsale       0.82      0.78      0.80       193
               rec.autos       0.89      0.91      0.90       196
         rec.motorcycles       0.94      0.93      0.94       168
      rec.sport.baseball       0.94      0.95      0.95       211
        rec.sport.hockey       0.92      0.97      0.95       198
               sci.crypt       0.98      0.94      0.96       201
         sci.electronics       0.81      0.83      0.82       202
                 sci.m



Entrenamiento completado.
Informe de clasificación:
                           precision    recall  f1-score   support

             alt.atheism       0.85      0.88      0.86       151
           comp.graphics       0.76      0.83      0.79       202
 comp.os.ms-windows.misc       0.77      0.82      0.79       195
comp.sys.ibm.pc.hardware       0.68      0.72      0.69       183
   comp.sys.mac.hardware       0.86      0.83      0.85       205
          comp.windows.x       0.86      0.80      0.83       215
            misc.forsale       0.82      0.78      0.80       193
               rec.autos       0.89      0.91      0.90       196
         rec.motorcycles       0.94      0.93      0.94       168
      rec.sport.baseball       0.94      0.95      0.95       211
        rec.sport.hockey       0.92      0.97      0.95       198
               sci.crypt       0.98      0.94      0.96       201
         sci.electronics       0.81      0.83      0.82       202
                 sci.m



Entrenamiento completado.
Informe de clasificación:
                           precision    recall  f1-score   support

             alt.atheism       0.85      0.88      0.86       151
           comp.graphics       0.76      0.83      0.79       202
 comp.os.ms-windows.misc       0.77      0.82      0.79       195
comp.sys.ibm.pc.hardware       0.68      0.72      0.69       183
   comp.sys.mac.hardware       0.86      0.83      0.85       205
          comp.windows.x       0.86      0.80      0.83       215
            misc.forsale       0.82      0.78      0.80       193
               rec.autos       0.89      0.91      0.90       196
         rec.motorcycles       0.94      0.93      0.94       168
      rec.sport.baseball       0.94      0.95      0.95       211
        rec.sport.hockey       0.92      0.97      0.95       198
               sci.crypt       0.98      0.94      0.96       201
         sci.electronics       0.81      0.83      0.82       202
                 sci.m



Entrenamiento completado.
Informe de clasificación:
                           precision    recall  f1-score   support

             alt.atheism       0.87      0.91      0.89       151
           comp.graphics       0.77      0.83      0.80       202
 comp.os.ms-windows.misc       0.80      0.83      0.82       195
comp.sys.ibm.pc.hardware       0.68      0.73      0.71       183
   comp.sys.mac.hardware       0.87      0.84      0.86       205
          comp.windows.x       0.88      0.82      0.85       215
            misc.forsale       0.84      0.79      0.81       193
               rec.autos       0.91      0.92      0.92       196
         rec.motorcycles       0.96      0.95      0.95       168
      rec.sport.baseball       0.94      0.96      0.95       211
        rec.sport.hockey       0.94      0.97      0.96       198
               sci.crypt       0.98      0.94      0.96       201
         sci.electronics       0.84      0.86      0.85       202
                 sci.m



Entrenamiento completado.
Informe de clasificación:
                           precision    recall  f1-score   support

             alt.atheism       0.87      0.91      0.89       151
           comp.graphics       0.77      0.83      0.80       202
 comp.os.ms-windows.misc       0.80      0.83      0.82       195
comp.sys.ibm.pc.hardware       0.68      0.73      0.71       183
   comp.sys.mac.hardware       0.87      0.84      0.86       205
          comp.windows.x       0.88      0.82      0.85       215
            misc.forsale       0.84      0.79      0.81       193
               rec.autos       0.91      0.92      0.92       196
         rec.motorcycles       0.96      0.95      0.95       168
      rec.sport.baseball       0.94      0.96      0.95       211
        rec.sport.hockey       0.94      0.97      0.96       198
               sci.crypt       0.98      0.94      0.96       201
         sci.electronics       0.84      0.86      0.85       202
                 sci.m



Entrenamiento completado.
Informe de clasificación:
                           precision    recall  f1-score   support

             alt.atheism       0.87      0.91      0.89       151
           comp.graphics       0.77      0.83      0.80       202
 comp.os.ms-windows.misc       0.80      0.83      0.82       195
comp.sys.ibm.pc.hardware       0.68      0.73      0.71       183
   comp.sys.mac.hardware       0.87      0.84      0.86       205
          comp.windows.x       0.88      0.82      0.85       215
            misc.forsale       0.84      0.79      0.81       193
               rec.autos       0.91      0.92      0.92       196
         rec.motorcycles       0.96      0.95      0.95       168
      rec.sport.baseball       0.94      0.96      0.95       211
        rec.sport.hockey       0.94      0.97      0.96       198
               sci.crypt       0.98      0.94      0.96       201
         sci.electronics       0.84      0.86      0.85       202
                 sci.m



Entrenamiento completado.
Informe de clasificación:
                           precision    recall  f1-score   support

             alt.atheism       0.88      0.91      0.90       151
           comp.graphics       0.78      0.83      0.80       202
 comp.os.ms-windows.misc       0.84      0.83      0.83       195
comp.sys.ibm.pc.hardware       0.69      0.74      0.72       183
   comp.sys.mac.hardware       0.86      0.85      0.86       205
          comp.windows.x       0.87      0.85      0.86       215
            misc.forsale       0.84      0.80      0.82       193
               rec.autos       0.91      0.92      0.91       196
         rec.motorcycles       0.97      0.95      0.96       168
      rec.sport.baseball       0.96      0.96      0.96       211
        rec.sport.hockey       0.95      0.98      0.97       198
               sci.crypt       0.98      0.94      0.96       201
         sci.electronics       0.86      0.87      0.87       202
                 sci.m



Entrenamiento completado.
Informe de clasificación:
                           precision    recall  f1-score   support

             alt.atheism       0.88      0.91      0.90       151
           comp.graphics       0.78      0.83      0.80       202
 comp.os.ms-windows.misc       0.84      0.83      0.83       195
comp.sys.ibm.pc.hardware       0.69      0.74      0.72       183
   comp.sys.mac.hardware       0.86      0.85      0.86       205
          comp.windows.x       0.87      0.85      0.86       215
            misc.forsale       0.84      0.80      0.82       193
               rec.autos       0.91      0.92      0.91       196
         rec.motorcycles       0.97      0.95      0.96       168
      rec.sport.baseball       0.96      0.96      0.96       211
        rec.sport.hockey       0.95      0.98      0.97       198
               sci.crypt       0.98      0.94      0.96       201
         sci.electronics       0.86      0.87      0.87       202
                 sci.m



Entrenamiento completado.
Informe de clasificación:
                           precision    recall  f1-score   support

             alt.atheism       0.88      0.91      0.90       151
           comp.graphics       0.78      0.83      0.80       202
 comp.os.ms-windows.misc       0.84      0.83      0.83       195
comp.sys.ibm.pc.hardware       0.69      0.74      0.72       183
   comp.sys.mac.hardware       0.86      0.85      0.86       205
          comp.windows.x       0.87      0.85      0.86       215
            misc.forsale       0.84      0.80      0.82       193
               rec.autos       0.91      0.92      0.91       196
         rec.motorcycles       0.97      0.95      0.96       168
      rec.sport.baseball       0.96      0.96      0.96       211
        rec.sport.hockey       0.95      0.98      0.97       198
               sci.crypt       0.98      0.94      0.96       201
         sci.electronics       0.86      0.87      0.87       202
                 sci.m

# 7. Capturas de pantalla de MLflow y explicación de resultados

### 1. Listado de ejecuciones del experimento
![Runs MLflow](./Captura%20desde%202025-07-05%2013-20-47.png)

---

### 2. Métricas del run: chill-kite-412
#### a) Métricas del modelo
![chill-kite-412 - Métricas](./Captura%20desde%202025-07-05%2013-23-10.png)
#### b) Resumen del run
![chill-kite-412 - Resumen](./Captura%20desde%202025-07-05%2013-23-03.png)

---

### 3. Métricas del run: honorable-sow-890
#### a) Métricas del modelo
![honorable-sow-890 - Métricas](./Captura%20desde%202025-07-05%2013-22-51.png)
#### b) Resumen del run
![honorable-sow-890 - Resumen](./Captura%20desde%202025-07-05%2013-22-42.png)

---

### 4. Métricas del run: indecisive-sow-548
#### a) Métricas del modelo
![indecisive-sow-548 - Métricas](./Captura%20desde%202025-07-05%2013-22-27.png)
#### b) Resumen del run
![indecisive-sow-548 - Resumen](./Captura%20desde%202025-07-05%2013-22-19.png)

---

### 5. Conclusiones
El experimento evalúa cómo los hiperparámetros `C` y `max_iter` afectan el rendimiento del modelo de clasificación de texto. Al variar estos valores y registrar las métricas en MLflow, se observa que los mejores resultados de accuracy, precision y recall se obtienen con `C=2.0` y `max_iter=2000`, alcanzando una accuracy de aproximadamente 0.89. Sin embargo, para un mismo valor de `C`, la parte visible de los informes de clasificación es idéntica con `max_iter` igual a 500, 1000 y 2000, lo que sugiere que el optimizador ya converge antes de 500 iteraciones. Por tanto, la mejora se debe al mayor valor de `C` (menos regularización), que permite al modelo ajustarse mejor a los datos, y no al número de iteraciones. Aun así, siempre es importante evitar el sobreajuste. MLflow facilita la comparación y el análisis de los diferentes experimentos realizados.