In [1]:
import json

import numpy as np
import pandas as pd
from joblib import load, dump
from sklearn import svm
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
# Load the ticket dataset from a CSV path relative to the current working directory.
data = pd.read_csv("data/tickets_inputs_eng_2.csv")

In [3]:
# Summary statistics per column (count / unique / top / freq for the text columns).
data.describe()

Unnamed: 0,complaint_what_happened,ticket_classification,processed_text,relevant_topics
count,18961,18961,18961,18961
unique,18822,78,18714,3
top,Chase has violated 15 USC 1692 by continuing c...,Credit card or prepaid card + General-purpose ...,chase continu collect activ report complet cre...,Mortgage/Loan
freq,11,4918,12,16376


In [4]:
# Peek at the first two rows to check the column layout.
data.head(2)

Unnamed: 0,complaint_what_happened,ticket_classification,processed_text,relevant_topics
0,Good morning my name is XXXX XXXX and I apprec...,Debt collection + Credit card debt,morn name appreci chase bank cardmemb servic c...,Mortgage/Loan
1,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card + General-purpose ...,card anniversari date inform order account ann...,Mortgage/Loan


In [5]:
# Class distribution of the target column; it shows how imbalanced the topics are.
data["relevant_topics"].value_counts()

relevant_topics
Mortgage/Loan                    16376
Bank Account Services             2358
Credit Report or Prepaid Card      227
Name: count, dtype: int64

In [6]:
# Features: the `processed_text` column. Target: the topic name of each ticket
# (still a string label at this point; it is mapped to an index further down).
X_text, y_encoded = data["processed_text"], data["relevant_topics"]

In [7]:
def read_idx2label(json_path: str) -> dict:
    """Load the index-to-label mapping stored in a JSON file.

    Args:
        json_path (str): Path to the JSON file.

    Returns:
        dict: The parsed JSON content. When the file holds a JSON object,
            its keys are always ``str`` (JSON has no other key type).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale default.
    with open(json_path, encoding="utf-8") as f:
        return json.load(f)

# Load the index -> topic-name mapping; since it comes from JSON, its keys are strings.
idx2label = read_idx2label(json_path="../topic_mapping_1.json")

In [8]:
def decode_labels_into_idx(labels: pd.Series, idx2label: dict) -> pd.Series:
    """Translate every entry of ``labels`` through a lookup dictionary.

    Args:
        labels (pd.Series): Series holding the values to translate.
        idx2label (dict): Lookup table; each value of ``labels`` is used as a
            key and replaced by the associated dictionary value.

    Returns:
        pd.Series: Series of the same length with the translated values
            (entries without a matching key become NaN).
    """
    lookup_table = idx2label
    translated = labels.map(lookup_table)
    return translated

In [9]:
# Invert the mapping (topic name -> index) and use it to turn the topic names
# into class indices. The indices stay strings because they were JSON keys.
label2idx = dict(zip(idx2label.values(), idx2label.keys()))
y = decode_labels_into_idx(y_encoded, label2idx)

In [10]:
# Display the encoded target to sanity-check the mapping.
y

0        2
1        2
2        2
3        0
4        2
        ..
18956    2
18957    2
18958    2
18959    2
18960    2
Name: relevant_topics, Length: 18961, dtype: object

In [11]:
# Build TF-IDF features for the whole corpus.
# NOTE(review): the vectorizer is fitted before the train/test split, so its IDF
# statistics include the rows that later end up in the test set. The SVM cell
# further down fits its own vectorizer on the training split only.
vectorizer =TfidfVectorizer()
X = vectorizer.fit_transform(X_text)

In [12]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [13]:
# Most frequent class in the training labels; used below as the constant baseline prediction.
majority_class = y_train.value_counts().idxmax()
print(majority_class)

2


In [14]:
# Baseline that always predicts the majority class, whatever the input features are.
dummy_classifer = DummyClassifier(strategy="constant", constant=majority_class)
dummy_classifer.fit(X_train, y_train)

In [15]:
# Evaluate the constant baseline on the held-out split.
baseline_predictions = dummy_classifer.predict(X_test)
# The baseline never predicts the minority classes, so their precision is undefined;
# zero_division=0 reports it as 0.0 without emitting UndefinedMetricWarning.
print(classification_report(y_test, baseline_predictions, zero_division=0))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       705
           1       0.00      0.00      0.00        79
           2       0.86      1.00      0.93      4905

    accuracy                           0.86      5689
   macro avg       0.29      0.33      0.31      5689
weighted avg       0.74      0.86      0.80      5689



  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Count how many times each class appears among the baseline predictions.
unique_classes, counts = np.unique(baseline_predictions, return_counts=True)

for position, label in enumerate(unique_classes):
    count = counts[position]
    print(f'Clase: {label}, Cantidad de predicciones: {count}')

Clase: 2, Cantidad de predicciones: 5689


Las predicciones del modelo baseline pertenecen todas a la clase 2 (5689 de 5689), porque el clasificador constante siempre devuelve la clase mayoritaria del conjunto de entrenamiento. En los datos reales de prueba (y_test), en cambio, están presentes las tres clases (705, 79 y 4905 ejemplos de las clases 0, 1 y 2, respectivamente), aunque con un fuerte desbalance a favor de la clase 2.

Esta discrepancia entre las predicciones del modelo baseline y los datos reales subraya la simplicidad del modelo baseline y su incapacidad para capturar la complejidad del problema de clasificación. De hecho, su exactitud de 0.86 se debe únicamente al desbalance de clases: el recall de las clases 0 y 1 es 0.00 y el F1 macro es de solo 0.31. Es importante tener en cuenta que este modelo simple se usa principalmente como punto de referencia inicial y no refleja el rendimiento real de un modelo de clasificación más sofisticado. En problemas reales, se esperaría que un modelo más avanzado mejore significativamente estos resultados.

Si se necesita un modelo más preciso, es recomendable explorar modelos de clasificación más complejos, técnicas de ajuste de hiperparámetros o incluso modelos de aprendizaje profundo, dependiendo de la naturaleza y la complejidad de los datos.

In [17]:
# Build the TF-IDF representation of the text (gives more importance to terms with a higher weight).
# `svm` comes from the notebook's top import cell instead of being imported mid-notebook.
tfidf_vectorizer = TfidfVectorizer()

# Split the raw text first so that the vectorizer is fitted on the training rows only.
# NOTE(review): this rebinds X_train / X_test / y_train / y_test from the baseline cells
# with a different split (20% test instead of 30%) and raw text instead of TF-IDF rows.
X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=42)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialise and train the SVM model; class_weight='balanced' weights the classes
# inversely to their frequency to counter the class imbalance.
svm_classifier = svm.SVC(kernel='linear', class_weight='balanced', random_state=42)
svm_classifier.fit(X_train_tfidf, y_train)

# Predict on both the test and the training split.
predictions_test = svm_classifier.predict(X_test_tfidf)
predictions_train = svm_classifier.predict(X_train_tfidf)

# Measure the model's accuracy on the test split.
# NOTE(review): the printed label says "Precisión", but the value is accuracy, not precision.
accuracy = accuracy_score(y_test, predictions_test)
print(f"Precisión del modelo: {accuracy:.2f}")

# Show the per-class classification report.
print(classification_report(y_test, predictions_test))

Precisión del modelo: 0.64
              precision    recall  f1-score   support

           0       0.18      0.46      0.26       466
           1       0.10      0.22      0.13        54
           2       0.89      0.68      0.77      3273

    accuracy                           0.64      3793
   macro avg       0.39      0.45      0.39      3793
weighted avg       0.79      0.64      0.70      3793

