# Random Forest With Classification

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Cargar el dataset_
file_path = 'cleaned_final_description_not_nan.csv'
df = pd.read_csv(file_path)

# Filtrar clases de acc_classification con pocas muestras
min_samples = 15  # Define el mínimo de muestras por clase
class_counts = df['acc_classification'].value_counts()
filtered_classes = class_counts[class_counts >= min_samples].index
df = df[df['acc_classification'].isin(filtered_classes)]

# Asegurarse de que la columna unit_total sea de tipo entero
df['unit_total'] = df['unit_total'].astype(int)

# Crear una nueva columna uniendo company_tid y establishment_id
df['company_establishment'] = df['company_tid'].astype(str) + '_' + df['establishment_id'].astype(str)

# Definir las características (features) y la etiqueta (label)
X = df[['company_establishment', 'unit_total', 'classification']]
y = df['acc_classification']

# Dividir el dataset en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear el preprocesador usando OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['company_establishment', 'classification']),
        ('num', 'passthrough', ['unit_total'])
    ])

# Crear el pipeline con el preprocesador y el modelo de Random Forest
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Entrenar el modelo
pipeline.fit(X_train, y_train)

# Hacer predicciones
y_pred = pipeline.predict(X_test)

# Identificar los registros que no fueron clasificados correctamente
misclassified_mask = y_test != y_pred
misclassified = X_test[misclassified_mask].copy()
misclassified['actual'] = y_test[misclassified_mask].values
misclassified['predicted'] = y_pred[misclassified_mask]

# Guardar los registros que no fueron clasificados correctamente en un archivo CSV
misclassified.to_csv('misclassified_records.csv', index=False)

# Mostrar métricas de desempeño
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Accuracy: 0.6545454545454545
Classification Report:
              precision    recall  f1-score   support

        7758       1.00      1.00      1.00        18
        7762       0.60      0.63      0.61        68
        7763       0.43      0.23      0.30        13
        7765       0.93      0.76      0.84        50
        7771       0.33      0.10      0.15        10
        7800       0.75      0.88      0.81        24
        7801       0.50      0.86      0.63        14
        7803       1.00      0.17      0.29         6
        7804       0.56      0.59      0.58        68
        7821       0.80      1.00      0.89         4
        7825       0.29      0.33      0.31         6
        7830       0.60      0.80      0.69        15
        7833       0.80      0.40      0.53        20
       57663       0.50      0.90      0.64        10
       57671       0.80      1.00      0.89         4

    accuracy                           0.65       330
   macro avg       0.66     

# Random Forest Without Classification

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Cargar el dataset
file_path = 'cleaned_final_description_not_nan.csv'
df = pd.read_csv(file_path)

# Filtrar clases de acc_classification con pocas muestras
min_samples = 15  # Define el mínimo de muestras por clase
class_counts = df['acc_classification'].value_counts()
filtered_classes = class_counts[class_counts >= min_samples].index
df = df[df['acc_classification'].isin(filtered_classes)]

# Asegurarse de que la columna unit_total sea de tipo entero
df['unit_total'] = df['unit_total'].astype(int)

# Definir las características (features) y la etiqueta (label)
X = df[['company_tid', 'unit_total']]
y = df['acc_classification']

# Dividir el dataset en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear el preprocesador usando OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['company_tid']),
        ('num', 'passthrough', ['unit_total'])
    ])

# Crear el pipeline con el preprocesador y el modelo de Random Forest
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Entrenar el modelo
pipeline.fit(X_train, y_train)

# Hacer predicciones
y_pred = pipeline.predict(X_test)

# Identificar las clases presentes en y_test y y_pred
unique_classes = sorted(set(y_test) | set(y_pred))
target_names = [str(cls) for cls in unique_classes]

# Mostrar métricas de desempeño
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=target_names)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Accuracy: 0.6363636363636364
Classification Report:
              precision    recall  f1-score   support

        7758       1.00      1.00      1.00        18
        7762       0.64      0.65      0.64        68
        7763       0.40      0.31      0.35        13
        7765       0.93      0.78      0.85        50
        7771       0.00      0.00      0.00        10
        7800       0.65      0.83      0.73        24
        7801       0.47      0.57      0.52        14
        7803       1.00      0.17      0.29         6
        7804       0.59      0.60      0.59        68
        7821       0.50      0.50      0.50         4
        7825       0.00      0.00      0.00         6
        7830       0.56      0.67      0.61        15
        7833       0.67      0.50      0.57        20
       57663       0.50      0.90      0.64        10
       57671       0.80      1.00      0.89         4

    accuracy                           0.64       330
   macro avg       0.58     

# XGBoost WITHOUT CLASSIFICATION

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb

# Cargar el dataset
file_path = 'cleaned_final_description_not_nan.csv'
df = pd.read_csv(file_path)

# Filtrar clases de acc_classification con pocas muestras
min_samples = 15  # Define el mínimo de muestras por clase
class_counts = df['acc_classification'].value_counts()
filtered_classes = class_counts[class_counts >= min_samples].index
df = df[df['acc_classification'].isin(filtered_classes)]

# Asegurarse de que la columna unit_total sea de tipo entero
df['unit_total'] = df['unit_total'].astype(int)

# Codificar la columna de la variable objetivo
label_encoder_acc = LabelEncoder()
df['acc_classification'] = label_encoder_acc.fit_transform(df['acc_classification'])

# Definir las características (features) y la etiqueta (label)
X = df[['company_tid', 'unit_total']]
y = df['acc_classification']

# Dividir el dataset en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear el preprocesador usando OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['company_tid']),
        ('num', 'passthrough', ['unit_total'])
    ])

# Crear el pipeline con el preprocesador y el modelo XGBoost
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(n_estimators=100, random_state=42))
])

# Entrenar el modelo
pipeline.fit(X_train, y_train)

# Hacer predicciones
y_pred = pipeline.predict(X_test)

# Identificar las clases presentes en y_test y y_pred
unique_classes = sorted(set(y_test) | set(y_pred))
target_names = [str(label_encoder_acc.inverse_transform([cls])[0]) for cls in unique_classes]

# Mostrar métricas de desempeño
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=target_names)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Accuracy: 0.5969696969696969
Classification Report:
              precision    recall  f1-score   support

        7758       1.00      0.94      0.97        18
        7762       0.57      0.59      0.58        68
        7763       0.50      0.38      0.43        13
        7765       0.85      0.70      0.77        50
        7771       0.00      0.00      0.00        10
        7800       0.55      0.71      0.62        24
        7801       0.53      0.71      0.61        14
        7803       0.00      0.00      0.00         6
        7804       0.51      0.56      0.53        68
        7821       0.67      0.50      0.57         4
        7825       0.12      0.17      0.14         6
        7830       0.59      0.67      0.62        15
        7833       0.71      0.50      0.59        20
       57663       0.44      0.80      0.57        10
       57671       1.00      1.00      1.00         4

    accuracy                           0.60       330
   macro avg       0.54     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# XGBOOST WITH CLASSIFICATION

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb

# Cargar el dataset
file_path = 'cleaned_final_description_not_nan.csv'
df = pd.read_csv(file_path)

# Filtrar clases de acc_classification con pocas muestras
min_samples = 15  # Define el mínimo de muestras por clase
class_counts = df['acc_classification'].value_counts()
filtered_classes = class_counts[class_counts >= min_samples].index
df = df[df['acc_classification'].isin(filtered_classes)]

# Asegurarse de que la columna unit_total sea de tipo entero
df['unit_total'] = df['unit_total'].astype(int)

# Codificar la columna de la variable objetivo
label_encoder_acc = LabelEncoder()
df['acc_classification'] = label_encoder_acc.fit_transform(df['acc_classification'])

# Codificar la columna classification
label_encoder_classification = LabelEncoder()
df['classification'] = label_encoder_classification.fit_transform(df['classification'])

# Definir las características (features) y la etiqueta (label)
X = df[['company_tid', 'classification', 'unit_total']]
y = df['acc_classification']

# Dividir el dataset en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear el preprocesador usando OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['company_tid', 'classification']),
        ('num', 'passthrough', ['unit_total'])
    ])

# Crear el pipeline con el preprocesador y el modelo XGBoost
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(n_estimators=100, random_state=42))
])

# Entrenar el modelo
pipeline.fit(X_train, y_train)

# Hacer predicciones
y_pred = pipeline.predict(X_test)

# Identificar las clases presentes en y_test y y_pred
unique_classes = sorted(set(y_test) | set(y_pred))
target_names = [str(label_encoder_acc.inverse_transform([cls])[0]) for cls in unique_classes]

# Mostrar métricas de desempeño
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=target_names)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Accuracy: 0.6151515151515151
Classification Report:
              precision    recall  f1-score   support

        7758       1.00      0.94      0.97        18
        7762       0.59      0.57      0.58        68
        7763       0.44      0.31      0.36        13
        7765       0.83      0.76      0.79        50
        7771       0.67      0.20      0.31        10
        7800       0.55      0.75      0.63        24
        7801       0.60      0.64      0.62        14
        7803       1.00      0.67      0.80         6
        7804       0.54      0.59      0.56        68
        7821       0.50      0.50      0.50         4
        7825       0.12      0.17      0.14         6
        7830       0.50      0.53      0.52        15
        7833       0.62      0.40      0.48        20
       57663       0.53      0.90      0.67        10
       57671       0.80      1.00      0.89         4

    accuracy                           0.62       330
   macro avg       0.62     

# SVM WITH CLASSIFICATION

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC

# Cargar el dataset
file_path = 'cleaned_final_description_not_nan.csv'
df = pd.read_csv(file_path)

# Filtrar clases de acc_classification con pocas muestras
min_samples = 15  # Define el mínimo de muestras por clase
class_counts = df['acc_classification'].value_counts()
filtered_classes = class_counts[class_counts >= min_samples].index
df = df[df['acc_classification'].isin(filtered_classes)]

# Asegurarse de que la columna unit_total sea de tipo entero
df['unit_total'] = df['unit_total'].astype(int)

# Codificar la columna de la variable objetivo
label_encoder_acc = LabelEncoder()
df['acc_classification'] = label_encoder_acc.fit_transform(df['acc_classification'])

# Codificar la columna classification
label_encoder_classification = LabelEncoder()
df['classification'] = label_encoder_classification.fit_transform(df['classification'])

# Definir las características (features) y la etiqueta (label)
X = df[['company_tid', 'classification', 'unit_total']]
y = df['acc_classification']

# Dividir el dataset en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear el preprocesador usando OneHotEncoder y StandardScaler
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['company_tid', 'classification']),
        ('num', StandardScaler(), ['unit_total'])
    ])

# Crear el pipeline con el preprocesador y el modelo SVM
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(kernel='linear', random_state=42))
])

# Entrenar el modelo
pipeline.fit(X_train, y_train)

# Hacer predicciones
y_pred = pipeline.predict(X_test)

# Identificar las clases presentes en y_test y y_pred
unique_classes = sorted(set(y_test) | set(y_pred))
target_names = [str(label_encoder_acc.inverse_transform([cls])[0]) for cls in unique_classes]

# Mostrar métricas de desempeño
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=target_names)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Accuracy: 0.5878787878787879
Classification Report:
              precision    recall  f1-score   support

        7758       0.90      1.00      0.95        18
        7762       0.60      0.82      0.70        68
        7763       0.38      0.62      0.47        13
        7765       0.72      0.36      0.48        50
        7771       1.00      0.10      0.18        10
        7800       0.35      0.88      0.50        24
        7801       0.56      0.36      0.43        14
        7803       1.00      0.67      0.80         6
        7804       0.69      0.40      0.50        68
        7821       0.00      0.00      0.00         4
        7825       0.00      0.00      0.00         6
        7830       0.77      0.67      0.71        15
        7833       0.63      0.60      0.62        20
       57663       0.53      1.00      0.69        10
       57671       1.00      1.00      1.00         4

    accuracy                           0.59       330
   macro avg       0.61     

# Regresión Logística

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression

# Cargar el dataset
file_path = 'cleaned_final_description_not_nan.csv'
df = pd.read_csv(file_path)

# Filtrar clases de acc_classification con pocas muestras
min_samples = 15  # Define el mínimo de muestras por clase
class_counts = df['acc_classification'].value_counts()
filtered_classes = class_counts[class_counts >= min_samples].index
df = df[df['acc_classification'].isin(filtered_classes)]

# Asegurarse de que la columna unit_total sea de tipo entero
df['unit_total'] = df['unit_total'].astype(int)

# Codificar la columna de la variable objetivo
label_encoder_acc = LabelEncoder()
df['acc_classification'] = label_encoder_acc.fit_transform(df['acc_classification'])

# Codificar la columna classification
label_encoder_classification = LabelEncoder()
df['classification'] = label_encoder_classification.fit_transform(df['classification'])

# Definir las características (features) y la etiqueta (label)
X = df[['company_tid', 'classification', 'unit_total']]
y = df['acc_classification']

# Dividir el dataset en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear el preprocesador usando OneHotEncoder y StandardScaler
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['company_tid', 'classification']),
        ('num', StandardScaler(), ['unit_total'])
    ])

# Crear el pipeline con el preprocesador y el modelo Logistic Regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# Entrenar el modelo
pipeline.fit(X_train, y_train)

# Hacer predicciones
y_pred = pipeline.predict(X_test)

# Identificar las clases presentes en y_test y y_pred
unique_classes = sorted(set(y_test) | set(y_pred))
target_names = [str(label_encoder_acc.inverse_transform([cls])[0]) for cls in unique_classes]

# Mostrar métricas de desempeño
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=target_names)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Accuracy: 0.5757575757575758
Classification Report:
              precision    recall  f1-score   support

        7758       1.00      1.00      1.00        18
        7762       0.58      0.82      0.68        68
        7763       0.38      0.62      0.47        13
        7765       0.70      0.38      0.49        50
        7771       1.00      0.10      0.18        10
        7800       0.34      0.83      0.49        24
        7801       0.56      0.36      0.43        14
        7803       1.00      0.67      0.80         6
        7804       0.60      0.46      0.52        68
        7821       0.00      0.00      0.00         4
        7825       0.00      0.00      0.00         6
        7830       0.75      0.40      0.52        15
        7833       0.80      0.40      0.53        20
       57663       0.53      1.00      0.69        10
       57671       1.00      1.00      1.00         4

    accuracy                           0.58       330
   macro avg       0.62     

# Light GBM

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import lightgbm as lgb

# Cargar el dataset
file_path = 'cleaned_final_description_not_nan.csv'
df = pd.read_csv(file_path)

# Filtrar clases de acc_classification con pocas muestras
min_samples = 15  # Define el mínimo de muestras por clase
class_counts = df['acc_classification'].value_counts()
filtered_classes = class_counts[class_counts >= min_samples].index
df = df[df['acc_classification'].isin(filtered_classes)]

# Asegurarse de que la columna unit_total sea de tipo entero
df['unit_total'] = df['unit_total'].astype(int)

# Codificar la columna de la variable objetivo
label_encoder_acc = LabelEncoder()
df['acc_classification'] = label_encoder_acc.fit_transform(df['acc_classification'])

# Codificar la columna classification
label_encoder_classification = LabelEncoder()
df['classification'] = label_encoder_classification.fit_transform(df['classification'])

# Definir las características (features) y la etiqueta (label)
X = df[['company_tid', 'classification', 'unit_total']]
y = df['acc_classification']

# Dividir el dataset en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear el preprocesador usando OneHotEncoder y StandardScaler
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['company_tid', 'classification']),
        ('num', StandardScaler(), ['unit_total'])
    ])

# Crear el pipeline con el preprocesador y el modelo LightGBM
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lgb.LGBMClassifier(random_state=42))
])

# Entrenar el modelo
pipeline.fit(X_train, y_train)

# Hacer predicciones
y_pred = pipeline.predict(X_test)

# Identificar las clases presentes en y_test y y_pred
unique_classes = sorted(set(y_test) | set(y_pred))
target_names = [str(label_encoder_acc.inverse_transform([cls])[0]) for cls in unique_classes]

# Mostrar métricas de desempeño
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=target_names)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 1316, number of used features: 21
[LightGBM] [Info] Start training from score -2.962844
[LightGBM] [Info] Start training from score -1.668923
[LightGBM] [Info] Start training from score -3.250526
[LightGBM] [Info] Start training from score -1.972866
[LightGBM] [Info] Start training from score -3.398162
[LightGBM] [Info] Start training from score -2.165072
[LightGBM] [Info] Start training from score -2.962844
[LightGBM] [Info] Start training from score -4.985128
[LightGBM] [Info] Start training from score -1.793280
[LightGBM] [Info] Start training from score -4.004298
[LightGBM] [Info] Start training from score -3.850148
[LightGBM] [Info] Start training from score -2.851619
[LightGBM] [Info] Start training from score -2.948246
[LightGBM] 