In [1]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import src.utils.helper_functions as helpers
import src.utils.general_path as general_path

import pandas as pd
import numpy as np
import pprint as pp
import joblib
from collections import defaultdict
from sklearn.model_selection import train_test_split

# Carga de Datos

In [3]:
data = pd.read_csv('../'+general_path.PROCESSED_DATA_PATH + 'processed_data.csv')
data.head()

Unnamed: 0,label,text
0,1,presenta historia clinica cehani fechada indic...
1,1,usuaria cosmitet ltda refiere doctora aithza c...
2,1,usuario afiliado nueva eps regimen contributiv...
3,1,pasado enero radique informe respectivos entes...
4,1,denuncia barrera administrativa eapb salud tot...


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28817 entries, 0 to 28816
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   28817 non-null  int64 
 1   text    28814 non-null  object
dtypes: int64(1), object(1)
memory usage: 450.4+ KB


In [5]:
data = helpers.df_preprocess(data)
data.info()

Elementos existentes: 28817
Elementos restantes: 28814
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28814 entries, 0 to 28813
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   28814 non-null  int64 
 1   text    28814 non-null  object
dtypes: int64(1), object(1)
memory usage: 450.3+ KB


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'],
    test_size = 0.2,
    random_state = 42,
    stratify = data['label'])

In [7]:
print(f'Tamaño X_train: {X_train.shape}')
print(f'Tamaño X_test: {X_test.shape}')
print(f'Tamaño y_train: {y_train.shape}')
print(f'Tamaño y_test: {y_test.shape}')

Tamaño X_train: (23051,)
Tamaño X_test: (5763,)
Tamaño y_train: (23051,)
Tamaño y_test: (5763,)


In [8]:
tfidf_vectorizer = helpers.get_object('../'+general_path.OBJECTS_PATH,'tfidf_vectorizer.pkl')
bow = helpers.get_object('../'+general_path.OBJECTS_PATH,'count_vectorizer.pkl')

Objeto cargado desde: ../src/objects/
Objeto cargado desde: ../src/objects/


In [9]:
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

X_train_bow = bow.transform(X_train)
X_test_bow = bow.transform(X_test)

# Modelamiento

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, make_scorer

from xgboost import XGBClassifier

In [11]:
def apply_grid_search_model(model_name, models_dict, models_result, X, y, X_test, y_test, scoring, value_metric):
    for model_key, model_value in models_dict.items():
      if model_key == model_name:
        grid_search = GridSearchCV(
            estimator=model_value[0],
            param_grid=model_value[1],
            # cv=3,
            scoring=scoring,
            refit='f1')

        grid_search.fit(X, y)

        y_pred = grid_search.best_estimator_.predict(X_test)

        models_result[model_key] = {
            'train' : {
                'accuracy' :  round(grid_search.cv_results_['mean_test_accuracy'][0],2),
                'precision' :  round(grid_search.cv_results_['mean_test_precision'][0],2),
                'recall' :  round(grid_search.cv_results_['mean_test_recall'][0],2),
                'f1' :  round(grid_search.cv_results_['mean_test_f1'][0],2)
            },

            'test' :{
                'accuracy' :  round(accuracy_score(y_test,y_pred),2),
                'precision' :  round(precision_score(y_test,y_pred, average='weighted'),2),
                'recall' :  round(recall_score(y_test,y_pred, average='weighted'),2),
                'f1' :  round(f1_score(y_test,y_pred, average='weighted'),2)
            }

        }

        models_dict[model_key] = [
            grid_search.best_estimator_,
            grid_search.best_params_
        ]

        model_name = f"recall_{model_key}_train_{str(models_result[model_key]['train'][value_metric])}_test_{str(models_result[model_key]['test'][value_metric])}"
        joblib.dump(grid_search.best_estimator_, f'{'../'+general_path.MODELS_PATH}{model_name}.joblib')

        helpers.save_log('../'+ general_path.LOGS_PATH + 'log.txt', model_name, grid_search.best_params_)


    return models_dict, models_result

In [12]:
param_grid_clasificacion = {
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7]
    },
    'RandomForestClassifier': {
        'n_estimators': [10, 50, 100]
    },
    'XGBClassifier': {
        'n_estimators': [50]
    }
}

param_grid_clasificacion = {
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7]
    },
    'RandomForestClassifier': {
        'n_estimators': [10, 50, 100, 150]
    },
    'XGBClassifier': {
        'n_estimators': [50, 60, 70, 80, 90, 100]
    }
}


param_grid_clasificacion = {
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'brute']
    },
    'DecisionTreeClassifier': {
        'max_depth': [None, 5, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
    },
    'SVC': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto']
    },
    'GaussianNB': {
        'var_smoothing': [1e-9, 1e-8, 1e-7]
    },
    'MLPClassifier': {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
        'activation': ['logistic', 'tanh', 'relu'],
        'solver': ['lbfgs', 'sgd', 'adam'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'invscaling', 'adaptive']
    },
    'XGBClassifier': {
        'n_estimators': [50, 60, 70, 80, 90, 100],
        'max_depth': [3, 6, 10],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }
}


In [13]:
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

# Entrenamiento

## TF-IDF

In [15]:
models_tfidf_clasificacion = {
    'KNeighborsClassifier': [KNeighborsClassifier(), param_grid_clasificacion['KNeighborsClassifier']],
    'RandomForestClassifier': [RandomForestClassifier(), param_grid_clasificacion['RandomForestClassifier']],
    'XGBClassifier': [XGBClassifier(use_label_encoder=False, eval_metric='logloss'), param_grid_clasificacion['XGBClassifier']],
    'DecisionTreeClassifier': [DecisionTreeClassifier(), param_grid_clasificacion['DecisionTreeClassifier']],
    'SVC': [SVC(), param_grid_clasificacion['SVC']],
    'GaussianNB': [GaussianNB(), param_grid_clasificacion['GaussianNB']],
    'MLPClassifier': [MLPClassifier(), param_grid_clasificacion['MLPClassifier']],
}

In [16]:
tdidf_models_result = {}

In [17]:
models_tfidf_clasificacion, tdidf_models_result = apply_grid_search_model(
    'SVC',
    models_tfidf_clasificacion,
    tdidf_models_result,
    X_train_tfidf,
    y_train,
    X_test_tfidf,
    y_test,
    scoring,
    'recall')

pp.pprint(tdidf_models_result)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

{'SVC': {'test': {'accuracy': 0.7,
                  'f1': np.float64(0.7),
                  'precision': np.float64(0.7),
                  'recall': np.float64(0.7)},
         'train': {'accuracy': np.float64(0.65),
                   'f1': np.float64(0.64),
                   'precision': np.float64(0.68),
                   'recall': np.float64(0.65)}}}


In [None]:
def get_models_models_weights(models_results, dataset, metric):
    score_sum = sum(value[dataset][metric] for value in models_results.values())
    models_weights = {key: round((value[dataset][metric] / score_sum) * 100,2) for key, value in models_results.items()}
    return models_weights

In [None]:
def get_weighted_prediction(models_result, models, dataset, metric, X_test):

    models_weights = get_models_models_weights(models_result, dataset, metric)

    y_pred = []

    for pos in range (0, X_test.shape[0]):

        weighted_prediction = defaultdict(float)

        for key, (model, scores) in models.items():

            prediccion = model.predict(X_test[pos])[0]
            weighted_prediction[prediccion] += models_weights[key]

        final_label = max(weighted_prediction, key=weighted_prediction.get)

        y_pred.append(final_label)

    return np.array(y_pred)

In [None]:
# y_weighteds_predictions = get_weighted_prediction(tdidf_models_result, models_tfidf_clasificacion, 'test', 'f1', X_test_tfidf)
# print(y_weighteds_predictions.shape)

In [None]:
# f1_value = f1_score(y_true = y_encoder_test, y_pred = y_weighteds_predictions, average='weighted')
# print(round(f1_value,2))