In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report

In [2]:
import sys
# Add the parent directory to the module search path so that modules located
# one level above this notebook can be imported.
sys.path.append('..')

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

In [4]:
class PerCatLimTransform(BaseEstimator, TransformerMixin):
    """Collapse infrequent levels of categorical columns into the label 'OTRO'.

    ``fit`` inspects every column of ``X`` whose dtype is
    ``pd.CategoricalDtype`` and stores a list of accepted values for it, taken
    from the most frequent levels (missing values are counted as a level).
    ``transform`` replaces every non-missing value of those columns that is
    not in the accepted list with the string 'OTRO'. Other columns are left
    untouched.

    Parameters
    ----------
    percentage_limit : float
        Strictly between 0 and 1. Levels are accepted, most frequent first,
        while their cumulative relative frequency (rounded to 4 decimals)
        stays strictly below this limit.
    number_allowed_classes : int
        Maximum number of accepted levels per column. A column with fewer
        distinct levels than this keeps all of them.

    Attributes
    ----------
    dict_accepted_values : dict
        Maps column name -> list of accepted values. Rebuilt on every ``fit``.
    """

    def __init__(self, percentage_limit: float, number_allowed_classes: int):

        assert 0 < percentage_limit < 1, 'precentage_limit needs to be between 0 and 1'
        assert isinstance(number_allowed_classes, int), 'number_allowed_classes needs to be an int'

        self.percentage_limit = percentage_limit
        self.number_allowed_classes = number_allowed_classes
        # Present from construction so an unfitted instance still exposes the
        # attribute; ``fit`` replaces it with a fresh dict.
        self.dict_accepted_values = {}

    def fit(self, X, y=None):
        """Learn the accepted values of every categorical column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Only columns with a categorical dtype are considered.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Build into a fresh dict so refitting does not keep columns learned
        # from a previous call.
        accepted_by_column = {}

        for col in X.columns:
            if not isinstance(X[col].dtype, pd.CategoricalDtype):
                continue

            # Cumulative share of rows covered by the levels, most frequent
            # first. Series.round avoids depending on a module-level numpy
            # import.
            cumulative_share = (
                X[col]
                .value_counts(
                    normalize=True,
                    sort=True,
                    ascending=False,
                    dropna=False
                )
                .cumsum(skipna=False)
                .round(4)
            )

            if len(cumulative_share.index) < self.number_allowed_classes:
                allowed_classes = cumulative_share.index.to_list()
            else:
                # Levels whose cumulative share is still below the limit. The
                # list may be empty when the most frequent level alone reaches
                # the limit; then every non-missing value maps to 'OTRO'.
                within_limit = list(
                    cumulative_share[cumulative_share < self.percentage_limit].index
                )
                # Slicing enforces the cap exactly and is safe on an empty or
                # short list.
                allowed_classes = within_limit[:self.number_allowed_classes]

            accepted_by_column[col] = allowed_classes

        self.dict_accepted_values = accepted_by_column
        return self

    def transform(self, X):
        """Return a copy of ``X`` with non-accepted values replaced by 'OTRO'.

        Only columns that have an entry in ``dict_accepted_values`` are
        modified; missing values are preserved as they are.
        """
        output_df = X.copy()

        for col in X.columns:
            if col not in self.dict_accepted_values:
                continue

            # Build the lookup set once per column instead of once per value.
            accepted = set(self.dict_accepted_values[col])

            def keep_or_other(value, accepted=accepted):
                if pd.isna(value) or value in accepted:
                    return value
                return 'OTRO'

            # Replace the whole column rather than writing into the existing
            # one, so the result is not forced into the original column dtype.
            output_df[col] = X[col].apply(keep_or_other)

        return output_df

In [64]:
def _read_client_csv(csv_path):
    """Read a CSV file and use its 'ID_CLIENT' column as the index."""
    frame = pd.read_csv(filepath_or_buffer=csv_path)
    return frame.set_index('ID_CLIENT')


# Training and test splits, features and target, all indexed by client id.
X_train = _read_client_csv('../data/training_data_features.csv')
y_train = _read_client_csv('../data/training_data_target.csv')
X_test = _read_client_csv('../data/test_data_features.csv')
y_test = _read_client_csv('../data/test_data_target.csv')

In [6]:
# Load the persisted preprocessing objects from disk; only their `transform`
# method is used below, so they are presumably already fitted — TODO confirm.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files that
# come from a trusted source.
column_transformer= joblib.load('../model/column_transformer.pkl')
categorical_delimiter= joblib.load('../model/categorical_delimiter.pkl')

In [7]:
# Display the loaded object to inspect it.
categorical_delimiter

In [8]:
# First preprocessing step: run the training features through the loaded
# categorical delimiter.
X_train_res= categorical_delimiter.transform(X_train)

In [27]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV

In [10]:
# IPython introspection (`obj?`): shows the RandomForestClassifier docstring.
# Development aid only; this syntax is not valid outside IPython.
RandomForestClassifier?


In [22]:
# Candidate hyper-parameter values sampled by the random-forest search.
param_grid_rf = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 3, 6],
    'n_estimators': [50, 100, 150],
    'random_state': [42],
    'class_weight': ["balanced"],
}

In [23]:
# Randomized search over param_grid_rf: 10 sampled candidates, each scored by
# recall with 5-fold cross-validation, using all available cores.
random_search_rf = RandomizedSearchCV(
    RandomForestClassifier(),
    param_grid_rf,
    n_iter=10,
    scoring="recall",
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=5,
)

In [24]:
# Second preprocessing step: apply the loaded column transformer to the output
# of the categorical delimiter.
X_train_res_trans= column_transformer.transform(X_train_res)

In [49]:
import numpy as np

In [65]:
# y_train is a DataFrame (read from CSV, indexed by ID_CLIENT). scikit-learn
# expects a 1-D target and warns on a column vector, so a single-column frame
# is squeezed to a Series; a frame with several columns is passed unchanged.
random_search_rf.fit(X_train_res_trans, y_train.squeeze("columns"))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [112]:
# Keep the best random forest found by the randomized search.
rf_best= random_search_rf.best_estimator_

In [115]:
# Build the transformed test features here so this cell does not rely on
# X_test_res_trans having been created by a cell further down the notebook.
X_test_res = categorical_delimiter.transform(X_test)
X_test_res_trans = column_transformer.transform(X_test_res)

# Class probabilities and hard class predictions of the best random forest.
y_proba_rf = rf_best.predict_proba(X_test_res_trans)
y_pred_rf = rf_best.predict(X_test_res_trans)

In [116]:
# Per-class precision / recall / F1 of the random forest on the test set.
# classification_report is imported in the notebook's first import cell so
# this cell also works when the notebook is run from top to bottom.
rf_report = classification_report(
    y_true=y_test,
    y_pred=y_pred_rf,
)
print(rf_report)

              precision    recall  f1-score   support

           0       0.80      0.56      0.66      7336
           1       0.33      0.60      0.43      2664

    accuracy                           0.57     10000
   macro avg       0.57      0.58      0.54     10000
weighted avg       0.67      0.57      0.60     10000



In [66]:
from xgboost import XGBClassifier

In [80]:
# Candidate hyper-parameter values sampled by the XGBoost search.
param_grid_xgboost = {
    'n_estimators': [50, 100, 150],
    'max_depth': [1, 3, 6],
    'learning_rate': [0.05, 0.1, 0.3, 0.5],
    'random_state': [42],
    'scale_pos_weight': [3.5, 10],
}

In [117]:
# Randomized search over param_grid_xgboost: 10 sampled candidates, each
# scored by ROC AUC with 5-fold cross-validation, using all available cores.
random_search_xgboost = RandomizedSearchCV(
    XGBClassifier(),
    param_grid_xgboost,
    n_iter=10,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=5,
)

In [118]:
# Run the XGBoost randomized search on the preprocessed training data.
random_search_xgboost.fit(X_train_res_trans, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [129]:
# Cross-validated score of the best candidate (scoring is "roc_auc").
random_search_xgboost.best_score_

0.6304164918523417

In [130]:
# Hyper-parameter combination that achieved the best score.
random_search_xgboost.best_params_

{'scale_pos_weight': 3.5,
 'random_state': 42,
 'n_estimators': 100,
 'max_depth': 3,
 'learning_rate': 0.1}

In [106]:
from sklearn.metrics import roc_auc_score, classification_report

In [121]:
# Keep the best XGBoost model found by the randomized search.
xgboost_best= random_search_xgboost.best_estimator_

In [122]:
# Apply the same two preprocessing steps to the test features as were applied
# to the training features: categorical delimiter first, then the column
# transformer.
X_test_res= categorical_delimiter.transform(X_test)
X_test_res_trans= column_transformer.transform(X_test_res)



In [123]:
# Hard class predictions and class probabilities of the best XGBoost model on
# the test set. Only y_predict is used in the cells shown below.
y_predict= xgboost_best.predict(X_test_res_trans)
y_predict_proba= xgboost_best.predict_proba(X_test_res_trans)

In [124]:
# Per-class precision / recall / F1 of the XGBoost model on the test set.
print(classification_report(y_pred= y_predict, y_true= y_test))

              precision    recall  f1-score   support

           0       0.84      0.37      0.51      7336
           1       0.32      0.80      0.45      2664

    accuracy                           0.48     10000
   macro avg       0.58      0.59      0.48     10000
weighted avg       0.70      0.48      0.50     10000



From now on, we will keep this XGBoost model for use in the API. Later we will come back and improve it.

In [135]:
import joblib

In [140]:
# Persist the selected XGBoost model to disk.
joblib.dump(xgboost_best, '../model/xgboost_predictor.pkl')

['../model/xgboost_predictor.pkl']