In [15]:
import warnings
warnings.filterwarnings('ignore')

# core libraries for loading and manipulating data
import pandas as pd
import numpy as np
import os

# metrics, imputation, encoding, and pipeline utilities
from sklearn.metrics import confusion_matrix, make_scorer, f1_score
from scipy.optimize import differential_evolution
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn import metrics
from feature_engine.encoding import MeanEncoder, RareLabelEncoder, CountFrequencyEncoder, OneHotEncoder
from feature_engine.selection import DropFeatures
from feature_engine.imputation import AddMissingIndicator, CategoricalImputer
from feature_engine.imputation import ArbitraryNumberImputer, MeanMedianImputer
import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.mixture import GaussianMixture
from src.ml.transformer import DataframeColumnDuplicateTransformer

# models to import 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier
import joblib

In [6]:
import os

# The notebook uses paths relative to the project root ('./data/...',
# 'models/...', 'score_*.csv'), so the working directory must point there.
# Set the ENTEL_PROJECT_DIR environment variable to run on another machine;
# when it is unset the original author's location is used, as before.
PROJECT_DIR = os.environ.get(
    'ENTEL_PROJECT_DIR',
    'C:\\Users\\2160001230\\OneDrive - Via Varejo S.A\\Área de Trabalho\\projeto\\entel-reto1-2022',
)
os.chdir(PROJECT_DIR)

In [10]:
# Read the feature book (single-part CSV export).
data_all = pd.read_csv(
    './data/transformed_data/entel_book_features_reto1.csv/part-00000-d8e0c342-dd9b-4fb3-bcba-72b4b31db409-c000.csv'
)

# Rows are split by NUMPERIODO: 202201-202203 are used for training,
# 202204 is the set that gets scored.
is_train_period = data_all['NUMPERIODO'].isin([202201, 202202, 202203])
is_test_period = data_all['NUMPERIODO'] == 202204

# drop() returns a new frame, so the source frame is never modified.
x_train = data_all.loc[is_train_period].drop(columns='TARGET')
# Selected with a list so y_train stays a one-column DataFrame.
y_train = data_all.loc[is_train_period, ['TARGET']]
x_test = data_all.loc[is_test_period].drop(columns='TARGET')

In [11]:
def train_get_score_cv(x_train, y_train, x_test, models, k, pipeline_engine, random_state = 199):
    """Cross-validate each model and return fold-averaged scores for ``x_test``.

    For every estimator in ``models`` the training rows are split with a
    shuffled ``StratifiedKFold``. On each fold the estimator is fit on the
    training part, scored with ROC AUC on the held-out part, and used to score
    ``x_test``. One line is printed per model with the AUC over all
    out-of-fold predictions, the mean per-fold AUC and its standard deviation.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series or single-column pandas.DataFrame
        Training labels, aligned with ``x_train`` by row position.
    x_test : pandas.DataFrame
        Rows to score; its index becomes the index of the returned Series.
    models : dict
        Maps a name to an estimator exposing ``fit`` and ``predict_proba``.
        Estimators are fit in place (they are not cloned), so after the call
        each one holds the fit from its last fold.
    k : int
        Number of folds.
    pipeline_engine : transformer or None
        Optional object with ``fit(X, y)`` / ``transform(X)``. When given it is
        re-fit on the training part of every fold and applied to the training,
        validation and test features before the estimator sees them.
    random_state : int, default 199
        Seed for the fold shuffling.

    Returns
    -------
    pandas.Series
        Mean over folds of ``predict_proba(...)[:, -1]`` on ``x_test``. When
        ``models`` has several entries, only the predictions of the LAST one
        are returned (the AUC line is still printed for each).

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    # Checked first: with no models there would be nothing to return.
    if not models:
        raise ValueError("models must contain at least one estimator")

    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)
    pred_test = None

    for learner in models.values():
        oof_pred = []        # held-out predictions, one Series per fold
        oof_true = []        # matching labels, in the same fold order
        fold_test_pred = []  # x_test predictions, one Series per fold
        roc_auc = []         # per-fold validation AUC

        for fold, (id_train, id_val) in enumerate(kf.split(x_train, y_train)):
            Xt, yt = x_train.iloc[id_train], y_train.iloc[id_train]
            Xv, yv = x_train.iloc[id_val], y_train.iloc[id_val]
            # Captured before any transform: a transformer may return an
            # ndarray, which has no ``.index``.
            val_index = Xv.index

            if pipeline_engine is not None:
                fitted_engine = pipeline_engine.fit(Xt, yt)
                Xt = fitted_engine.transform(Xt)
                Xv = fitted_engine.transform(Xv)
                xtest = fitted_engine.transform(x_test)
            else:
                # Hand the estimator a private copy so the caller's frame is
                # never exposed to in-place changes.
                xtest = x_test.copy()

            learner.fit(Xt, yt.values)

            prediction = pd.Series(learner.predict_proba(Xv)[:, -1], index=val_index)
            # Labels are taken by position (yv) rather than looked up again by
            # index label, so a non-unique index cannot misalign them.
            roc_auc.append(metrics.roc_auc_score(yv, prediction))

            prediction_test = pd.Series(learner.predict_proba(xtest)[:, -1],
                                        index=x_test.index,
                                        name="fold_" + str(fold))

            oof_pred.append(prediction)
            oof_true.append(yv)
            fold_test_pred.append(prediction_test)

        # Every training row is held out exactly once, so the concatenation
        # covers the full training set.
        AUC = metrics.roc_auc_score(pd.concat(oof_true), pd.concat(oof_pred))
        pred_test = pd.concat(fold_test_pred, axis=1).mean(axis=1)
        print(f'auc train oot: {AUC}, cv: {np.mean(roc_auc)} and std: {np.std(roc_auc)} ')

    return pred_test


In [12]:
# NOTE(review): random_state is assigned here but no statement in this cell reads it.
random_state = 123

# Identifier and period columns: ignored when detecting categorical features
# and removed by the last pipeline step.
to_drop = ['nro_telefono_hash', 'NUMPERIODO', 'nro_documento_hash']

# Categorical features = every remaining column with object dtype.
cat_var = (
    x_train
    .drop(columns=to_drop)
    .select_dtypes('object')
    .columns
    .to_list()
)

arbitrary_imputer = CategoricalImputer(variables=cat_var)
rare_encoder = RareLabelEncoder(
    tol=0.1,
    n_categories=2,
    variables=cat_var,
    replace_with=-999,
    ignore_format=True,
)
# Project-local transformer; presumably adds a '<name>_count' copy of each
# categorical column, which the CountEncoder below targets - TODO confirm.
duplicate_columns = DataframeColumnDuplicateTransformer(columns=cat_var)
mean_encoder = MeanEncoder(variables=cat_var, ignore_format=True)
count_encoder = ce.CountEncoder(cols=[f'{column_i}_count' for column_i in cat_var])
drop_Features = DropFeatures(features_to_drop=to_drop)

# Steps run in the listed order.
pipe = Pipeline(steps=[
    ('ReplaceNa', arbitrary_imputer),
    ('RareLabelEncoder', rare_encoder),
    ('DataframeFunctionTransformer', duplicate_columns),
    ('MeanEncoder', mean_encoder),
    ('CountEncoder', count_encoder),
    ('DropFeatures', drop_Features),
])


80 variables

In [8]:
# Candidate models for cross-validation, keyed by name. Each entry chains the
# preprocessing pipeline with a LightGBM classifier.
model_pipe = {
    'lgbm': Pipeline(steps=[
        ('pipe1', pipe),
        ('lgbm', LGBMClassifier(
            learning_rate=0.028811377742693784,
            n_estimators=1965,
            num_leaves=32,
            max_depth=3,
            reg_alpha=45.10805558634257,
            reg_lambda=51.407152629520105,
            min_data_in_leaf=155,
            max_bin=967,
            feature_fraction=0.9654714978974305,
        )),
    ]),
}

In [9]:
# 5-fold CV of the candidate models. Preprocessing is already part of each
# model pipeline, so no separate pipeline_engine is passed.
# Despite its name, y_test holds the fold-averaged predicted scores for x_test.
y_test = train_get_score_cv(
    x_train,
    y_train,
    x_test,
    models=model_pipe,
    k=5,
    pipeline_engine=None,
)

auc train oot: 0.7986489623314398, cv: 0.7986822450398033 and std: 0.003619471682540225 


In [10]:
# Attach the predicted scores (column 'TARGET') to the phone-number hash and
# use that hash as the index of the submission frame.
prediction = (
    x_test.loc[:, ['nro_telefono_hash']]
    .join(y_test.rename('TARGET'))
    .set_index('nro_telefono_hash')
)

In [11]:
# Write the cross-validated scores to disk (relative to the working directory).
prediction.to_csv(path_or_buf='score_v6.csv')

In [13]:
# Final model: the preprocessing pipeline followed by a seeded LightGBM
# classifier.
model = Pipeline(steps=[
    ('pipe1', pipe),
    ('lgbm', LGBMClassifier(
        random_state=12345,
        learning_rate=0.04124640777998017,
        n_estimators=976,
        num_leaves=14,
        max_depth=3,
        reg_alpha=44.82058359639354,
        reg_lambda=56.82345490694187,
        min_data_in_leaf=557,
        max_bin=1103,
    )),
])

In [14]:
# Fit the final pipeline on the whole training window. y_train is a one-column
# DataFrame, so .values passes the labels as a 2-D (n_rows, 1) array.
model.fit(X=x_train, y=y_train.values)



In [16]:
# Create the output folder if needed: joblib.dump does not create missing
# directories, so on a fresh checkout the dump would otherwise fail.
os.makedirs('models', exist_ok=True)
# Persist the fitted pipeline (preprocessing + classifier) as a single artifact.
joblib.dump(model, 'models/pipeline_model.pkl')

['models/pipeline_model.pkl']

In [15]:
# Score the test window with the final model. The last predict_proba column is
# used (presumably the positive class - TODO confirm against model.classes_).
test_scores = pd.Series(
    model.predict_proba(x_test)[:, -1],
    index=x_test.index,
    name='TARGET',
)

# Key the scores by phone-number hash and write them to disk.
prediction = (
    x_test[['nro_telefono_hash']]
    .join(test_scores)
    .set_index('nro_telefono_hash')
)
prediction.to_csv('score_v7.csv')