In [1]:
import os
import sys
from pathlib import Path
# Make the project root importable so that `from scripts import evaluation`
# resolves. The notebook addresses project files as '../data' and '../model',
# so the root is taken to be the parent of the working directory rather than
# a machine-specific absolute path.
# NOTE(review): assumes the notebook is run from a folder located directly
# under the project root -- confirm.
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    # Guard keeps sys.path free of duplicates when the cell is re-run
    sys.path.append(str(PROJECT_ROOT))

import joblib
import numpy as np
# Libraries importing
import matplotlib.pyplot as plt

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler

#from lightgbm import LGBMClassifier
#import xgboost as xgb

from scripts import evaluation
SEED = 1 # Global constant: random_state passed to both train_test_split calls


In [2]:
# Load the raw modeling dataset; the first CSV column is used as the index
data = pd.read_csv(
    '../data/raw_modeling_data.csv',
    index_col=0,
)
# Preview the first ten rows of the loaded frame
data.head(n=10)

Unnamed: 0_level_0,PAYMENT_DAY,APPLICATION_SUBMISSION_TYPE,POSTAL_ADDRESS_TYPE,SEX,MARITAL_STATUS,QUANT_DEPENDANTS,STATE_OF_BIRTH,NACIONALITY,RESIDENCIAL_STATE,FLAG_RESIDENCIAL_PHONE,...,COMPANY,PROFESSIONAL_STATE,FLAG_PROFESSIONAL_PHONE,PROFESSIONAL_PHONE_AREA_CODE,MONTHS_IN_THE_JOB,PROFESSION_CODE,OCCUPATION_TYPE,PRODUCT,AGE,TARGET_LABEL_BAD=1
ID_CLIENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,WEB,1,F,6.0,1,RN,1,RN,Y,...,N,NO_JOB,N,NO_DATA,0,9.0,4.0,1,32,1
2,15,CARGA,1,F,2.0,0,RJ,1,RJ,Y,...,Y,NO_JOB,N,NO_DATA,0,11.0,4.0,1,34,1
3,5,WEB,1,F,2.0,0,RN,1,RN,Y,...,N,NO_JOB,N,NO_DATA,0,11.0,,1,27,0
4,20,WEB,1,F,2.0,0,PE,1,PE,N,...,N,NO_JOB,N,NO_DATA,0,,,1,61,0
5,10,WEB,1,M,2.0,0,RJ,1,RJ,Y,...,N,NO_JOB,N,NO_DATA,0,9.0,5.0,1,48,1
6,10,,1,M,2.0,0,MG,1,MG,Y,...,Y,MG,N,NO_DATA,0,9.0,2.0,2,40,1
7,15,CARGA,1,F,2.0,2,BA,1,BA,Y,...,N,NO_JOB,N,NO_DATA,0,11.0,4.0,1,40,1
8,25,WEB,1,F,1.0,0,MG,1,SP,N,...,Y,SP,Y,5,0,11.0,2.0,1,28,0
9,15,,1,F,1.0,0,SP,1,SP,Y,...,Y,NO_JOB,N,NO_DATA,0,0.0,2.0,2,31,0
10,5,,1,F,1.0,0,RS,1,RS,Y,...,Y,RS,Y,54,0,9.0,2.0,1,41,0


In [3]:
# Read the numerical and categorical feature names from their txt files
def _read_feature_names(path):
    """Return the feature names stored one per line in the text file `path`.

    The trailing newline of every line is dropped and empty lines are
    skipped, so a blank line (for instance at the end of the file) never
    yields an empty feature name.
    """
    with open(path, 'r') as f:
        names = (line.rstrip('\n') for line in f)
        return [name for name in names if name]


num_features = _read_feature_names('../data/numerical_features.txt')
cat_features = _read_feature_names('../data/categorical_features.txt')

In [4]:
# Convert dtypes to the right ones
# First use the dtypes suggested by pandas
data = data.convert_dtypes()
# Remove the target variable from the categorical features. Filtering (rather
# than list.remove) keeps this cell re-runnable: a second run is a no-op
# instead of raising ValueError because the name is already gone.
cat_features = [col for col in cat_features if col != 'TARGET_LABEL_BAD=1']
# Then cast every listed categorical feature: first to string, then to
# a pandas Categorical
for col in cat_features:
    data[col] = pd.Categorical(data[col].astype('string'))

# The dtype conversion above may leave pd.NA as the missing-value marker in
# integer columns; casting those columns to float32 turns the marker into
# np.nan.
cols_to_float = data.select_dtypes(include='int').columns
data[cols_to_float] = data[cols_to_float].astype(dtype='float32')

In [5]:
# Separate the label column from the feature columns
y = data['TARGET_LABEL_BAD=1']
X = data.drop(columns=['TARGET_LABEL_BAD=1'])

# Both splits share the same options: 20% held out, seeded for reproducibility
split_options = dict(test_size=0.2, random_state=SEED)

# First split: full training portion vs. test set
X_train_0, X_test, y_train_0, y_test = train_test_split(X, y, **split_options)
# Second split: the training portion is divided into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_0, y_train_0, **split_options
)

In [6]:
## Build processing pipeline
# Numerical branch: median imputation followed by robust scaling
num_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
]
num_transformer = Pipeline(steps=num_steps)

# Categorical branch: pd.NA entries are filled with the most frequent value,
# then the column is one-hot encoded into a dense array. Binary features keep
# a single column and categories unseen during fit are ignored.
cat_imputer = SimpleImputer(missing_values=pd.NA, strategy='most_frequent')
cat_encoder = OneHotEncoder(
    drop='if_binary',
    handle_unknown='ignore',
    sparse=False,
)
cat_transformer = Pipeline(
    steps=[('imputer2', cat_imputer), ('encoder', cat_encoder)]
)

# Route each feature list to its own branch
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ]
)

In [7]:
# Save the feature splits to csv before any pre-processing is applied.
# Each file receives its own split: the test file holds X_test and the
# train file holds X_train.
X_test.to_csv(
    path_or_buf= "../data/X_test_unprepossessing.csv"
)
X_train.to_csv(
    path_or_buf= "../data/X_train_unprepossessing.csv"
)

In [8]:
# Fit the ColumnTransformer 'preprocessor' on the training split only ...
X_train_pre = preprocessor.fit_transform(X_train)
# ... then apply the fitted transformer, in this order, to the full training
# portion, the validation split and the test split
X_train_0_pre, X_val_pre, X_test_pre = (
    preprocessor.transform(split) for split in (X_train_0, X_val, X_test)
)



In [9]:
# Save the fitted preprocessor to a file. The target directory is created
# first so the dump does not fail when '../model' does not exist yet.
os.makedirs('../model', exist_ok=True)
joblib.dump(preprocessor, '../model/preprocessor.pkl')

['../model/preprocessor.pkl']

In [13]:
# Round-trip check: reload the persisted preprocessor and verify that it
# reproduces the in-memory transform of the training split
preprocessor_loaded = joblib.load('../model/preprocessor.pkl')
var = preprocessor_loaded.transform(X=X_train)
assert np.allclose(var, X_train_pre, equal_nan=True), \
    "Reloaded preprocessor output differs from X_train_pre"

In [14]:
# Report the shape of every preprocessed split, one per line
for processed in (X_train_0_pre, X_train_pre, X_val_pre, X_test_pre):
    print(processed.shape)

(40000, 306)
(32000, 306)
(8000, 306)
(10000, 306)


In [11]:
# Total number of feature names handed to the preprocessor (both lists)
len(cat_features + num_features)

32

In [12]:
# Save data to csv
# Each entry pairs a destination file with the object written to it; the
# files are written in the listed order.
# NOTE(review): the preprocessed feature arrays appear to be written with a
# fresh default index while the label splits keep their original index --
# confirm that downstream readers align rows by position.
csv_outputs = [
    ("../data/X_train.csv", X_train_pre),
    ("../data/X_val.csv", X_val_pre),
    ("../data/X_test.csv", X_test_pre),
    ("../data/y_train.csv", y_train),
    ("../data/y_val.csv", y_val),
    ("../data/y_test.csv", y_test),
]
for csv_path, values in csv_outputs:
    pd.DataFrame(values).to_csv(path_or_buf=csv_path)