## Machine Learning Model Building Pipeline: Wrapping up for Deployment

In [72]:
# to handle datasets
import pandas as pd
import numpy as np

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import MinMaxScaler

# to build the models
from sklearn.ensemble import RandomForestClassifier

# to evaluate the models
from sklearn.metrics import roc_auc_score

# to persist the model and the scaler
from sklearn.externals import joblib

# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

# to ignore warnings
import warnings
warnings.simplefilter('ignore')

In [4]:
# load dataset
data = pd.read_csv(r'C:\Users\dasantos\MyProjects\Credit_Scoring\packages\model\datasets\loans_data.csv')
print(data.shape)
data.head()

(27917, 32)


Unnamed: 0,ID,Status,Amount,ApplicationSignedHour,ApplicationSignedWeekday,City,Country,CreditScoreEsEquifaxRisk,DateOfBirth,DebtToIncome,Education,EmploymentDurationCurrentEmployer,EmploymentPosition,EmploymentStatus,ExistingLiabilities,Gender,HomeOwnershipType,IncomeFromPrincipalEmployer,IncomeTotal,Interest rate (APR),LoanDate,LoanDuration,MaritalStatus,NewCreditCustomer,NoOfPreviousLoansBeforeLoan,OccupationArea,UseOfLoan,VerificationType,WorkExperience,PreviousScore,Defaulted,DefaultDate
0,0,Current,5000.0,12,4,AESPA,EE,,21/11/1975,51.52,4.0,UpTo5Years,Worker,3.0,9,0.0,6.0,1000.0,1000.0,24.52,05/05/2015,60,4.0,False,1,8.0,0,1.0,15To25Years,0.0957,False,
1,2,Repaid,530.0,14,2,PÄRNU,EE,,25/10/1969,30.32,4.0,MoreThan5Years,SpecialistOfficeWorker,3.0,7,1.0,9.0,633.0,633.0,25.68,19/10/2015,60,1.0,True,0,1.0,2,4.0,15To25Years,0.1034,False,
2,3,Current,5500.0,22,4,TALLINN,EE,,22/11/1965,29.57,4.0,MoreThan5Years,Owner,5.0,1,0.0,1.0,550.0,550.0,21.62,02/09/2015,60,4.0,True,0,13.0,2,3.0,MoreThan25Years,0.0772,False,
3,4,Repaid,6900.0,15,3,KEHTNA,EE,,05/12/1983,45.75,2.0,MoreThan5Years,Worker,3.0,10,0.0,1.0,833.0,833.0,21.63,20/05/2015,60,3.0,False,1,19.0,7,4.0,5To10Years,0.0773,False,
4,5,Current,2655.0,10,3,KIVIÕLI,EE,,13/07/1980,25.4,4.0,MoreThan5Years,Worker,3.0,6,1.0,4.0,341.0,341.0,26.88,29/12/2015,60,3.0,True,0,17.0,2,4.0,5To10Years,0.1898,False,


## Separate dataset into train and test

Before beginning to engineer our features, it is important to separate our data intro training and testing set. This is to avoid over-fitting. There is an element of randomness in dividing the dataset, so remember to set the seed.

In [25]:
X_train, X_test, y_train, y_test = train_test_split(data, data.Defaulted, 
                                                   test_size = 0.2,
                                                   random_state = 0)
X_train.shape, X_test.shape

((22333, 32), (5584, 32))

## Selected features

In [15]:
# load selected features
features = pd.read_csv(r'C:\Users\dasantos\MyProjects\Credit_Scoring\packages\model\datasets\selected_features.csv', 
                       header=None)
features = [x for x in features[0]]
print('Number of features: ', len(features))

Number of features:  17


### Missing values

In [26]:
# make a list of the categorical variables that contain missing values
vars_with_na = [var for var in features if X_train[var].isnull().sum()>1 and X_train[var].dtypes=='O']

# print the variable name and the percentage of missing values
for var in vars_with_na:
    print(var, np.round(X_train[var].isnull().mean(), 3),  ' % missing values')

City 0.007  % missing values
EmploymentPosition 0.014  % missing values


#### Categorical Values

In [23]:
cat_with_na = [var for var in X_train.columns if X_train[var].isnull().sum()>1 and X_train[var].dtypes=='O']

In [27]:
mode_var_dict = {}
# replace the missing values
for var in cat_with_na:
    
    # calculate the mode
    mode_val = X_train[var].mode()[0]
    
    # we persist the mean in the dictionary
    mode_var_dict[var] = mode_val
    
    # train
    X_train[var+'_na'] = np.where(X_train[var].isnull(), 1, 0)
    X_train[var].fillna(mode_val, inplace=True)
    
    # test
    X_test[var+'_na'] = np.where(X_test[var].isnull(), 1, 0)
    X_test[var].fillna(mode_val, inplace=True)
    
# we save the dictionary for later
np.save('mode_var_dict.npy', mode_var_dict)
    
# check that we have no more missing values in the engineered variables
X_train[cat_with_na].isnull().sum()

City                                 0
CreditScoreEsEquifaxRisk             0
EmploymentDurationCurrentEmployer    0
EmploymentPosition                   0
WorkExperience                       0
DefaultDate                          0
dtype: int64

#### Numerical Variables

In [28]:
num_with_na = [var for var in X_train.columns if X_train[var].isnull().sum()>1 and X_train[var].dtypes!='O']

for var in num_with_na:
    print(var, np.round(X_train[var].isnull().mean(), 3),  ' missing values')

DebtToIncome 0.002  missing values
Education 0.002  missing values
EmploymentStatus 0.006  missing values
Gender 0.002  missing values
HomeOwnershipType 0.048  missing values
MaritalStatus 0.002  missing values
OccupationArea 0.003  missing values
VerificationType 0.002  missing values
PreviousScore 0.079  missing values


In [29]:
mean_var_dict = {}

# replace the missing values
for var in num_with_na:
    
    # calculate the mode
    mean_val = X_train[var].mean()
    
    # we persist the mean in the dictionary
    mean_var_dict[var] = mode_val
    
    # train
    X_train[var+'_na'] = np.where(X_train[var].isnull(), 1, 0)
    X_train[var].fillna(mean_val, inplace=True)
    
    # test
    X_test[var+'_na'] = np.where(X_test[var].isnull(), 1, 0)
    X_test[var].fillna(mean_val, inplace=True)
    
# we save the dictionary for later
np.save('mean_var_dict.npy', mean_var_dict)
    
# check that we have no more missing values in the engineered variables
X_train[num_with_na].isnull().sum()

DebtToIncome         0
Education            0
EmploymentStatus     0
Gender               0
HomeOwnershipType    0
MaritalStatus        0
OccupationArea       0
VerificationType     0
PreviousScore        0
dtype: int64

### Temporal Variables

In [38]:
temp_variables = ['DateOfBirth', 'LoanDate']

def get_age_month(df, vars_list):
    df = df.copy()
    
    for var in vars_list:
        
        if var == 'DateOfBirth':
            df[var] = pd.to_datetime(df[var])
            df[var] = pd.DatetimeIndex(df[var]).year
        if var == 'LoanDate':
            df[var] = pd.to_datetime(df[var])
            df[var] = pd.DatetimeIndex(df[var]).month
    
    return df

X_train = get_age_month(X_train, temp_variables)
X_test = get_age_month(X_test, temp_variables)

### Categorical Variables

#### Get top observations for variables with high cardinality

In [43]:
high_cardinality_vars = ['City', 'EmploymentPosition']

# high cardinality
for var in cat_vars:
    if len(X_train[var].unique() ) > 10:
        print(var, len(X_train[var].unique()))

City 3996
EmploymentPosition 2373


In [48]:
def find_top_labels(df, var):
    # finds the top labels in the dataset
    df = df.copy()
    top = df[var].value_counts().head(7).index
    return top

top_labels_dict = {}

for var in cat_vars:
    top_labels = find_top_labels(X_train, var)
    
    # we save the list in a dictionary
    top_labels_dict[var] = top_labels
    
    X_train[var] = np.where(X_train[var].isin(top_labels), X_train[var], 'Other')
    X_test[var] = np.where(X_test[var].isin(top_labels), X_test[var], 'Other')

# now we save the dictionary
np.save('TopLabels.npy', top_labels_dict)

#### Convert strings to numbers

In [61]:
cat_vars = [var for var in X_train.columns if X_train[var].dtypes == 'O']
cat_vars

['Status',
 'Country',
 'CreditScoreEsEquifaxRisk',
 'EmploymentDurationCurrentEmployer',
 'WorkExperience',
 'DefaultDate']

In [62]:
def replace_categories(train, test, var, target):
    ordered_labels = train.groupby([var])[target].mean().sort_values().index
    ordinal_label = {k:i for i, k in enumerate(ordered_labels, 0)} 
    train[var] = train[var].map(ordinal_label)
    test[var] = test[var].map(ordinal_label)

In [63]:
for var in cat_vars:
    replace_categories(X_train, X_test, var, 'Defaulted')

### Feature Scaling

In [65]:
# capture the target
y_train = X_train['Defaulted']
y_test = X_test['Defaulted']

In [66]:
# fit scaler
scaler = MinMaxScaler() # create an instance
scaler.fit(X_train[features]) #  fit  the scaler to the train set for later use

# we persist the model for future use
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [67]:
# transform the train and test set, and add on the Id and SalePrice variables
X_train = pd.DataFrame(scaler.transform(X_train[features]), columns=features)
X_test = pd.DataFrame(scaler.transform(X_test[features]), columns=features)

### Model

In [70]:
# train the model
rf = RandomForestClassifier(n_estimators=10, max_depth = 5, max_leaf_nodes = 45, random_state=0)
rf.fit(X_train, y_train)

# we persist the model for future use
joblib.dump(rf, 'random_forest_classification.pkl')

['random_forest_classification.pkl']

In [75]:
pred_proba = rf.predict_proba(X_train)
print('ROC_AUC_SCORE_TRAIN: ', roc_auc_score(y_train, pred_proba[:,1]))

pred_proba = rf.predict_proba(X_test)
print('ROC_AUC_SCORE_TEST: ', roc_auc_score(y_test, pred_proba[:,1]))

ROC_AUC_SCORE_TRAIN:  0.7867251062844451
ROC_AUC_SCORE_TEST:  0.787146899551947
