# Risk Scoring Model

This notebook shows the construction of a machine learning model that aims to assess the risk associated with a credit request.

## Python Libraries

In [32]:
# Data managing
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler

# Evaluation
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

# Optimization
from sklearn.model_selection import GridSearchCV

# ML Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB

In [2]:
%matplotlib inline

## Data Read

In [3]:
# Read the labelled German credit data; the first CSV column is used as the row index.
csv_path = '../Data/german_credit_data_labels.csv'
df = pd.read_csv(csv_path, index_col=0)
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [4]:
# Dataset dimensions as (rows, columns).
df.shape

(1000, 10)

In [5]:
# True if any row is missing its target label.
df['Risk'].isna().any()

False

In [6]:
# Class balance of the target column.
df['Risk'].value_counts()

good    700
bad     300
Name: Risk, dtype: int64

## Data Analysis

In [7]:
# Distinct categories present in the Sex column.
df['Sex'].unique()

array(['male', 'female'], dtype=object)

In [8]:
# Distinct categories present in the Housing column.
df['Housing'].unique()

array(['own', 'free', 'rent'], dtype=object)

In [9]:
# Distinct categories present in the Purpose column.
df['Purpose'].unique()

array(['radio/TV', 'education', 'furniture/equipment', 'car', 'business',
       'domestic appliances', 'repairs', 'vacation/others'], dtype=object)

In [10]:
# Frequency of each savings level. Missing entries are not counted here,
# which is why the totals add up to fewer than the 1000 rows in the dataset.
df['Saving accounts'].value_counts()

little        603
moderate      103
quite rich     63
rich           48
Name: Saving accounts, dtype: int64

In [11]:
# Frequency of each checking-account level. Missing entries are not counted here,
# which is why the totals add up to fewer than the 1000 rows in the dataset.
df['Checking account'].value_counts()

little      274
moderate    269
rich         63
Name: Checking account, dtype: int64

## Data Cleaning

In [12]:
# Handle null values and encode the categorical variables.

In [13]:
# List the columns stored with the generic `object` dtype (the categorical text columns).
# The builtin `object` is used for the comparison because the `np.object` alias
# was deprecated in NumPy 1.20 and removed in NumPy 1.24.
df.dtypes[df.dtypes == object]

Sex                 object
Housing             object
Saving accounts     object
Checking account    object
Purpose             object
Risk                object
dtype: object

In [14]:
# Per-column flag: True when the column contains at least one missing value.
df.isnull().any()

Age                 False
Sex                 False
Job                 False
Housing             False
Saving accounts      True
Checking account     True
Credit amount       False
Duration            False
Purpose             False
Risk                False
dtype: bool

In [15]:
# Count the missing values per column and keep only the columns that have any.
null_counts = df.isnull().sum()
null_counts[null_counts > 0]

Saving accounts     183
Checking account    394
dtype: int64

In [16]:
# Flag the columns that are both object-typed and contain missing values,
# i.e. the categorical columns that need an explicit fill value.
# The builtin `object` replaces the `np.object` alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
(df.dtypes == object) & (df.isnull().any())

Age                 False
Sex                 False
Job                 False
Housing             False
Saving accounts      True
Checking account     True
Credit amount       False
Duration            False
Purpose             False
Risk                False
dtype: bool

In [17]:
# Turn missing account information into an explicit 'null' category so that it
# is one-hot encoded like any other level further down.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the selected column: that form is a chained assignment, which triggers a
# warning in recent pandas and leaves `df` unchanged under Copy-on-Write.
for account_col in ['Saving accounts', 'Checking account']:
    df[account_col] = df[account_col].fillna('null')

In [18]:
# Encode the target as an integer: good -> 0, bad -> 1.
risk_codes = {'good': 0, 'bad': 1}
for risk_label, risk_code in risk_codes.items():
    is_label = df['Risk'] == risk_label
    df.loc[is_label, 'Risk'] = risk_code
# The column still has object dtype after the replacements; cast it to int.
df['Risk'] = df['Risk'].astype(int)

In [19]:
# Encode Sex as a binary integer: male -> 0, female -> 1.
for sex_label, sex_code in {'male': 0, 'female': 1}.items():
    df.loc[df['Sex'] == sex_label, 'Sex'] = sex_code
# Cast the Sex column itself. The previous version cast 'Risk' here (copy-paste
# slip), which left 'Sex' with object dtype and therefore out of the numeric
# correlation analysis.
df['Sex'] = df['Sex'].astype(int)

In [20]:
# One-hot encode the categorical columns. Each set of dummy columns is merged
# on the row index and placed in front of the existing columns; the original
# columns are kept in the frame.
dummy_specs = [
    ('Job', 'Job'),
    ('Housing', 'Housing'),
    ('Purpose', 'Purpose'),
    ('Saving accounts', 'Savings'),
    ('Checking account', 'Checking'),
]
for source_col, dummy_prefix in dummy_specs:
    dummies = pd.get_dummies(df[source_col], prefix=dummy_prefix)
    df = dummies.merge(df, left_index=True, right_index=True)
df.head()

Unnamed: 0,Checking_little,Checking_moderate,Checking_null,Checking_rich,Savings_little,Savings_moderate,Savings_null,Savings_quite rich,Savings_rich,Purpose_business,...,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,1,0,0,0,0,0,1,0,0,0,...,67,0,2,own,,little,1169,6,radio/TV,0
1,0,1,0,0,1,0,0,0,0,0,...,22,1,2,own,little,moderate,5951,48,radio/TV,1
2,0,0,1,0,1,0,0,0,0,0,...,49,0,1,own,little,,2096,12,education,0
3,1,0,0,0,1,0,0,0,0,0,...,45,0,2,free,little,little,7882,42,furniture/equipment,0
4,1,0,0,0,1,0,0,0,0,0,...,53,0,2,free,little,little,4870,24,car,1


## Feature Selection based on Correlation

In [21]:
# Absolute correlation of every numeric/boolean column with the target.
# The text columns that are still in the frame (Housing, Purpose, ...) are
# dropped explicitly first: pandas >= 2.0 no longer discards non-numeric columns
# inside corr() and raises on them instead.
numeric_df = df.select_dtypes(include=['number', 'bool'])
numeric_df.corr().abs().filter(['Risk']).style.background_gradient()

Unnamed: 0,Risk
Checking_little,0.258333
Checking_moderate,0.119581
Checking_null,0.322436
Checking_rich,0.0440095
Savings_little,0.161007
Savings_moderate,0.0222555
Savings_null,0.129238
Savings_quite rich,0.0709541
Savings_rich,0.0857493
Purpose_business,0.0361291


In [23]:
# Columns used as model inputs: a subset of the dummy variables plus the two
# numeric loan attributes.
features = [
    'Checking_little',
    'Checking_null',
    'Savings_little',
    'Savings_null',
    'Purpose_radio/TV',
    'Housing_own',
    'Credit amount',
    'Duration',
]
# Plain NumPy arrays for scikit-learn: X is the feature matrix, y the encoded target.
y = df['Risk'].values
X = df[features].values

## Evaluation

In [47]:
# Create the model and the hyperparameter ranges to optimize.
less_than_1 = [.0001,.001,.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
more_than_1 = list(np.arange(1,11))

# --- Active candidate: logistic regression -----------------------------------
# solver='liblinear' is set explicitly because the grid below searches both the
# 'l1' and 'l2' penalties. liblinear supports both, whereas the 'lbfgs' default
# of scikit-learn >= 0.22 rejects 'l1'.
# n_jobs is not passed: liblinear ignores it and only emits a warning.
# random_state fixes the seed liblinear uses when shuffling the data, so
# repeated runs give the same scores.
classifier = LogisticRegression(solver='liblinear', random_state=0)
# less_than_1 already ends at 1.0, so the integer part of the C grid starts at 2
# to avoid fitting the C=1.0 candidates twice.
params = {
    'clf__penalty': ['l1', 'l2'],
    'clf__tol': less_than_1,
    'clf__C': np.append(less_than_1, np.arange(2, 16)),
}

# --- Alternative classifier / grid pairs kept for reference ------------------
# Grid keys need the 'clf__' prefix because the classifier is the 'clf' step of
# the Pipeline defined below.
#classifier = MultinomialNB()
#params = {'clf__alpha': less_than_1}
#classifier = DecisionTreeClassifier()
#params = {'clf__max_depth': np.append(np.arange(1, 21), None), 'clf__criterion': ['gini', 'entropy'],
#          'clf__splitter': ['best', 'random'], 'clf__class_weight': ['balanced', None],
#          'clf__max_features': ['auto', 'log2', None]}
#classifier = LinearSVC(dual=False)
#classifier = KNeighborsClassifier()
#params = {'clf__n_neighbors': np.arange(1,22), 'clf__weights': ['uniform', 'distance'],
#          'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
#classifier = RandomForestClassifier()
#params = {'clf__n_estimators': np.arange(50, 1001, 50), 'clf__max_depth': np.append(np.arange(1, 11), None)}
#classifier = MLPClassifier(hidden_layer_sizes=(50,50))
#params = {'clf__activation': ['logistic', 'tanh', 'relu'], 'clf__alpha': less_than_1}
#classifier = BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
#                                                                     max_depth=9, max_features=None,
#                                                                     splitter='best'), n_jobs=8)
#params = {'clf__n_estimators': np.arange(50, 1001, 50), 'clf__max_samples': [0.5, 0.8, 1.0],
#          'clf__max_features': [0.5, 0.8, 1.0]}
#params = {'clf__bootstrap': [True, False], 'clf__bootstrap_features': [True, False]}
#classifier = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced', criterion='gini',
#                                                                     max_depth=10, max_features=None,
#                                                                     splitter='best'))
#params = {'clf__n_estimators': np.arange(50, 1001, 50), 'clf__learning_rate': less_than_1}
#classifier = GradientBoostingClassifier(max_depth=9)

# Scale every feature by its maximum absolute value, then fit the classifier.
model = Pipeline([
    ('scaler', MaxAbsScaler()),
    ('clf', classifier)
])

In [42]:
# Baseline: 5-fold cross-validation of the pipeline, reporting the mean and the
# standard deviation of each metric across folds.
scoring_metrics = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']
scores = cross_validate(model, X, y, cv=5, scoring=scoring_metrics, n_jobs=-1)

accuracy_score, precision_score, recall_score, f1_score = (
    scores['test_' + metric] for metric in scoring_metrics
)

report_rows = [
    ("Accuracy score: {0:.3f} (+/- {1:.3f})", accuracy_score),
    ("Precision score: {0:.3f} (+/- {1:.3f})", precision_score),
    ("Recall score: {0:.3f} (+/- {1:.3f})", recall_score),
    ("F1 score: {0:.3f} (+/- {1:.3f})", f1_score),
]
for template, fold_scores in report_rows:
    print(template.format(fold_scores.mean(), fold_scores.std()))

Accuracy score: 0.697 (+/- 0.013)
Precision score: 0.684 (+/- 0.011)
Recall score: 0.697 (+/- 0.013)
F1 score: 0.687 (+/- 0.010)


## Optimization

In [48]:
# Exhaustive search over `params`, ranking candidates by weighted F1 on 3 folds.
# `iid` is no longer passed: the argument was removed in scikit-learn 0.24
# (passing it raises a TypeError), and since 0.22 the default behaviour is the
# same as the former iid=False.
gs = GridSearchCV(model, params, cv=3, scoring='f1_weighted', n_jobs=-1)
gs.fit(X, y)

# Summarize the results of the grid search.
print("Best score: ", gs.best_score_)
print("Best params: ", gs.best_params_)

# Set the best params on the pipeline so the evaluation below uses them.
model.set_params(**gs.best_params_)

# Cross-validate the tuned pipeline.
# NOTE: this uses the same X, y the grid search was tuned on, so these scores
# are an optimistic estimate of performance on unseen data.
scores = cross_validate(model, X, y, cv=5,
                        scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
                        n_jobs=-1)
accuracy_score = scores['test_accuracy']
precision_score = scores['test_precision_weighted']
recall_score = scores['test_recall_weighted']
f1_score = scores['test_f1_weighted']

# Mean and standard deviation of each metric across the 5 folds.
print("Accuracy score: {0:.3f} (+/- {1:.3f})".format(accuracy_score.mean(), accuracy_score.std()))
print("Precision score: {0:.3f} (+/- {1:.3f})".format(precision_score.mean(), precision_score.std()))
print("Recall score: {0:.3f} (+/- {1:.3f})".format(recall_score.mean(), recall_score.std()))
print("F1 score: {0:.3f} (+/- {1:.3f})".format(f1_score.mean(), f1_score.std()))

Best score:  0.7186917650851026
Best params:  {'clf__C': 9.0, 'clf__penalty': 'l1', 'clf__tol': 0.1}
Accuracy score: 0.728 (+/- 0.025)
Precision score: 0.706 (+/- 0.033)
Recall score: 0.728 (+/- 0.025)
F1 score: 0.704 (+/- 0.031)


  " = {}.".format(effective_n_jobs(self.n_jobs)))
