## Data preparation and EDA

In this section I clean and prepare the dataset for the model, which involves the following steps:

- Download the data from the given link.
- Reformat the categorical columns (status, home, marital, records, and job) by mapping their numeric codes to appropriate labels.
- Replace the maximum value of the income, assets, and debt columns with NaNs.
- Replace the NaNs in the dataframe with 0 (will be shown in the next lesson).
- Keep only those rows whose status value is either ok or default.
- Split the data in a two-step process, which finally leads to a distribution of 60% train, 20% validation, and 20% test sets, with the random seed set to 11.
- Prepare the target variable status by converting it from categorical to binary, where 0 represents ok and 1 represents default.
- Finally, delete the target variable from the train/val/test dataframes.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [None]:
# Load the credit-risk CSV into a DataFrame.
# NOTE(review): this is an absolute path on the author's machine — adjust it
# to wherever the file lives locally.
data=pd.read_csv("/Users/victoroshimua/Machine-learning-zoomcamp-/DATA/credit_risk.csv")

In [None]:
data.head()

In [None]:
data.columns

In [None]:
data.columns=data.columns.str.lower()

In [None]:
data.status.value_counts()

In [None]:
# Decode the numeric status codes into readable labels.
status_values = {1: "ok", 2: "default", 0: "unk"}
data["status"] = data["status"].map(status_values)
data.head()

In [None]:
# Lookup tables that translate each categorical column's integer codes
# into labels; code 0 is labelled 'unk' in every table.
category_labels = {
    'home': {
        1: 'rent',
        2: 'owner',
        3: 'private',
        4: 'ignore',
        5: 'parents',
        6: 'other',
        0: 'unk',
    },
    'marital': {
        1: 'single',
        2: 'married',
        3: 'widow',
        4: 'separated',
        5: 'divorced',
        0: 'unk',
    },
    'records': {
        1: 'no',
        2: 'yes',
        0: 'unk',
    },
    'job': {
        1: 'fixed',
        2: 'partime',
        3: 'freelance',
        4: 'others',
        0: 'unk',
    },
}

# Apply the tables column by column, in the same order as they are declared.
for column, labels in category_labels.items():
    data[column] = data[column].map(labels)


In [None]:
data.head()

In [None]:
data.describe().round()

In [None]:
# According to the data description, 99999999 is the placeholder used for a
# missing value in these columns. Swap it for NaN so it shows up as missing.
MISSING_SENTINEL = 99999999
for column in ("income", "assets", "debt"):
    data[column] = data[column].replace(to_replace=MISSING_SENTINEL, value=np.nan)


In [None]:
data.describe().round()

In [None]:
data.isnull().sum()

In [None]:
data.status.value_counts()

In [None]:
data=data[data.status != "unk"].reset_index(drop=True)

In [None]:
data.status.value_counts()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
data["status"]=(data["status"]=="default").astype(int)
data["status"]

In [None]:
# Two-step split: hold out 20% for test, then take 25% of the remaining 80%
# (20% of the whole) for validation, which leaves 60% for training.
SPLIT_SEED = 11
data_full_train, data_test = train_test_split(
    data, test_size=0.2, random_state=SPLIT_SEED
)
data_train, data_val = train_test_split(
    data_full_train, test_size=0.25, random_state=SPLIT_SEED
)
len(data_train), len(data_val), len(data_test)

In [None]:
# Renumber the rows of every split from 0, discarding the old row labels.
data_train, data_test, data_val = (
    frame.reset_index(drop=True)
    for frame in (data_train, data_test, data_val)
)

In [None]:
data_train

In [None]:
# Pull the binary target out of each split as a NumPy array.
Y_train, Y_test, Y_val = (
    frame["status"].values
    for frame in (data_train, data_test, data_val)
)

In [None]:
len(Y_test),len(Y_val)

In [None]:
# Remove the target column from every feature frame so it cannot leak
# into the model inputs.
for frame in (data_train, data_test, data_val):
    del frame["status"]

In [None]:
data_train

## Decision trees

In [None]:
def assess_risk(client):
    """Classify a client with a hand-written decision tree (no scikit-learn).

    Parameters
    ----------
    client : mapping
        Must provide the keys 'records', 'job' and 'assets'.

    Returns
    -------
    str
        'default' or 'ok'.
    """
    if client['records'] == 'yes':
        # The job mapping in this notebook encodes part-time work as
        # 'partime'; the conventional spelling is accepted as well so the
        # branch is actually reachable with either encoding.
        if client['job'] in ('partime', 'parttime'):
            return 'default'
        return 'ok'

    # No previous records: decide on assets alone. A NaN amount compares as
    # False here and therefore falls through to 'default'.
    if client['assets'] > 6000:
        return 'ok'
    return 'default'

In [None]:
dt=data_train.iloc[0].to_dict()
assess_risk(dt)

In [None]:
# Score every training client. Iterating over the frame itself (instead of a
# hard-coded row count) keeps this correct whatever the size of the split.
for position, client in enumerate(data_train.to_dict(orient="records"), start=1):
    risk_level = assess_risk(client)
    print("Risk level for client", position, ":", risk_level)


In [None]:
#decision tree with scikit learn
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

In [None]:
# Missing values become 0 before one-hot encoding. The vectorizer is fitted
# on the training rows and produces the dense training matrix in one step.
train_dicts = data_train.fillna(0).to_dict(orient="records")
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)


In [None]:

# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
list(
    dv.get_feature_names_out()
    if hasattr(dv, "get_feature_names_out")
    else dv.get_feature_names()
)

In [None]:
dt=DecisionTreeClassifier()
dt.fit(X_train,Y_train)

In [None]:
val_dicts=data_val.fillna(0).to_dict(orient="records")
X_val=dv.transform(val_dicts)

In [None]:
y_pred=dt.predict_proba(X_val)[:,1]
y_pred

In [None]:
roc_auc_score(Y_val,y_pred)
#low score here 

In [None]:
## checking for roc_auc_score of the train data
pred_1= dt.predict_proba(X_train)[:,1]
roc_auc_score(Y_train,pred_1)

## The model learned the training data well but failed to predict well on unseen data: this is OVERFITTING

In [None]:
## To reduce overfitting in a decision tree, I have to limit the depth of the tree (max_depth), or even train a decision stump (a tree of depth 1)

In [None]:
dtm=DecisionTreeClassifier(max_depth=3)
dtm.fit(X_train,Y_train)
y_pred=dtm.predict_proba(X_val)[:,1]
y_pred

In [None]:
roc_auc_score(Y_val,y_pred)

In [None]:
pred_1= dtm.predict_proba(X_train)[:,1]
roc_auc_score(Y_train,pred_1)

In [None]:
### Here the model is better and no longer overfits after limiting the depth of the tree

In [None]:
# Decision tree with many layers (no depth limit).
# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
feature_names = list(
    dv.get_feature_names_out()
    if hasattr(dv, "get_feature_names_out")
    else dv.get_feature_names()
)
print(export_text(dt, feature_names=feature_names))

In [None]:
# Decision tree with only three layers.
# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
feature_names = list(
    dv.get_feature_names_out()
    if hasattr(dv, "get_feature_names_out")
    else dv.get_feature_names()
)
print(export_text(dtm, feature_names=feature_names))

### Decision trees parameter tuning

* selecting max_depth = the maximum number of levels (rules applied in sequence) in the tree
* selecting min_samples_leaf = the minimum number of samples required in each leaf node
###### These are the two most important parameters for a decision tree

###### Selecting the max_depth that gives the highest AUC

In [None]:
# Try a range of max_depth values (None means the depth is not limited) and
# report the validation AUC obtained with each one.
depth_candidates = [1, 2, 3, 4, 5, 6, 7, 10, 15, 20, None, 100]

for d in depth_candidates:
    dtc = DecisionTreeClassifier(max_depth=d)
    dtc.fit(X_train, Y_train)

    y_pred = dtc.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(Y_val, y_pred)

    print(f"For Max_dept {d}, auc_score = {auc}")


In [None]:
## From here I can see that the best value for max_depth is between 4 and 6, with 5 giving the highest score

###### Selecting the min_samples_leaf and max_depth that give the highest AUC

In [None]:
# Grid over the promising depths and a range of leaf sizes; each entry of
# `scores` is a (max_depth, min_samples_leaf, validation AUC) tuple.
depth_grid = (4, 5, 6)
leaf_grid = (1, 5, 10, 15, 20, 500, 100, 200)

scores = []
for depth in depth_grid:
    for s in leaf_grid:
        dt = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=s)
        dt.fit(X_train, Y_train)

        y_pred = dt.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(Y_val, y_pred)

        scores.append((depth, s, auc))

In [None]:
columns = ['max_depth', 'min_samples_leaf', 'auc']
df_scores = pd.DataFrame(scores, columns=columns)
df_scores

In [None]:
### to know the parameter with the higest accuracy using pivot tables
df_scores_pivot = df_scores.pivot(index='min_samples_leaf', columns=['max_depth'], values=['auc'])
df_scores_pivot.round(3)

In [None]:
import seaborn as sns
sns.heatmap(df_scores_pivot, annot=True, fmt=".3f")

In [None]:
### From the pivot table it can be seen that the parameters with the highest roc_auc score are max_depth=6 and min_samples_leaf=15

In [None]:
# Train the tree with the best parameters found above and print its rules.
dt = DecisionTreeClassifier(max_depth=6, min_samples_leaf=15)
dt.fit(X_train, Y_train)
# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
feature_names = list(
    dv.get_feature_names_out()
    if hasattr(dv, "get_feature_names_out")
    else dv.get_feature_names()
)
print(export_text(dt, feature_names=feature_names))

### Ensemble and random forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
rf=RandomForestClassifier(n_estimators=10)

In [None]:
rf.fit(X_train,Y_train)

In [None]:
y_pred=rf.predict_proba(X_val)[:,1]

In [None]:
score=roc_auc_score(Y_val,y_pred)

In [None]:
score

#### Random forest parameter tuning
* max_depth	
* n_estimators
* min_samples_leaf

In [None]:
# Try different numbers of estimators (10, 20, ..., 200) and record the
# validation AUC of each forest as an (n_estimators, auc) tuple.
tree_counts = range(10, 201, 10)

scores = []
for n in tree_counts:
    rf = RandomForestClassifier(n_estimators=n, random_state=1)
    rf.fit(X_train, Y_train)

    y_pred = rf.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(Y_val, y_pred)

    scores.append((n, auc))

In [None]:
# Rebuild the frame from the random-forest sweep before previewing it;
# otherwise df_scores still holds the decision-tree tuning results.
df_scores = pd.DataFrame(scores, columns=['n_estimators', 'auc'])
df_scores.head()

In [None]:
df_scores = pd.DataFrame(scores, columns=['n_estimators', 'auc'])
plt.plot(df_scores.n_estimators, df_scores.auc)
plt.xlabel("number of trees ")
plt.ylabel("auc")
plt.show()


In [None]:
# Sweep max_depth together with the number of trees; each entry of `scores`
# is a (max_depth, n_estimators, validation AUC) tuple.
depth_grid = (5, 10, 15)
tree_counts = range(10, 201, 10)

scores = []
for d in depth_grid:
    for n in tree_counts:
        rf = RandomForestClassifier(
            n_estimators=n,
            max_depth=d,
            random_state=1,
        )
        rf.fit(X_train, Y_train)

        y_pred = rf.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(Y_val, y_pred)

        scores.append((d, n, auc))

# Collect the results in a frame for inspection and plotting.
columns = ['max_depth', 'n_estimators', 'auc']
df_scores = pd.DataFrame(scores, columns=columns)


In [None]:
columns = ['max_depth', 'n_estimators', 'auc']
df_scores = pd.DataFrame(scores, columns=columns)
df_scores

In [None]:
# One AUC-vs-trees curve per max_depth value.
for d in (5, 10, 15):
    depth_mask = df_scores.max_depth == d
    df_subset = df_scores[depth_mask]

    plt.plot(
        df_subset.n_estimators,
        df_subset.auc,
        label=f'max_depth={d}',
    )

plt.legend()

In [None]:
### best max dept
max_depth=10

In [None]:
# With max_depth fixed, sweep min_samples_leaf together with the number of
# trees; each entry is a (min_samples_leaf, n_estimators, AUC) tuple.
leaf_grid = (1, 3, 5, 10, 50)
tree_counts = range(10, 201, 10)

scores = []
for s in leaf_grid:
    for n in tree_counts:
        rf = RandomForestClassifier(
            n_estimators=n,
            max_depth=max_depth,
            min_samples_leaf=s,
            random_state=1,
        )
        rf.fit(X_train, Y_train)

        y_pred = rf.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(Y_val, y_pred)

        scores.append((s, n, auc))


In [None]:
columns = ['min_samples_leaf', 'n_estimators', 'auc']
df_scores = pd.DataFrame(scores, columns=columns)
df_scores.head(1)

In [None]:
# One AUC-vs-trees curve per min_samples_leaf value, each in its own colour.
colors = ['black', 'blue', 'orange', 'red', 'grey']
values = [1, 3, 5, 10, 50]

for s, col in zip(values, colors):
    leaf_mask = df_scores.min_samples_leaf == s
    df_subset = df_scores[leaf_mask]

    plt.plot(
        df_subset.n_estimators,
        df_subset.auc,
        color=col,
        label=f'min_samples_leaf={s}',
    )

plt.legend()

In [None]:
### best min_samples_leaf
min_samples_leaf = 3


In [None]:
# Train the forest with the best parameters selected above. The stray
# estimator repr that had been pasted below the fit call is removed: it only
# built a second, unused forest.
rf = RandomForestClassifier(n_estimators=200,
                            max_depth=max_depth,
                            min_samples_leaf=min_samples_leaf,
                            random_state=1)
rf.fit(X_train, Y_train)

Other useful parameters:
* max_features
* bootstrap


### Gradient boosting and Xgboost

In [None]:
!pip install xgboost

In [None]:
import xgboost as xgb

In [None]:
# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
# The result is converted to a plain list for DMatrix's feature_names.
features = list(
    dv.get_feature_names_out()
    if hasattr(dv, "get_feature_names_out")
    else dv.get_feature_names()
)
d_train = xgb.DMatrix(X_train, label=Y_train, feature_names=features)

In [None]:
d_val=xgb.DMatrix(X_val,label=Y_val,feature_names=features)

In [None]:
# to know more about the parameters check the xgboost documentation 
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'binary:logistic',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, d_train, num_boost_round=10)


In [None]:
y_pred=model.predict(d_val)

In [None]:
roc_auc_score(Y_val,y_pred)

In [None]:
%%capture output

In [None]:
# Train while monitoring AUC on both the training and the validation set.
watchlist = [(d_train, 'train'), (d_val, 'val')]

xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

# Collect the per-round metrics through evals_result instead of reading the
# captured stdout: this cell does not start with the %%capture magic, so no
# `output` object exists here.
evals_result = {}
model = xgb.train(xgb_params, d_train, num_boost_round=200,
                  verbose_eval=5,
                  evals=watchlist,
                  evals_result=evals_result)

# Rebuild a log-like text (one line every 5 rounds, mirroring verbose_eval=5)
# from the recorded metrics.
s = "\n".join(
    f"[{i}]\ttrain-auc:{train_auc:.5f}\tval-auc:{val_auc:.5f}"
    for i, (train_auc, val_auc) in enumerate(
        zip(evals_result['train']['auc'], evals_result['val']['auc'])
    )
    if i % 5 == 0
)
print(s[:200])

###### Could not continue because my Jupyter notebook fails to run the capture magic function; I will come back to it later (note: `%%capture` must be the first line of the same cell whose output it captures)

### Selecting the final model
* After all the training I will select between the decision tree, random forest, and gradient boosting models based on their ROC AUC scores after tuning each one with its best parameters
* Then train the final model on the entire training dataset (data_full_train)
* Save the model


In [None]:
# best decision tree
dt = DecisionTreeClassifier(max_depth=6, min_samples_leaf=15)
dt.fit(X_train, Y_train)

In [None]:
y_pred=dt.predict_proba(X_val)[:,1]
roc_auc_score(Y_val,y_pred)

In [None]:
## best random forest
rf = RandomForestClassifier(n_estimators=200,
                            max_depth=10,
                            min_samples_leaf=3,
                            random_state=1)
rf.fit(X_train, Y_train)

In [None]:
y_pred=rf.predict_proba(X_val)[:,1]
roc_auc_score(Y_val,y_pred)

In [None]:
## best Xgboost model
xgb_params = {
    'eta': 0.1, 
    'max_depth': 3,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, d_train, num_boost_round=175)


In [None]:
y_pred=model.predict(d_val)
roc_auc_score(Y_val,y_pred)

### From here I can see that the best-performing model is the XGBoost model

## Next I will train the XGBoost model (the best model) on the full training dataset (data_full_train)

In [None]:
data_full_train

In [None]:
# Reset the index and separate the target from the features.
data_full_train = data_full_train.reset_index(drop=True)
# status was already converted to 0/1 before the split, so it is used
# directly. Comparing it with the string 'default' would be False for every
# row and the final model would be trained on all-zero labels.
y_full_train = data_full_train.status.astype(int).values
del data_full_train['status']

In [None]:
dicts_full_train = data_full_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(dicts_full_train)
## feature matrix(one hot encoding )

In [None]:
dicts_test = data_test.to_dict(orient='records')
X_test = dv.transform(dicts_test)

In [None]:
# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
full_train_features = list(
    dv.get_feature_names_out()
    if hasattr(dv, "get_feature_names_out")
    else dv.get_feature_names()
)
dfulltrain = xgb.DMatrix(X_full_train, label=y_full_train,
                    feature_names=full_train_features)

In [None]:
# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
test_features = list(
    dv.get_feature_names_out()
    if hasattr(dv, "get_feature_names_out")
    else dv.get_feature_names()
)
dtest = xgb.DMatrix(X_test, feature_names=test_features)


In [None]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 3,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}



In [None]:
model = xgb.train(xgb_params, dfulltrain, num_boost_round=175)


In [None]:
yy_pred = model.predict(dtest)
roc_auc_score(Y_test, yy_pred)