## Modeling

### Data Cleaning and Transformation

In [1]:
import pandas as pd
import numpy as np

In [2]:
data_df = pd.read_csv('credit.csv')
# Removing the unnecessary columns
data_df = data_df.drop('Unnamed: 0',axis=1)  
# This column can be used to estimate the  geography and other relevant details.
# But in current form, It will not add much.
data_df.drop('telephone', axis=1, inplace=True)

In [3]:
# Identification of types and missing values of columns
for col in data_df.columns:
    print(f"{col} \t{data_df[col].dtype} \t{data_df[col].isnull().sum()}")
#print(data_df.dtypes)

checking_balance 	float64 	394
months_loan_duration 	int64 	0
credit_history 	object 	0
purpose 	object 	0
amount 	int64 	0
savings_balance 	float64 	183
employment_length 	object 	62
installment_rate 	int64 	0
personal_status 	object 	310
other_debtors 	object 	0
residence_history 	object 	130
property 	object 	0
age 	int64 	0
installment_plan 	object 	0
housing 	object 	0
existing_credits 	int64 	0
default 	int64 	0
dependents 	int64 	0
foreign_worker 	object 	0
job 	object 	0
gender 	object 	0


In [4]:
# will use this method to clean "employment_length" Col of Data frame
def year_to_int ( mystr):
    mystr = str(mystr)
    year_split = mystr.split(" ")
    if (not year_split[0].startswith('nan')):
        years = int(year_split[0])
    else:
        years =0    
    return years 

# will use this method to clean the features of Data frame
# "Residence_history" may be in months. so converting all entries to month can be beneficial.
def month_to_int (mystr):
    mystr = str(mystr)
    if 'nan' in mystr:
        months = 0      #missing is treated as zero.
    elif 'month' in mystr:
        year_split = mystr.split(" ")
        months = int(year_split[0])
    else:
        year_split = mystr.split(" ")
        months = int(year_split[0]) * 12
    return months

In [5]:
data_df.head(5)

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_length,installment_rate,personal_status,other_debtors,...,property,age,installment_plan,housing,existing_credits,default,dependents,foreign_worker,job,gender
0,-43.0,6,critical,radio/tv,1169,,13 years,4,single,none,...,real estate,67,none,own,2,0,1,yes,skilled employee,male
1,75.0,48,repaid,radio/tv,5951,89.0,2 years,2,,none,...,real estate,22,none,own,1,1,1,yes,skilled employee,female
2,,12,critical,education,2096,24.0,5 years,2,single,none,...,real estate,49,none,own,1,0,2,yes,unskilled resident,male
3,-32.0,42,repaid,furniture,7882,9.0,5 years,2,single,guarantor,...,building society savings,45,none,for free,1,0,2,yes,skilled employee,male
4,-23.0,24,delayed,car (new),4870,43.0,3 years,3,single,none,...,unknown/none,53,none,for free,2,1,2,yes,skilled employee,male


In [8]:
# Fill the missing values based on class means of "default" col values
# Missing values for default = mean of 'checking balance' of default
# Missing values for non-default = mean of 'checking balance' of non-default
chk_bal_class_means = data_df.groupby('default')['checking_balance'].mean()
sav_bal_class_means = data_df.groupby('default')['savings_balance'].mean()
data_df['checking_balance'].fillna(value=data_df['default'].map(chk_bal_class_means), inplace=True) 
data_df['savings_balance'].fillna(value=data_df['default'].map(sav_bal_class_means), inplace=True)
# apply aforementioned methods for cleaning
data_df['personal_status'].fillna(value='missing', inplace=True)
data_df['employment_length'] = data_df['employment_length'].apply(year_to_int)
data_df['residence_history'] = data_df['residence_history'].apply(month_to_int)

In [10]:
category_cols = ['credit_history','purpose', 'personal_status','property','other_debtors','property','installment_plan','housing','foreign_worker','job','gender']

In [12]:
from sklearn.preprocessing import LabelEncoder
# Encode category cols to numeric for use in classification
def convert_cols_to_numeric (df, cols):
    for col in cols:
        le= LabelEncoder()
        df[col] = le.fit_transform(df[col])
    return df

In [13]:
# Encode category cols to numeric for use in classification
numeric_df = convert_cols_to_numeric(data_df, category_cols)

In [14]:
numeric_df.head(5)

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_length,installment_rate,personal_status,other_debtors,...,property,age,installment_plan,housing,existing_credits,default,dependents,foreign_worker,job,gender
0,-43.0,6,0,7,1169,986.612022,13,4,3,2,...,2,67,1,1,2,0,1,1,1,1
1,75.0,48,4,7,5951,89.0,2,2,2,2,...,2,22,1,1,1,1,1,1,1,0
2,123.167614,12,0,4,2096,24.0,5,2,3,2,...,2,49,1,1,1,0,2,1,3,1
3,-32.0,42,4,5,7882,9.0,5,2,3,1,...,0,45,1,0,1,0,2,1,1,1
4,-23.0,24,1,1,4870,43.0,3,3,3,2,...,3,53,1,0,2,1,2,1,1,1


In [15]:
# identify the unique values in category cols.
# This info will be used to decide whether to apply z-score normalization or not on the col
uniques = numeric_df[category_cols].apply(lambda x: x.nunique(), axis=0)
print(uniques)

credit_history       5
purpose             10
personal_status      4
property             4
other_debtors        3
property             4
installment_plan     3
housing              3
foreign_worker       2
job                  4
gender               2
dtype: int64


In [17]:
numeric_cols_list = [col for col in numeric_df.columns if col not in category_cols]
print(numeric_cols_list)

['checking_balance', 'months_loan_duration', 'amount', 'savings_balance', 'employment_length', 'installment_rate', 'residence_history', 'age', 'existing_credits', 'default', 'dependents']


In [19]:
# identify the cols which require z-score normalization.
to_remove = ['dependents','default','existing_credits','installment_rate']
scalable_cols = [item for item in numeric_cols_list if item not in to_remove]
print(scalable_cols)

['checking_balance', 'months_loan_duration', 'amount', 'savings_balance', 'employment_length', 'residence_history', 'age']


In [21]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled = scaler.fit_transform(numeric_df[scalable_cols])
numeric_df[scalable_cols] = scaled

In [23]:
# For separting data and target
x_col_list = list(filter(lambda x: not x.startswith('default'),numeric_df.columns))
print(len(x_col_list))

20


In [24]:
from sklearn.model_selection import train_test_split
Y = numeric_df['default'].values
X = numeric_df[x_col_list].values

x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size= 0.1, random_state=47)
print(f"x_train: {x_train.shape}, x_test: {x_test.shape}, y_train: {y_train.shape}, y_test:{y_test.shape}")

x_train: (900, 20), x_test: (100, 20), y_train: (900,), y_test:(100,)


### Model Building

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

model = RandomForestClassifier ()
model.fit(x_train,y_train)

y_pred = model.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc}")

Accuracy: 0.79


In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
clf_name_list = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'Grad Boost']
clf_list = [LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier(), GradientBoostingClassifier() ]

In [31]:
from time import time
for clf, name in zip(clf_list, clf_name_list):
    tic = time()
    clf.fit(x_train, y_train)
    print(f'{name} training took {time()-tic}')
    y_pred = clf.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print(f"Accuracy of {name}: {accuracy:.2f}")
    print("*"*30)

Logistic Regression training took 0.01800847053527832
Accuracy of Logistic Regression: 0.81
******************************
Decision Tree training took 0.0060007572174072266
Accuracy of Decision Tree: 0.70
******************************
Random Forest training took 0.29697155952453613
Accuracy of Random Forest: 0.80
******************************
Grad Boost training took 0.28000760078430176
Accuracy of Grad Boost: 0.80
******************************
