## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import warnings

#### Import the CSV Data as Pandas DataFrame

In [2]:
df=pd.read_csv('data/german_credit_data.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [4]:
df=df.rename(columns={'Unnamed: 0':'Id'})

In [5]:
df.head()

Unnamed: 0,Id,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [6]:
df=df.fillna({'Saving accounts':df['Saving accounts'].mode()[0],'Checking account':df['Checking account'].mode()[0]})

In [7]:
df.isna().sum()

Id                  0
Age                 0
Sex                 0
Job                 0
Housing             0
Saving accounts     0
Checking account    0
Credit amount       0
Duration            0
Purpose             0
Risk                0
dtype: int64

In [8]:
df.duplicated().sum()

0

In [9]:
df.columns

Index(['Id', 'Age', 'Sex', 'Job', 'Housing', 'Saving accounts',
       'Checking account', 'Credit amount', 'Duration', 'Purpose', 'Risk'],
      dtype='object')

In [10]:
print("Categories in 'Sex' variable:     ",end=" " )
print(df['Sex'].unique())

print("Categories in 'Housing' variable:  ",end=" ")
print(df['Housing'].unique())

print("Categories in 'Saving accounts' variable:",end=" " )
print(df['Saving accounts'].unique())

print("Categories in 'Checking account' variable:     ",end=" " )
print(df['Checking account'].unique())

print("Categories in 'Purpose' variable:     ",end=" " )
print(df['Purpose'].unique())

print("Categories in 'Risk' variable:     ",end=" " )
print(df['Risk'].unique())

Categories in 'Sex' variable:      ['male' 'female']
Categories in 'Housing' variable:   ['own' 'free' 'rent']
Categories in 'Saving accounts' variable: ['little' 'quite rich' 'rich' 'moderate']
Categories in 'Checking account' variable:      ['little' 'moderate' 'rich']
Categories in 'Purpose' variable:      ['radio/TV' 'education' 'furniture/equipment' 'car' 'business'
 'domestic appliances' 'repairs' 'vacation/others']
Categories in 'Risk' variable:      ['good' 'bad']


In [11]:
# define numerical & categorical columns
numeric_features = [feature for feature in df.columns if df[feature].dtype != 'O']
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']

# print columns
print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

We have 5 numerical features : ['Id', 'Age', 'Job', 'Credit amount', 'Duration']

We have 6 categorical features : ['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose', 'Risk']


#### Preparing X and Y variables

In [12]:
X = df.drop(columns=['Risk'],axis=1)

In [13]:
X.head()

Unnamed: 0,Id,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,0,67,male,2,own,little,little,1169,6,radio/TV
1,1,22,female,2,own,little,moderate,5951,48,radio/TV
2,2,49,male,1,own,little,little,2096,12,education
3,3,45,male,2,free,little,little,7882,42,furniture/equipment
4,4,53,male,2,free,little,little,4870,24,car


In [14]:
df['Risk'] = df['Risk'].map({'bad': 0, 'good': 1})

In [15]:
y = df['Risk']

In [16]:
y

0      1
1      0
2      1
3      1
4      0
      ..
995    1
996    1
997    1
998    0
999    1
Name: Risk, Length: 1000, dtype: int64

In [17]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [18]:
X = preprocessor.fit_transform(X)

In [19]:
X.shape

(1000, 25)

In [20]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((800, 25), (200, 25))

#### Create an Evaluate Function to give all metrics after model Training

In [21]:
def evaluate_model(true, predicted):
    accuracy_s = accuracy_score(true, predicted)
    confusion_matrx = confusion_matrix(true, predicted)
    classification_repo = classification_report(true, predicted)
    return accuracy_s, confusion_matrx , classification_repo

In [22]:
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(), 
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier()
}
model_list = []
accuracy_score_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_accuracy_s , model_train_confusion_matrx, model_train_classification_repo = evaluate_model(y_train, y_train_pred)

    model_test_accuracy_s , model_test_confusion_matrx, model_test_classification_repo  = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy score: {:.4f}".format(model_train_accuracy_s))
    print("- Confusion matrix: " , model_train_confusion_matrx)
    print("- Classification report: " , model_train_classification_repo)

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Accuracy score: {:.4f}".format(model_test_accuracy_s))
    print("- Confusion matrix: ", model_test_confusion_matrx)
    print("- Classification report: ", model_test_classification_repo)
    accuracy_score_list.append(model_test_accuracy_s)
    
    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
- Accuracy score: 0.7238
- Confusion matrix:  [[ 62 179]
 [ 42 517]]
- Classification report:                precision    recall  f1-score   support

           0       0.60      0.26      0.36       241
           1       0.74      0.92      0.82       559

    accuracy                           0.72       800
   macro avg       0.67      0.59      0.59       800
weighted avg       0.70      0.72      0.68       800

----------------------------------
Model performance for Test set
- Accuracy score: 0.7100
- Confusion matrix:  [[ 13  46]
 [ 12 129]]
- Classification report:                precision    recall  f1-score   support

           0       0.52      0.22      0.31        59
           1       0.74      0.91      0.82       141

    accuracy                           0.71       200
   macro avg       0.63      0.57      0.56       200
weighted avg       0.67      0.71      0.67       200





K-Neighbors Classifier
Model performance for Training set
- Accuracy score: 0.7700
- Confusion matrix:  [[105 136]
 [ 48 511]]
- Classification report:                precision    recall  f1-score   support

           0       0.69      0.44      0.53       241
           1       0.79      0.91      0.85       559

    accuracy                           0.77       800
   macro avg       0.74      0.67      0.69       800
weighted avg       0.76      0.77      0.75       800

----------------------------------
Model performance for Test set
- Accuracy score: 0.6500
- Confusion matrix:  [[ 11  48]
 [ 22 119]]
- Classification report:                precision    recall  f1-score   support

           0       0.33      0.19      0.24        59
           1       0.71      0.84      0.77       141

    accuracy                           0.65       200
   macro avg       0.52      0.52      0.51       200
weighted avg       0.60      0.65      0.62       200



Decision Tree
Model performanc

  if is_sparse(data):


CatBoosting Classifier
Model performance for Training set
- Accuracy score: 0.9175
- Confusion matrix:  [[176  65]
 [  1 558]]
- Classification report:                precision    recall  f1-score   support

           0       0.99      0.73      0.84       241
           1       0.90      1.00      0.94       559

    accuracy                           0.92       800
   macro avg       0.95      0.86      0.89       800
weighted avg       0.93      0.92      0.91       800

----------------------------------
Model performance for Test set
- Accuracy score: 0.7000
- Confusion matrix:  [[ 11  48]
 [ 12 129]]
- Classification report:                precision    recall  f1-score   support

           0       0.48      0.19      0.27        59
           1       0.73      0.91      0.81       141

    accuracy                           0.70       200
   macro avg       0.60      0.55      0.54       200
weighted avg       0.65      0.70      0.65       200



AdaBoost Classifier
Model perf

## Results

In [23]:
pd.DataFrame(list(zip(model_list, accuracy_score_list)), columns=['Model Name', 'Accuracy_Score']).sort_values(by=["Accuracy_Score"],ascending=False)

Unnamed: 0,Model Name,Accuracy_Score
0,Logistic Regression,0.71
3,Random Forest Classifier,0.71
6,AdaBoost Classifier,0.71
5,CatBoosting Classifier,0.7
4,XGBClassifier,0.67
1,K-Neighbors Classifier,0.65
2,Decision Tree,0.61


## Logistic Regression

In [24]:
log_reg_model = LogisticRegression()
log_reg_model = log_reg_model.fit(X_train, y_train)
y_pred = log_reg_model.predict(X_test)
score = accuracy_score(y_test, y_pred)*100
print(" Accuracy of the model is %.2f" %score)

 Accuracy of the model is 71.00


#### Difference between Actual and Predicted Values

In [25]:
pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':y_pred,'Difference':y_test-y_pred})
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
521,0,1,-1
737,1,1,0
740,1,1,0
660,1,1,0
411,1,1,0
...,...,...,...
408,1,1,0
332,0,0,0
208,1,1,0
613,1,0,1
