In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

# from sklearn.preprocessing import LabelEncoder,OneHotEncoder #this is optional

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from scipy.stats import zscore

from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,GridSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression,Lasso, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix,r2_score,roc_auc_score,classification_report,mean_squared_error,accuracy_score


import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('D:/datasets+minipro/Vidhya k SLC/Bank_Personal_Loan_Modelling.csv')

In [3]:
df.isnull().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [4]:
df.drop('ID',axis=1,inplace=True)

df.drop('ZIP Code',axis=1,inplace=True)

# Separating X & y:

In [5]:
X=df.drop('Personal Loan',axis=1)
y=df[['Personal Loan']]

# Splitting using train test:

In [6]:
#Splitting X&y using train_test:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.2,random_state=8)

# Logistic Regression:

In [7]:
#logreg = LogisticRegression(penalty="l1",class_weight="balanced",random_state=8) # logistic regression using lasso
logreg = LogisticRegression(penalty="l2",class_weight="balanced",random_state=8) # logistic regression using Ridge,by default ridge is used
logreg.fit(X_train,y_train)
y_tr_pred=logreg.predict(X_train)
y_test_pred=logreg.predict(X_test)
y_prob = logreg.predict_proba(X_test)
print(pd.DataFrame(y_prob[:,1]).head())  #### head of probability predicted

# training data & testing data accuracy:
print('roc_auc_score on training data: ',roc_auc_score(y_train,y_tr_pred)) #train
print('roc_auc_score on testing data: ',roc_auc_score(y_test,y_test_pred)) #test

          0
0  0.172164
1  0.164152
2  0.086702
3  0.014368
4  0.048675
roc_auc_score on training data:  0.8925367526184774
roc_auc_score on testing data:  0.8970209478021977


#### Evaluation of Logistic regression using metrics:

In [8]:
# confusion matrix:
cm0 = confusion_matrix(y_test, y_test_pred)
print('Confusion matrix:')
print(cm0)
print('----------------------------------------------')

#Classification report of precision,accuracy,recall:
#optional #cr=classification_report(y_test, y_pred)
#optional#print(cr)
print('classification report:')
print(classification_report(y_test,y_test_pred))
print('----------------------------------------------')

from sklearn.metrics import log_loss
print('log loss:')
print(log_loss(y_test,y_prob))

Confusion matrix:
[[789 107]
 [  9  95]]
----------------------------------------------
classification report:
              precision    recall  f1-score   support

           0       0.99      0.88      0.93       896
           1       0.47      0.91      0.62       104

    accuracy                           0.88      1000
   macro avg       0.73      0.90      0.78      1000
weighted avg       0.93      0.88      0.90      1000

----------------------------------------------
log loss:
0.28202695584985393


# Decision Tree classifier:

In [9]:
# Fitting the decision tree with default hyperparameters, apart from
# max_depth which is 5 so that we can plot and read the tree.
dt = DecisionTreeClassifier(max_depth=5)
dt.fit(X_train, y_train)
y_pred=dt.predict(X_test)

#### Evaluation of Decision Tree using metrics:

In [10]:
cm1=confusion_matrix(y_test, y_pred)

In [11]:
# dt.score(y_test,y_pred)

print('classification report:')
print(classification_report(y_test, y_pred))
print('----------------------------------------------')
# Printing confusion matrix and accuracy
print('Confusion matrix:')
print(cm1)
print('----------------------------------------------')
print('accuracy score:')
print(accuracy_score(y_test,y_pred))

classification report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       896
           1       0.97      0.84      0.90       104

    accuracy                           0.98      1000
   macro avg       0.97      0.92      0.94      1000
weighted avg       0.98      0.98      0.98      1000

----------------------------------------------
Confusion matrix:
[[893   3]
 [ 17  87]]
----------------------------------------------
accuracy score:
0.98


# KNN classifier:

In [12]:
''' 
-requires large memory for storing the entire training dataset for prediction. 
 KNN requires ***scaling of data[imp]***** because KNN uses the Euclidean distance between two
 data points to find nearest neighbors.
-Euclidean distance is sensitive to magnitudes. 
-The features with high magnitudes will weight more than features with low magnitudes.
-KNN also not suitable for large dimensional data.
'''

'''
-The training phase of K-nearest neighbor classification is much faster compared to 
 other classification algorithms. There is no need to train a model for generalization,
-That is why KNN is known as the simple and instance-based learning algorithm.
-KNN can be useful in case of nonlinear data. It can be used with the regression problem. 
 Output value for the object is computed by the average of k closest neighbors value.
'''



from sklearn.neighbors import KNeighborsClassifier

In [41]:
l=[]
for i in range(1,11,1):
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    print('num of neighbours',i,'is: ',accuracy_score(y_test, y_pred))
    l.append(accuracy_score(y_test, y_pred))
print('Max score by changing value of k: ',max(l))


# changing to misclassification error
MSE = [1 - x for x in l]
print('minimum value of mean square error: ',min(MSE))
optimal_k = [MSE.index(min(MSE))]
print('optimum value for k is at index:',optimal_k)

num of neighbours 1 is:  0.902
num of neighbours 2 is:  0.913
num of neighbours 3 is:  0.907
num of neighbours 4 is:  0.909
num of neighbours 5 is:  0.903
num of neighbours 6 is:  0.905
num of neighbours 7 is:  0.905
num of neighbours 8 is:  0.907
num of neighbours 9 is:  0.905
num of neighbours 10 is:  0.901
Max score by changing value of k:  0.913
minimum value of mean square error:  0.08699999999999997
optimum value for k is at index: [1]


In [14]:
    knn = KNeighborsClassifier(n_neighbors = 2)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    print('num of neighbours' ,'2 accuracy ''is: ',accuracy_score(y_test, y_pred))

num of neighbours 2 accuracy is:  0.913


# Random Forest Classifier:

In [15]:
from sklearn.ensemble import RandomForestClassifier
rfc= RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc.score(X_test,y_test)

0.99

In [16]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [17]:
### Important features
feat_imp_df=pd.DataFrame({'features':X_train.columns,'importance':rfc.feature_importances_})
feat_imp_df.sort_values('importance',ascending=False)

Unnamed: 0,features,importance
2,Income,0.324735
5,Education,0.193552
4,CCAvg,0.17608
3,Family,0.111726
8,CD Account,0.056558
1,Experience,0.040973
0,Age,0.038709
6,Mortgage,0.036158
10,CreditCard,0.008542
9,Online,0.007959


In [18]:
n_estimators=range(100,700,100)
criterion=['gini','entropy']
hyper={'n_estimators':n_estimators,
      'criterion':criterion}

### Grid search CV for Random forest classifier:

gd=GridSearchCV(estimator=RandomForestClassifier(random_state=8),param_grid=hyper,verbose=True)
gd.fit(X_train,y_train)
print(gd.best_score_)
print(gd.best_estimator_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  1.6min finished


0.9880000000000001
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=8, verbose=0,
                       warm_start=False)


# Bagging classifier:

### 1-Bagging classifier with knn and using K fold cross validation:

In [19]:
from sklearn.ensemble import BaggingClassifier
bgcl_knn=BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=3),random_state=8,
                           n_estimators=200)
bgcl_knn.fit(X_train,y_train)
prediction0=bgcl_knn.predict(X_test)
print('The accuracy for bagged KNN is:',accuracy_score(y_test,prediction0))
result=cross_val_score(bgcl_knn,X,y,cv=10,scoring='accuracy')
print('The cross validated score for bagged KNN is:',result.mean())

The accuracy for bagged KNN is: 0.906
The cross validated score for bagged KNN is: 0.9128000000000001


### 2-Bagging classifier with Decision Tree and using K fold cross validation:

In [20]:
## Even if we dont mention anything to base_estimator,default is decisiontreeclassifier

bgcl_dt=BaggingClassifier(base_estimator=DecisionTreeClassifier(),random_state=8
                        ,n_estimators=100)
bgcl_dt.fit(X_train,y_train)
prediction1=bgcl_dt.predict(X_test)
print('The accuracy for bagged Decision Tree is:',accuracy_score(prediction1,y_test))
result=cross_val_score(bgcl_dt,X,y,cv=10,scoring='accuracy')
print('The cross validated score for bagged Decision Tree is:',result.mean())

The accuracy for bagged Decision Tree is: 0.989
The cross validated score for bagged Decision Tree is: 0.9870000000000001


### 3-Bagging classifier with Random Forest and using K fold cross validation:

In [21]:
#### this code takes a lot of time to run computationally #### 
bgcl_rfc=BaggingClassifier(base_estimator=RandomForestClassifier(),random_state=8 ,n_estimators=50)
bgcl_rfc.fit(X_train,y_train)
prediction2=bgcl_rfc.predict(X_test)
print('The accuracy for bagged Decision Tree is:',accuracy_score(prediction2,y_test))

### Warning!!!!!!!!!!!!!!!!!!!!! ########
#the below mentioned 2 lines of code takes a lot of time to run,so avoid running k fold/gridsearch CV for rfc
# result=cross_val_score(bgcl_rfc,X,y,cv=10,scoring='accuracy')
# print('The cross validated score for bagged Decision Tree is:',result.mean())

The accuracy for bagged Decision Tree is: 0.986


### 4-Bagging classifier with Naivebayes[gaussian] and using K fold cross validation:

In [49]:
from sklearn.naive_bayes import GaussianNB

In [51]:
bgcl_nb=BaggingClassifier(base_estimator=GaussianNB(),random_state=8,n_estimators=51)
bgcl_nb.fit(X_train,y_train)
prediction3=bgcl_nb.predict(X_test)
print('The accuracy for Naive bayes is:',accuracy_score(prediction3,y_test))
# print('The accuracy for bagged Decision Tree is:',accuracy_score(prediction1,y_test))
# result=cross_val_score(bgcl_nb,X,y,cv=10,scoring='accuracy')
# print('The cross validated score for bagged Decision Tree is:',result.mean())

The accuracy for Naive bayes is: 0.873


#### 5-Bagging classifier with Logistic Regression and using K fold cross validation:

In [52]:
bgcl_LR=BaggingClassifier(base_estimator=LogisticRegression(),random_state=8,n_estimators=50)
bgcl_LR.fit(X_train,y_train)
prediction4=bgcl_LR.predict(X_test)
print('The accuracy for bagged Logistic Regression is:',accuracy_score(prediction4,y_test))
# print('The accuracy for bagged Decision Tree is:',accuracy_score(prediction1,y_test))
# result=cross_val_score(bgcl_LR,X,y,cv=10,scoring='accuracy')
# print('The cross validated score for bagged Logistic Regression is:',result.mean())

The accuracy for bagged Logistic Regression is: 0.986


# Boosting models:

### 1- Adaptive boosting/Adaboost :

In [48]:
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier(random_state=1)
abc.fit(X_train, y_train)
print('score wrto xtest,ytest:',abc.score(X_test,y_test))
y_pred=abc.predict(X_test)
print('score wrto ytest,ypred:',accuracy_score(y_test,y_pred)) ## actual measure of how accurate is model!

score wrto xtest,ytest: 0.968
score wrto ytest,ypred: 0.968


#### Evaluation using cross validation:

In [23]:
###### cross validation of ada boost classifier using ***k fold*** ########
from sklearn.ensemble import AdaBoostClassifier
ada=AdaBoostClassifier(n_estimators=200,random_state=0,learning_rate=0.1)
result=cross_val_score(ada,X,y,cv=10,scoring='accuracy')
print('result of all validations: ',result)
print('The cross validated score for AdaBoost is:',result.mean())

result of all validations:  [0.968 0.978 0.964 0.97  0.966 0.972 0.968 0.974 0.962 0.968]
The cross validated score for AdaBoost is: 0.969


In [24]:
### Warning ########
#the below mentioned lines of code takes a lot of time to run!!!!!!!!!!!!!!!!!!!!!1

###### cross validation of ada boost classifier using grid search CV ########[Use this at ur own risk!]
'''
n_estimators=list(range(100,500,100))
learn_rate=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
hyper={'n_estimators':n_estimators,'learning_rate':learn_rate}
gd=GridSearchCV(estimator=AdaBoostClassifier(),param_grid=hyper,verbose=True)
gd.fit(X_train,y_train)
print(gd.best_score_)
print(gd.best_estimator_)
'''

"\nn_estimators=list(range(100,500,100))\nlearn_rate=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]\nhyper={'n_estimators':n_estimators,'learning_rate':learn_rate}\ngd=GridSearchCV(estimator=AdaBoostClassifier(),param_grid=hyper,verbose=True)\ngd.fit(X_train,y_train)\nprint(gd.best_score_)\nprint(gd.best_estimator_)\n"

### 2- Gradient Boosting : We cant apply any other model than decision tree [it uses dt by default]

##### Warning :    We only use dt by default,no base_estimator argument in gradient boosting

In [25]:
from sklearn.ensemble import GradientBoostingClassifier
gbc= GradientBoostingClassifier(learning_rate=0.01,random_state=8)
#learning rate should be between 0 to 1,good sp is 0.01 or 0.1
gbc.fit(X_train, y_train)
gbc.score(X_test,y_test)

0.979

#### evaluation using cross validation:

In [26]:
###### cross validation of ada boost classifier using ***k fold*** ########
grad=GradientBoostingClassifier(n_estimators=100,random_state=8,learning_rate=0.1)
result=cross_val_score(grad,X,y,cv=10,scoring='accuracy')
print('result of all validations: ',result)
print('The cross validated score for Gradient Boosting is:',result.mean())

result of all validations:  [0.988 0.984 0.988 0.986 0.984 0.982 0.988 0.99  0.988 0.984]
The cross validated score for Gradient Boosting is: 0.9862


In [27]:
### Warning ########
#the below mentioned lines of code takes a lot of time to run!!!!!!!!!!!!!!!!!!!!!1

###### cross validation of ada boost classifier using grid search CV ########[Use this at ur own risk!]
'''
n_estimators=list(range(100,500,100))
learn_rate=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
hyper={'n_estimators':n_estimators,'learning_rate':learn_rate}
gd=GridSearchCV(estimator=GradientBoostingClassifier(),param_grid=hyper,verbose=True)
gd.fit(X_train,y_train)
print(gd.best_score_)
print(gd.best_estimator_)
'''

"\nn_estimators=list(range(100,500,100))\nlearn_rate=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]\nhyper={'n_estimators':n_estimators,'learning_rate':learn_rate}\ngd=GridSearchCV(estimator=GradientBoostingClassifier(),param_grid=hyper,verbose=True)\ngd.fit(X_train,y_train)\nprint(gd.best_score_)\nprint(gd.best_estimator_)\n"

## statistical model:

In [None]:
# X=df.drop('Personal Loan',axis=1)
# y=df[['Personal Loan']]


# import statsmodels.api as sm
# X = sm.add_constant(X)
# model = sm.OLS(y, X).fit()
# model.summary()

# Unsupervised learning:

### 1-Hierarchial Model: