# Import libraries

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#####
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
#####
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
#####
from sklearn import metrics
#####
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
####
import warnings

# Import the data and make a data frame

In [None]:
# Location of the bank personal-loan CSV inside the Kaggle input directory
csv_path = '/kaggle/input/bank-personal-loan-modelling/Bank_Personal_Loan_Modelling.csv'
# Read the raw file, then wrap the result in the working data frame
data = pd.read_csv(csv_path)
df = pd.DataFrame(data=data)
# Show the frame as the cell output
df

# The machine cannot understand the format of CCAvg, so we change it to float type

In [None]:
# CCAvg presumably arrives as text that uses '/' where a decimal point belongs.
# Swap the separator for '.' and then cast to float: without the cast the
# column stays a string column and is not treated as numeric downstream.
df['CCAvg'] = (
    df['CCAvg']
    .astype(str)
    .str.replace('/', '.', regex=False)  # literal replacement, no regex needed
    .astype(float)
)
# Show the frame as the cell output
df

# Experience cannot be negative, so we replace it with its absolute value

In [None]:
# Replace every Experience value by its magnitude so no negative values remain
df['Experience'] = df['Experience'].abs()
# Show the frame as the cell output
df

# Check the number of missing values

In [None]:
# Per-column count of missing entries
df.isna().sum()

Well, the data has no missing values.

# Correlation

In [None]:
# Pairwise Pearson correlation between the columns (pandas' default method)
df.corr(method='pearson')

# Correlation plot

In [None]:
# Compute the correlation matrix once, then draw it as a clustered heatmap
# with every cell annotated to one decimal place
corr_matrix = df.corr()
sns.clustermap(corr_matrix, annot=True, fmt=".1f")

# Check the number of records for each value of every column (feature)

In [None]:
# All columns of interest, target included
featuresAndTarget = ['Age','Experience','Income','Family','CCAvg','Education','Mortgage','Personal Loan','Securities Account'
                     ,'CD Account' ,'Online' , 'CreditCard' ]
# Predictor columns that get one count plot each
features = ['Age',
            'Experience',
            'Income',
            'Family',
            'CCAvg',
            'Education',
            'Mortgage',
            'Securities Account',
            'CD Account',
            'Online',
            'CreditCard']

# Column the models will predict
target = 'Personal Loan'

# Two plots per row; ceiling division gives just enough rows for all features
n_cols = 2
n_rows = -(-len(features) // n_cols)
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15,15), dpi=100,
                       squeeze=False)  # always a 2-D grid, whatever the size

# Pair every feature with the next free axis of the grid (row-major order)
for feature, axis in zip(features, ax.flat):
    sns.countplot(x=feature, data=df, ax=axis)
    axis.set_xlabel(feature, size=12)
    axis.set_title('{} vs. {}'.format(target, feature), size=15)

# An odd number of features leaves a trailing grid cell empty; hide it
for axis in ax.flat[len(features):]:
    axis.set_visible(False)

plt.tight_layout()

# Draw scatterplots

In [None]:
# Two plots per row; ceiling division gives just enough rows for all features
n_cols = 2
n_rows = -(-len(features) // n_cols)
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15,15), dpi=100,
                       squeeze=False)  # always a 2-D grid, whatever the size

# One scatter plot of the target against each feature, filled row by row
for var, axis in zip(features, ax.flat):
    sns.scatterplot(data=df, x=var, y=target, ax=axis)
    axis.set_xlabel(var, size=12)
    axis.set_title('{} vs. {}'.format(target, var), size=15)

# An odd number of features leaves a trailing grid cell empty; hide it
for axis in ax.flat[len(features):]:
    axis.set_visible(False)

plt.tight_layout()

## Categorical features

In [None]:
# Columns that are inspected with box plots in the next cell.
# NOTE(review): despite the name, these look like numeric columns - confirm.
categrical = [
    'Age',
    'Experience',
    'Income',
    'Family',
]

In [None]:
# Draw the box plots two per figure (left and right panel).
# Stepping through the list in slices of two removes the hard-coded length
# and keeps working when the number of columns is odd: the last figure then
# simply holds a single panel instead of raising an IndexError.
for start in range(0, len(categrical), 2):
    for panel, column in enumerate(categrical[start:start + 2], start=1):
        plt.subplot(1, 2, panel)
        sns.boxplot(x=column, data=df)

    plt.show()

# The Income feature contains outliers, and we should remove them
### The outliers were found with the box plot in the cell above

In [None]:
# Index labels of the rows whose Income exceeds the 160 cut-off
df1 = df.index[df['Income'] > 160]
# Remove exactly those rows from the working frame
df = df.drop(index=df1)
# Re-draw the Income box plot on the reduced frame
sns.boxplot(x='Income', data=df)

# Make predictor model

### We will do this with 4 algorithms

### 1- LogisticRegression
### 2- GaussianNB
### 3- KNeighborsClassifier (KNN)
### 4-ComplementNB

### In the end, we use the algorithm that has the best score

# First, we prepare the data


## First we separate the features from the target and use train_test_split, then we will prepare the evaluation

In [None]:
#Features
x = df.drop('Personal Loan' , axis=1) 
#Target
y = df['Personal Loan']
#Prepare the train data and test data and the size for testing
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size=0.3 , random_state=0)

## Shape of x_train , x_test & y_train , y_test

In [None]:
# Caption / data pairs, reported in the same order as the split returned them
shape_report = (
    ('x_train shape : ', x_train),
    ('x_test shape : ', x_test),
    ('y_train shape : ', y_train),
    ('y_test shape : ', y_test),
)
for caption, subset in shape_report:
    print(caption, subset.shape)

## Build the classifiers for the evaluation

## 1- Make LogisticRegression model

### Scale x_train and x_test

In [None]:
# Learn mean and standard deviation from the training features only ...
Scaler = StandardScaler().fit(x_train)
# ... and apply that same transformation to both partitions
x_train_scaled = Scaler.transform(x_train)
x_test_scaled = Scaler.transform(x_test)

In [None]:
# Fit a logistic regression on the standardized training data
LR = LogisticRegression().fit(x_train_scaled, y_train)
# Score it by plain accuracy on the standardized test data
y_pred = LR.predict(x_test_scaled)
LR_score = metrics.accuracy_score(y_test, y_pred)
print('accuracy_score : ', LR_score)

## 2- Make KNN model

In [None]:
# Largest neighbourhood size to try
k = 50

# Choose n_neighbors by 5-fold cross-validation on the TRAINING data only.
# Picking k by test-set accuracy would make the reported score optimistic,
# and a plain loop would leave `Knn` holding the last model tried (k=50)
# rather than the model the reported score belongs to.
knn_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid={'n_neighbors': list(range(1, k + 1))},
    scoring='accuracy',
    cv=5,
)
knn_search.fit(x_train_scaled, y_train)

# Mean cross-validated accuracy for every candidate k (index 0 is k=1)
acc = knn_search.cv_results_['mean_test_score']

# Best model, refitted on the whole training set; later cells predict with it
Knn = knn_search.best_estimator_

# Accuracy of that single selected model on the untouched test set
y_pred = Knn.predict(x_test_scaled)
Knn_score = metrics.accuracy_score(y_test, y_pred)
print('accuracy_score', Knn_score)

## 3-Make GaussianNB model

In [None]:
# Gaussian naive Bayes, trained on the unscaled features
G_NB = GaussianNB().fit(x_train, y_train)
# Accuracy on the unscaled test features
y_pred = G_NB.predict(x_test)
G_NB_score = metrics.accuracy_score(y_test, y_pred)
print('accuracy_score', G_NB_score)

## 4-ComplementNB

In [None]:
# Min-max scaling for ComplementNB: learn the ranges on the training
# features, then apply the same mapping to both partitions
scaler = MinMaxScaler().fit(x_train)
RX_train = scaler.transform(x_train)
RX_test = scaler.transform(x_test)

In [None]:
# Complement naive Bayes, trained on the min-max scaled features
CMP = ComplementNB().fit(RX_train, y_train)
# Accuracy on the min-max scaled test features
y_pred = CMP.predict(RX_test)
CMP_score = metrics.accuracy_score(y_test, y_pred)
print('accuracy_score', CMP_score)

# Compare the models

In [None]:
# Label / score pairs separated by newlines, printed in a single call
comparison = (
    'LogisticRegression: ', LR_score, '\n',
    'KNN: ', Knn_score, '\n',
    'GaussianNB: ', G_NB_score, '\n',
    'ComplementNB: ', CMP_score,
)
print(*comparison)

# Make a prediction for data from outside the dataframe
### First, we make a dataframe with the data we want a prediction for
### Finally, we make the prediction with the 4 models

## Make the prediction dataframe

In [None]:
# A single new customer, with the same columns as the training features
predict_df = pd.DataFrame({'ID':[5071],
                          'Age':[42],
                          'Experience':[16],
                          'Income':[30],
                          'ZIP Code':[92037],
                          'Family':[3],
                          'CCAvg':[1.2],
                          'Education':[3],
                          'Mortgage':[0],
                          'Securities Account':[1],
                          'CD Account':[0],
                          'Online':[1],
                          'CreditCard':[1]})

# Standardize with the statistics learned from the training data.
# Use transform(), not fit_transform(): refitting on this one row would give
# every column zero spread, turn the row into all zeros, and overwrite the
# training statistics stored in Scaler.
predict_df_scaled = Scaler.transform(predict_df)

## Do prediction

In [None]:
# Each model must receive the new row in the representation it was trained on.
# LogisticRegression and KNN were fitted on the standardized features
print('predict with LogisticRegression ( The best model ): ', LR.predict(predict_df_scaled))
print('predict with KNN : ', Knn.predict(predict_df_scaled))
# GaussianNB was fitted on the raw, unscaled features
print('predict with GaussianNB : ', G_NB.predict(predict_df))
# ComplementNB was fitted on min-max scaled features, so the raw row has to go
# through the same fitted MinMaxScaler before prediction
print('predict with ComplementNB : ', CMP.predict(scaler.transform(predict_df)))