# Mortgage-Backed Securities Prepayment Risk Prediction

## Importing Data & Needed Libraries

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
import networkx as nx
import statsmodels.api as sm
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
import matplotlib
warnings.filterwarnings("always")

%matplotlib inline 

In [2]:
data = pd.read_csv('LoanExport.csv')
data.head(20)

  data = pd.read_csv('LoanExport.csv')


Unnamed: 0,CreditScore,FirstPaymentDate,FirstTimeHomebuyer,MaturityDate,MSA,MIP,Units,Occupancy,OCLTV,DTI,...,PostalCode,LoanSeqNum,LoanPurpose,OrigLoanTerm,NumBorrowers,SellerName,ServicerName,EverDelinquent,MonthsDelinquent,MonthsInRepayment
0,0,199902,N,202901,16974,25,1,O,89,27,...,60400,F199Q1268030,P,360,2,FL,WASHINGTONMUTUALBANK,0,0,52
1,0,199902,N,202901,19740,0,1,O,73,17,...,80200,F199Q1015092,N,360,1,FT,CHASEHOMEFINANCELLC,0,0,144
2,0,199902,N,202901,29940,0,1,O,75,16,...,66000,F199Q1266886,N,360,2,FL,WASHINGTONMUTUALBANK,0,0,67
3,0,199902,N,202901,31084,0,1,O,76,14,...,90700,F199Q1178167,N,360,2,GM,GMACMTGECORP,0,0,35
4,0,199902,N,202901,35644,0,1,O,78,18,...,7600,F199Q1178517,N,360,2,GM,GMACMTGECORP,0,0,54
5,0,199902,N,202901,X,25,1,O,89,40,...,80400,F199Q1224802,N,360,2,Ot,Other servicers,0,0,42
6,0,199902,N,202901,X,25,1,O,90,21,...,53100,F199Q1291436,P,360,1,RE,Other servicers,0,0,44
7,0,199902,X,202901,36740,0,1,O,72,20,...,32800,F199Q1001824,N,360,1,NO,WELLSFARGOBANKNA,1,3,145
8,0,199902,Y,202710,25540,30,1,O,95,38,...,6400,F199Q1079744,P,345,1,Ot,Other servicers,0,0,64
9,0,199902,Y,202901,48620,30,1,O,95,27,...,67000,F199Q1196318,P,360,1,Ot,WELLSFARGOBANKNA,1,104,212


## Defining Needed Functions for EDA

In [3]:
def draw_histogram(col,plottitle,xlabel,ylabel):
  """Plot a histogram of ``data[col]`` on a wide (30x10 inch) figure.

  Used during EDA to see how often each value of a column occurs.

  Parameters
  ----------
  col : label of a column in the notebook-level ``data`` DataFrame.
  plottitle : text shown as the figure title.
  xlabel, ylabel : text for the two axis labels.
  """
  _, axis = plt.subplots(figsize=(30,10))
  axis.hist(data[col],edgecolor='black')
  axis.set_title(plottitle)
  axis.set_xlabel(xlabel)
  axis.set_ylabel(ylabel)
  plt.show()

In [4]:
def draw_boxplot(col):
  """Draw a box plot of ``data[col]`` along the x axis to eyeball outliers.

  Parameters
  ----------
  col : label of a numeric column in the notebook-level ``data`` DataFrame.
  """
  # Pass the series by keyword: the meaning of the first positional argument
  # of sns.boxplot differs between seaborn releases (x vs data), which would
  # otherwise change the plot orientation depending on the installed version.
  sns.boxplot(x=data[col])


In [5]:
def draw_piechart(col,label=None):
  """Draw a pie chart of the value frequencies of ``data[col]``.

  Parameters
  ----------
  col : label of a column in the notebook-level ``data`` DataFrame.
  label : optional sequence of wedge labels. It must list one label per
      distinct value, in the order produced by ``value_counts()`` (most
      frequent first). When omitted, the distinct values themselves are used,
      so the labels cannot get out of step with the wedges.
  """
  counts = data[col].value_counts()
  if label is None:
    # Use the actual category values, already in the same order as the wedges.
    label = counts.index.astype(str)
  plt.figure(figsize=(10,10))
  plt.pie(counts,shadow=True ,autopct='%.3f',labels=label)
  plt.show()

In [6]:
def draw_scatterplot(col1,col2):
  """Scatter ``data[col1]`` (x axis) against ``data[col2]`` (y axis)."""
  x_values = data[col1]
  y_values = data[col2]
  plt.scatter(x_values, y_values)


In [7]:
def calc_linearsummary(feature, target):
  """Fit an OLS regression of ``data[target]`` on ``data[feature]`` and print its summary.

  An intercept column is added to the regressor(s) before fitting.

  Parameters
  ----------
  feature : column label (or list of labels) used as the regressor(s).
  target : column label used as the response.
  """
  response = data[target]
  design = sm.add_constant(data[feature])
  fitted = sm.OLS(response, design).fit()
  print(fitted.summary())

In [8]:
def draw_join_plot(colx,coly):
  """Draw a seaborn joint plot of ``colx`` against ``coly`` with a regression fit.

  Parameters
  ----------
  colx, coly : the data to plot on the x and y axes. They are forwarded to
      ``sns.jointplot`` unchanged (the notebook passes Series, not labels).
  """
  plot_args = {'x': colx, 'y': coly, 'kind': 'reg'}
  sns.jointplot(**plot_args)

In [9]:
def handle_outlier(col):
  """Compute the 1.5 * IQR outlier fences of a numeric pandas Series.

  The function only computes the fences; it does not modify ``col``.

  Parameters
  ----------
  col : numeric pandas Series.

  Returns
  -------
  (lower, upper) : tuple, in that order.
      ``lower = Q1 - 1.5 * IQR`` and ``upper = Q3 + 1.5 * IQR``. Values
      outside this interval are treated as outliers by the caller.
  """
  # quantile() does not need sorted input, so no sort is performed here
  # (the previous sorted(col) call built a list that was thrown away).
  Q1=col.quantile(0.25)
  Q3=col.quantile(0.75)
  IQR=Q3-Q1
  lower=Q1-(1.5*IQR)
  upper=Q3+(1.5*IQR)
  return lower , upper

In [10]:
def label_encoding(col):
  """Replace ``data[col]`` in place with integer codes from a fresh LabelEncoder.

  Parameters
  ----------
  col : label of the column in the notebook-level ``data`` DataFrame to encode.
  """
  encoder = preprocessing.LabelEncoder()
  codes = encoder.fit_transform(data[col])
  data[col] = codes


In [11]:
def one_hot_encoding(cols):
  """Return a copy of ``data`` with the columns in ``cols`` one-hot encoded.

  Parameters
  ----------
  cols : list of column labels to expand with ``pd.get_dummies``.

  Returns
  -------
  A new DataFrame; the notebook-level ``data`` frame is not modified.
  """
  return pd.get_dummies(data, columns=cols)

## Data Preprocessing & EDA

In [12]:
data.shape

(291451, 28)

In [13]:
# Checking informations about our features
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291451 entries, 0 to 291450
Data columns (total 28 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   CreditScore         291451 non-null  int64  
 1   FirstPaymentDate    291451 non-null  int64  
 2   FirstTimeHomebuyer  291451 non-null  object 
 3   MaturityDate        291451 non-null  int64  
 4   MSA                 291451 non-null  object 
 5   MIP                 291451 non-null  int64  
 6   Units               291451 non-null  int64  
 7   Occupancy           291451 non-null  object 
 8   OCLTV               291451 non-null  int64  
 9   DTI                 291451 non-null  int64  
 10  OrigUPB             291451 non-null  int64  
 11  LTV                 291451 non-null  int64  
 12  OrigInterestRate    291451 non-null  float64
 13  Channel             291451 non-null  object 
 14  PPM                 291451 non-null  object 
 15  ProductType         291451 non-nul

In [14]:
# Checking for missing values 
data.isnull().sum()

CreditScore               0
FirstPaymentDate          0
FirstTimeHomebuyer        0
MaturityDate              0
MSA                       0
MIP                       0
Units                     0
Occupancy                 0
OCLTV                     0
DTI                       0
OrigUPB                   0
LTV                       0
OrigInterestRate          0
Channel                   0
PPM                       0
ProductType               0
PropertyState             0
PropertyType              0
PostalCode                0
LoanSeqNum                0
LoanPurpose               0
OrigLoanTerm              0
NumBorrowers              0
SellerName            24994
ServicerName              0
EverDelinquent            0
MonthsDelinquent          0
MonthsInRepayment         0
dtype: int64

In [15]:
#Check if there's any categorical features in our data 
data.dtypes

CreditScore             int64
FirstPaymentDate        int64
FirstTimeHomebuyer     object
MaturityDate            int64
MSA                    object
MIP                     int64
Units                   int64
Occupancy              object
OCLTV                   int64
DTI                     int64
OrigUPB                 int64
LTV                     int64
OrigInterestRate      float64
Channel                object
PPM                    object
ProductType            object
PropertyState          object
PropertyType           object
PostalCode             object
LoanSeqNum             object
LoanPurpose            object
OrigLoanTerm            int64
NumBorrowers           object
SellerName             object
ServicerName           object
EverDelinquent          int64
MonthsDelinquent        int64
MonthsInRepayment       int64
dtype: object

In [16]:
# Convert date feature to datetime format
# Vectorised: the whole YYYYMM column is parsed in one call instead of
# calling pd.to_datetime once per row.
data['MaturityDate'] = pd.to_datetime(data['MaturityDate'].astype(str), format='%Y%m')

In [None]:
# Same YYYYMM -> datetime conversion, parsed in one vectorised call.
data['FirstPaymentDate'] = pd.to_datetime(data['FirstPaymentDate'].astype(str), format='%Y%m')

In [None]:
#Visualize the change on dtypes of features
data.info()

In [None]:
#View description of all of our features 
num1_cols = data.describe(include='all')
num1_cols

In [None]:
#Count number of distinct elements in axis 0.
data.nunique()

In [None]:
data["EverDelinquent"].value_counts()

In [None]:
print("Seller Name size: \n",data.SellerName.value_counts(),"\n\n")

In [None]:
#Return unique values based on a hash table.
data['FirstTimeHomebuyer'].unique()

In [None]:
# N means No
# Y means Yes
# X means Unknown value

In [None]:
data['PPM'].unique()

In [None]:
data['NumBorrowers'].unique()
# X  means unknown value

In [None]:
#draw piechart of NumBorrowers to see the distribution of its values 
label_arr=['2','1','x']
draw_piechart('NumBorrowers',label_arr)

In [None]:
#draw piechart of FirstTimeHomebuyer to see the distribution of its values 

label_arr=['NO','YES','x']
draw_piechart('FirstTimeHomebuyer',label_arr)

In [None]:
#draw piechart of PPM to see the distribution of its values 
label_arr=['NO','YES','x']
draw_piechart('PPM',label_arr)

In [None]:
data['Occupancy'].unique()

In [None]:
#draw piechart of Occupancy to see the distribution of its values 
label_arr=['O','I','S']
draw_piechart('Occupancy',label_arr)

In [None]:
data['PostalCode'].value_counts()

In [None]:
data['LoanPurpose'].unique()

In [None]:
data['LoanPurpose'].value_counts()

In [None]:
#draw piechart of LoanPurpose to see the distribution of its values 

label_arr=['P','N','C']
draw_piechart('LoanPurpose',label_arr)

In [None]:
data['Channel'].unique()

In [None]:
# Draw histogram to know the density of out categorical data: Channel
draw_histogram('Channel','Channel graph','Channel','frequency')

In [None]:
#Convert categorial feature to numerical one by label_encoding function defined earlier
label_encoding('Channel')

`X` marks an unknown value (it is neither yes nor no), so we will drop the `X` rows.

In [None]:
data['PropertyState'].unique()

In [None]:
# Draw histogram to know the density of out categorical data: PropertyState
draw_histogram('PropertyState','PropertyState graph','PropertyState','frequency')

In [None]:
#Convert it to numerical 
label_encoding('PropertyState')

In [None]:
data['PropertyType'].unique()

In [None]:
# Draw histogram to know the density of out categorical data: PropertyType

draw_histogram('PropertyType','PropertyType graph','PropertyType','frequency')

In [None]:
#Convert it to numerical 
label_encoding('PropertyType')

## Data Visualization

In [None]:
plt.figure(figsize=(30,20))
sns.countplot(x='ServicerName',data=data,palette="twilight_shifted")
plt.xlabel("ServicerName",fontsize=14)
plt.show()

In [None]:
plt.figure(figsize=(35,35))
sns.countplot(x='PropertyState',data=data,palette="autumn")
plt.xlabel("PropertyState",fontsize=14)
plt.show()

In [None]:
# Mean months in repayment for each original loan term.
plt.figure(figsize=(15,10))
# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally.
sns.barplot(x='OrigLoanTerm',y='MonthsInRepayment',data=data,palette='gist_rainbow')
# The x axis shows OrigLoanTerm (a term, not a rate).
plt.xlabel('Original Loan Term')
# No plt.legend(): nothing in this plot carries a label, so it would only warn.
plt.show()

In [None]:
# Mean months in repayment for each OCLTV value.
plt.figure(figsize=(27,10))
# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally.
sns.barplot(x='OCLTV',y='MonthsInRepayment',data=data,palette='autumn')
plt.xlabel('OCLTV')
# No plt.legend(): nothing in this plot carries a label, so it would only warn.
plt.show()

In [None]:
#convert LoanPurpose and FirstTimeHomebuyer to numerical data by get_dummies function
encoded_data = pd.get_dummies(data['LoanPurpose'], prefix='LoanPurpose')
encoded_data2 = pd.get_dummies(data['FirstTimeHomebuyer'], prefix='FirstTimeHomebuyer')
data_temp = pd.concat([data, encoded_data], axis=1)
data=pd.concat([data_temp,encoded_data2], axis =1)
data.drop('LoanPurpose', axis=1,inplace=True)
data.drop('FirstTimeHomebuyer', axis =1 , inplace=True)

## Quick Feature Engineering 

In [None]:
data.columns

In [None]:
# Eliminating useless features 
data.drop(['FirstTimeHomebuyer_X','FirstTimeHomebuyer_N','ServicerName','SellerName','NumBorrowers','PropertyType','PropertyState','ProductType','PPM','Channel','Occupancy','MSA','MaturityDate','FirstPaymentDate'], inplace=True,axis=1)


In [None]:
data.head()

In [None]:
data.isnull().sum()

In [None]:
#check nulls precentage to be more sure 
data.isnull().sum()*100/data.shape[0] 

In [None]:
data.drop(['PostalCode','LoanSeqNum'], axis=1,inplace=True)

In [None]:
data.info()
#no more categorical variables 

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
data.corr()
#draw the correlation matrix to see the relationship between variables

In [None]:
#sns.heatmap(data.corr(),cmap='coolwarm',annot=True)
#show the correlation between data's features using heat map


# Generate correlation matrix
corr_matrix = data.corr()

# Set figure size
plt.figure(figsize=(10,7))

# Create heatmap with correlation matrix
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)

# Show plot
plt.show()


In [None]:
sns.heatmap(data.corr(),cmap='rocket')

## Handling Outliers

In [None]:
sns.boxplot(data=data,palette='rainbow',orient='h')
#box plot to all features to show outliers

In [None]:
#view individual boxplots to visualise outliers

In [None]:
draw_boxplot('OrigUPB')

In [None]:
#cap outliers: compute the IQR fences for OrigUPB
# handle_outlier returns (lower, upper) in that order, so unpack accordingly;
# the clipping cell below compares against `upp` and `low`.
low,upp=handle_outlier(data['OrigUPB'])

In [None]:
data['OrigUPB']=np.where(data['OrigUPB']>upp,upp,data['OrigUPB'])
data['OrigUPB']=np.where(data['OrigUPB']<low,low,data['OrigUPB'])

In [None]:
draw_boxplot('Units')

In [None]:
# handle_outlier returns (lower, upper) in that order.
lower,upper=handle_outlier(data['Units'])

In [None]:
data['Units']=np.where(data['Units']>upper,upper,data['Units'])
data['Units']=np.where(data['Units']<lower,lower,data['Units'])

In [None]:
draw_boxplot('OrigInterestRate')

In [None]:
# handle_outlier returns (lower, upper) in that order.
lowero,uppero=handle_outlier(data['OrigInterestRate'])

In [None]:
data['OrigInterestRate']=np.where(data['OrigInterestRate']>uppero,uppero,data['OrigInterestRate'])
data['OrigInterestRate']=np.where(data['OrigInterestRate']<lowero,lowero,data['OrigInterestRate'])

In [None]:
#checking for duplicates values to drop them
duplicate=data.duplicated()
print(duplicate.sum())

In [None]:
data=data.drop_duplicates()

In [None]:
duplicate=data.duplicated()
print(duplicate.sum())

## New Data Visualisation 

In [None]:
plt.figure(figsize=(20,16))
sns.lineplot(data=data)

In [None]:
draw_join_plot(data['MonthsInRepayment'],data['OrigUPB'])

In [None]:
data['EverDelinquent'].unique()

In [None]:
data['EverDelinquent'].value_counts()

In [None]:
draw_histogram('EverDelinquent','EverDelinquent graph','answer','frequency')

In [None]:
labels=['NO','YES']
draw_piechart('EverDelinquent',labels)

In [None]:
ax = sns.distplot(data,
                  kde=False,
                  color='blue')
ax.set(xlabel='data Distribution', ylabel='Frequency')
 
plt.show()

In [None]:
import scipy.stats as stats
ax = sns.distplot(data,
                  bins=50,
                  kde=True,
                  color='red',
                  hist_kws={"linewidth": 15,'alpha':1})
ax.set(xlabel='Normal Distribution', ylabel='Frequency')
 
plt.show()

In [None]:
#calculating z score to know how far each value is from the mean (in standard deviations)
#helps you to know more about the data
from scipy import stats
# stats.zscore may hand back a bare NumPy array; wrap the result in a
# DataFrame so the column names survive and z_score.hist(...) works below.
z_score=pd.DataFrame(np.abs(stats.zscore(data)),columns=data.columns,index=data.index)
print (z_score)

In [None]:
# view Standard Normal Distribution for all features 
z_score.hist(color='slategray',figsize=(30,20))
plt.title("Standard Normal Distribution", y=1.015, fontsize=22)
plt.xlabel("z-score", labelpad=14)
plt.ylabel("frequency", labelpad=14);


## Feature Extraction with PCA

In [None]:
import plotly.express as px
from sklearn.decomposition import PCA


# Number of principal components to compute and plot against each other.
n_components = 5

pca = PCA(n_components=n_components)
components = pca.fit_transform(data)

# Share of the total variance captured by the retained components, in percent.
total_var = pca.explained_variance_ratio_.sum() * 100

labels = {str(i): f"PC {i+1}" for i in range(n_components)}
# The points are coloured by the EverDelinquent flag, so label the colour
# scale with that name.
labels['color'] = 'EverDelinquent'

fig = px.scatter_matrix(
    components,
    color=data['EverDelinquent'],
    dimensions=range(n_components),
    labels=labels,
    title=f'Total Explained Variance: {total_var:.2f}%',
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
#that make a distribution of the data become clear

In [None]:
from pandas_profiling import ProfileReport

In [None]:
#Quick Recap of EDA 
profile=ProfileReport(data)
profile


## Feature Engineering 

In [None]:
data.head()

In [None]:
import pandas as pd

def calculate_credit_range(row):
    """Bucket ``row['CreditScore']`` into a credit-quality band.

    Returns 'excellent' for scores of at least 750, 'good' for at least 700,
    'fair' for at least 650, and 'poor' for everything else.
    """
    score = row['CreditScore']
    # Thresholds are checked from the highest band down; first match wins.
    for floor, band in ((750, 'excellent'), (700, 'good'), (650, 'fair')):
        if score >= floor:
            return band
    return 'poor'
    
def calculate_ltv_range(row):
    """Bucket ``row['LTV']`` into a loan-to-value band.

    Returns 'low' below 75, 'medium' for [75, 80), 'high' for [80, 90) and
    'very high' for everything else.
    """
    ltv = row['LTV']
    # Guard clauses in ascending order: each check implies the previous
    # upper bound has already been passed.
    if ltv < 75:
        return 'low'
    if ltv < 80:
        return 'medium'
    if ltv < 90:
        return 'high'
    return 'very high'
    
def calculate_repay_range(row):
    """Bucket ``row['OrigInterestRate']`` into an interest-rate band.

    Returns 'low' below 4, 'medium' for [4, 6) and 'high' for everything else.
    """
    rate = row['OrigInterestRate']
    if rate < 4:
        return 'low'
    return 'medium' if rate < 6 else 'high'

# Derive the three categorical bands, one row at a time.
data['CreditRange'] = data.apply(calculate_credit_range, axis=1)
data['LTVRange'] = data.apply(calculate_ltv_range, axis=1)
data['RepayRange'] = data.apply(calculate_repay_range, axis=1)


# convert categorical ranges to numerical features
credit_range_dummies = pd.get_dummies(data['CreditRange'], prefix='CreditRange')
# The LTV bands are ordered, so encode them in that order explicitly.
# A LabelEncoder would number them alphabetically
# (high=0, low=1, medium=2, very high=3) and lose the ordering.
ltv_order = {'low': 0, 'medium': 1, 'high': 2, 'very high': 3}
data['LTVRange'] = data['LTVRange'].map(ltv_order)
repay_range_dummies = pd.get_dummies(data['RepayRange'], prefix='RepayRange')

# concatenate the newly created numerical features to the original data
raw_data = pd.concat([data, credit_range_dummies, repay_range_dummies], axis=1)

# drop the original categorical ranges
raw_data = raw_data.drop(['CreditRange', 'RepayRange'], axis=1)


In [None]:
raw_data.isnull().sum()

In [None]:
data.drop(["MonthsInRepayment",'MonthsDelinquent'],axis=1,inplace=True)
raw_data.drop(["MonthsInRepayment",'MonthsDelinquent'],axis=1,inplace=True)
#because they are strongly correlated with the target and cause data leakage (overfitting)

In [None]:
#Splitting data to features and label data 

In [None]:
raw_data.columns   

In [None]:
x=raw_data.drop(['EverDelinquent'], axis=1)
y=raw_data['EverDelinquent']

In [None]:
x.columns

In [None]:
from sklearn.decomposition import PCA
pca = PCA().fit(x)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()


In [None]:
#Scalling feature data 

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
stat_data = scaler.fit_transform(x)


In [None]:
x.isnull().sum()

In [None]:
plt.hist(x)
plt.show()


In [None]:
!pip install category_encoders
#to install the needed liberary 

In [None]:
x.corr()

In [None]:
x.isnull().sum()

  ## Mutual Information

In [None]:
# Select the most important features 

In [None]:
import category_encoders as ce
from sklearn.feature_selection import SelectKBest , SelectPercentile , mutual_info_classif

In [None]:
from sklearn.feature_selection import mutual_info_classif as MIC
mi_score = MIC(x,y)
print(mi_score)

In [None]:
from sklearn.feature_selection import chi2
best_features=SelectKBest(score_func=chi2,k=10)
fit=best_features.fit(x,y)
df_scores= pd.DataFrame(fit.scores_)
df_col=pd.DataFrame(x.columns)
features_score =pd.concat([df_col,df_scores], axis=1)
features_score.columns=['feature','score']
features_score.sort_values(by=['score'],ascending=False)

In [None]:
selector=SelectKBest(mutual_info_classif,k=15)
x_selected=selector.fit_transform(x,y)
cols=selector.get_support(indices=True)
selected_features = x.iloc[:,cols].columns.tolist()
selected_features

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
model=ExtraTreesClassifier()
model.fit(x,y)
print(model.feature_importances_)

In [None]:
#view feature importance

In [None]:
feat_imp=pd.Series(model.feature_importances_,index=x.columns)
feat_imp.nlargest(20).plot(kind='barh')
plt.show()

In [None]:
from sklearn.feature_selection import mutual_info_classif

def _discrete_mask(X):
    """Return a boolean array marking the integer/boolean columns of ``X`` as discrete."""
    return X.dtypes.map(
        lambda dt: pd.api.types.is_integer_dtype(dt) or pd.api.types.is_bool_dtype(dt)
    ).to_numpy()

# Kept at notebook level for any later cell that reads it.
discrete_features = _discrete_mask(x)

def make_mi_scores(X, y):
    """Rank the columns of ``X`` by mutual information with the class label ``y``.

    Parameters
    ----------
    X : DataFrame of features.
    y : class labels (here the binary EverDelinquent flag).

    Returns
    -------
    pandas Series named "MI Scores", indexed by column name and sorted from
    most to least informative.
    """
    # y is a class label, so use the classification estimator. The discrete
    # mask is derived from the X that was passed in, not from a global frame.
    mi_scores = mutual_info_classif(X, y, discrete_features=_discrete_mask(X), random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

mi_scores = make_mi_scores(x, y)
mi_scores[::] 

In [None]:
raw_data.columns

In [None]:
#Select best features after PCA

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif


# Define the number of top features to select
N = 11

# Select top N features based on mutual information.
# The target is a class label, so score with the classification estimator
# rather than the regression one.
selector = SelectKBest(score_func=mutual_info_classif, k=N)
X_selected = selector.fit_transform(x, y)

# Get the indices of the selected features
selected_feature_indices = selector.get_support(indices=True)

# Get the names of the selected features
selected_features = x.columns[selected_feature_indices].tolist()



In [None]:
df_selected = pd.DataFrame(X_selected)

df_selected.dtypes

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x= scaler.fit_transform(x)


In [None]:
pca=PCA(n_components=6)
pca.fit(x)
features=pca.transform(x)

## Modelling

In [None]:
from sklearn.model_selection import train_test_split


# separate features and labels
import pandas as pd
df_selected = pd.DataFrame(X_selected)
# features = df_selected

df_labels = pd.DataFrame(y)
labels = df_labels["EverDelinquent"]


# Split the data into training and test sets (80/20), then carve a
# validation set out of the training part (25% of it, i.e. 20% overall).
# NOTE(review): `features` comes from a scaler and a PCA that were fitted on
# the full data set before this split, so test rows influenced the
# transform. Consider fitting them on X_train only.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, 
    test_size=0.25, random_state= 8) 


print("X_train shape: {}".format(X_train.shape))
print("X_test shape: {}".format(X_test.shape))
print("y_train shape: {}".format(y_train.shape))
print("y_test shape: {}".format(y_test.shape))
# Report the validation split itself, not the train/test label shapes.
print("X_val shape: {}".format(X_val.shape))
print("y val shape: {}".format(y_val.shape))


In [None]:
features.shape

In [None]:
labels.shape

In [None]:
labels.value_counts()

In [None]:
from imblearn.over_sampling import SMOTE
oversampled = SMOTE(random_state=0)
X_train_smote, y_train_smote = oversampled.fit_resample(X_train, y_train)

In [None]:
y_train_smote.value_counts()

## Random Forest Model 

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



# Train the random forest classifier on the SMOTE-balanced training set.
# random_state is fixed so the forest (which is saved for deployment later)
# and every metric below are reproducible across runs.
# NOTE(review): n_estimators=2 is a very small forest; confirm it is intended.
rf = RandomForestClassifier(n_estimators=2, random_state=42)
rf.fit(X_train_smote, y_train_smote)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
#Testing Accuracy
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred_test = rf.predict(X_test)
y_pred_test[0:5]


testing_accuracy = accuracy_score(y_test,y_pred_test)*100
print('Testing data accuracy is:', testing_accuracy)
print()

clf_report = classification_report(y_test,y_pred_test)
print('Classification report:\n', clf_report)
print()

confusion_matrix(y_test,y_pred_test)

In [None]:
#Training Accuracy

y_pred_train = rf.predict(X_train)
y_pred_train


training_accuracy = accuracy_score(y_train,y_pred_train)*100
print('training data accuracy is:', training_accuracy)
print()

clf_report = classification_report(y_train,y_pred_train)
print('Classification report:\n', clf_report)
print()

confusion_matrix(y_train,y_pred_train)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC ranks samples by score, so feed it the predicted probability of the
# positive class (column 1) instead of the thresholded 0/1 predictions.
roc_auc_score(y_test,rf.predict_proba(X_test)[:,1])

## SVC Model (Support Vector Machine)

In [None]:
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
model=LinearSVC()
model.fit(X_train_smote, y_train_smote)
train_pred=model.predict(X_train)
y_preds=model.predict(X_test)

print(f"Training score = {metrics.accuracy_score(y_train,train_pred)}")
print(f"Testing score = {metrics.accuracy_score(y_test,y_preds)}")


In [None]:


training_accuracy = accuracy_score(y_train,train_pred)*100
print('training data accuracy is:', training_accuracy)
print()

clf_report = classification_report(y_train,train_pred)
print('Classification report:\n', clf_report)
print()

confusion_matrix(y_train,train_pred)

In [None]:
confusion_matrix(y_test,y_preds)

In [None]:
accuracy_score(y_test,y_preds)

In [None]:
from sklearn.metrics import roc_auc_score
# LinearSVC has no predict_proba; its signed distance to the decision
# boundary is the continuous score that ROC AUC needs.
roc_auc_score(y_test,model.decision_function(X_test))

SVC works well with the data, but it does not give the best accuracy.

## Decision tree classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Entropy-based decision tree, depth-limited to 10, with a fixed seed.
# NOTE(review): ccp_alpha=0.8 and min_impurity_decrease=0.6 are very large
# relative to binary entropy (at most 1.0). This presumably prunes the tree
# down to a single leaf that predicts one class for every row — verify with
# model_d.get_n_leaves() after fitting.
model_d= DecisionTreeClassifier(criterion="entropy",max_depth=10,random_state=44,ccp_alpha=0.8,min_impurity_decrease=0.6)

# Train the decision tree classifier on the SMOTE-balanced training set
model_d = model_d.fit(X_train_smote, y_train_smote)
# Predict on the original (un-resampled) training split
y_train_pred=model_d.predict(X_train)
# Predict the response for the test dataset
y_pred_test = model_d.predict(X_test)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_train,y_train_pred))

In [None]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test,y_pred_test))

In [None]:

training_accuracy = accuracy_score(y_train,y_train_pred)*100
print('training data accuracy is:', training_accuracy)
print()

clf_report2 = classification_report(y_train,y_train_pred)
print('Classification report:\n', clf_report2)
print()


In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC ranks samples by score, so feed it the predicted probability of the
# positive class (column 1) instead of the thresholded 0/1 predictions.
roc_auc_score(y_test,model_d.predict_proba(X_test)[:,1])

The decision tree achieves higher accuracy than the previous two algorithms.

We should choose the model with the highest AUC and the sharpest ROC curve on the test set, so we chose the random forest to deploy.

## Deployment


In [None]:
import joblib
joblib.dump(rf,'model.joblib')
#save the random forest model using joblib

In [None]:
# save the pca using joblib
joblib.dump(pca,'pca.joblib')

In [None]:
#save the scaler object so user input can be normalised the same way
# `dump` on its own is not defined in this notebook; joblib was imported as a module.
joblib.dump(scaler,'scaler.joblib')