### Problem Statement:

__About Company__ <br>
Dream Housing Finance company deals in all kinds of home loans. They have a presence across all urban, semi-urban and rural areas. A customer first applies for a home loan; after that, the company validates the customer's eligibility for the loan.

__Problem__ <br>
The company wants to automate the loan eligibility process (in real time) based on the customer details provided while filling in the online application form. These details are Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History and others. To automate this process, they have posed the problem of identifying the customer segments that are eligible for a loan, so that they can specifically target these customers. Here they have provided a partial data set.

#### Dataset Description:

| Variable | Description | 
|------|------|
| Loan_ID | Unique Loan ID | 
| Gender | Male/ Female | 
| Married | Applicant married (Y/N) | 
| Dependents | Number of dependents | 
| Education | Applicant Education (Graduate/ Under Graduate) | 
| Self_Employed | Self employed (Y/N) | 
| ApplicantIncome | Applicant income | 
| CoapplicantIncome | Coapplicant income | 
| LoanAmount | Loan amount in thousands | 
| Loan_Amount_Term | Term of loan in months | 
| Credit_History | credit history meets guidelines | 
| Property_Area | Urban/ Semi Urban/ Rural | 
| Loan_Status | Loan approved (Y/N) | 

#### Import dependencies and load data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns #importing seaborn module 
import warnings
import os, joblib, operator
from sklearn.model_selection import train_test_split,KFold, cross_val_score, cross_val_predict,cross_validate
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from minio import Minio
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from mlflow.tracking import MlflowClient
import mlflow, urllib3
from mlflow import pyfunc

# Notebook-wide display settings.
# NOTE(review): ignoring every warning also hides deprecation and convergence warnings
# raised by pandas / seaborn / scikit-learn in the cells below.
warnings.filterwarnings('ignore')  # suppress all warnings in the notebook output
plt.style.use('ggplot')
plt.rcParams['figure.figsize']=[6,3]
plt.rcParams['figure.dpi']=80

# Connection settings consumed via os.getenv by the MinIO client below; the MLFLOW_* / AWS_*
# names are presumably also read by MLflow itself - TODO confirm against the MLflow docs.
# NOTE(review): the access key and secret are hard-coded in source and these assignments
# overwrite any value already present in the environment - move them to a secret store /
# externally provided environment before sharing this notebook.
os.environ["BUCKET"] = "loan-prediction"
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://mip-bdcs-vm28.mip.storage.hpecorp.net:10021"
os.environ["AWS_ACCESS_KEY_ID"] = "admin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "admin123"
os.environ["MLFLOW_TRACKING_URI"] = "https://mip-bdcs-vm28.mip.storage.hpecorp.net:10022"
# NOTE(review): both flags below are set to "true"; judging by their names they disable TLS
# certificate verification - confirm this is acceptable for the target environment.
os.environ["MLFLOW_TRACKING_INSECURE_TLS"] = "true"
os.environ["MLFLOW_S3_IGNORE_TLS"] = "true"

In [None]:
# Conda environment that is passed to every mlflow.pyfunc.log_model call below;
# 'dill' is appended to MLflow's default scikit-learn dependency list.
conda_env = mlflow.sklearn.get_default_conda_env()
conda_env['dependencies'] += ['dill']

# Select the MLflow experiment and remember its id for the training runs.
EXPERIMENT_NAME = "Loan-Approval-Prediction"
mlflow.set_experiment(EXPERIMENT_NAME)
EXPERIMENT_ID = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

# MinIO client for the object store that holds the CSV files.
# NOTE(review): cert_reqs='CERT_NONE' disables TLS certificate verification.
client = Minio(
    endpoint=os.getenv("MLFLOW_S3_ENDPOINT_URL").replace("https://",""),
    access_key=os.getenv("AWS_ACCESS_KEY_ID"),
    secret_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    secure=True,
    http_client = urllib3.PoolManager(cert_reqs='CERT_NONE')
)


def _read_csv_from_bucket(object_name):
    """Download ``object_name`` from the configured bucket and parse it as CSV.

    The HTTP response returned by ``client.get_object`` is always closed and its
    connection released back to the pool, even when parsing fails.
    """
    response = client.get_object(os.getenv("BUCKET"), object_name)
    try:
        return pd.read_csv(response)
    finally:
        response.close()
        response.release_conn()


train_df = _read_csv_from_bucket("train.csv")
test_df = _read_csv_from_bucket("test.csv")


In [None]:
# Pre-processing
# Loan_ID is removed from both frames before modelling.
for frame in (train_df, test_df):
    frame.drop(columns=['Loan_ID'], inplace=True)

# Encode the target as an integer: 'N' -> 0, 'Y' -> 1
target_codes = {'N': 0, 'Y': 1}
train_df['Loan_Status'] = train_df['Loan_Status'].map(target_codes)

#### Let's explore the data

In [None]:
# Print the column dtypes and non-null counts of the training frame
train_df.info()

In [None]:
# Summary statistics of the training frame
train_df.describe()

In [None]:
# Show five randomly drawn training rows (no seed is set, so the rows differ between runs)
train_df.sample(5)

Check distribution of target variable

In [None]:
# Number of training rows per Loan_Status value (0 = 'N', 1 = 'Y' after the mapping above)
train_df.Loan_Status.value_counts()

**Missing data**

Let's see here how much data is missing. We will have to fill the missing features later on.

In [None]:
def get_missing_data(df):
    """Summarise missing values per column of ``df``.

    Returns a DataFrame indexed by column name with two columns:
    ``'Total'`` (number of null cells) and ``'%'`` (null share of all rows,
    rounded to one decimal), each sorted in descending order before joining.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    null_share = null_counts / null_mask.count() * 100
    summary_parts = [
        null_counts.sort_values(ascending=False),
        round(null_share, 1).sort_values(ascending=False),
    ]
    return pd.concat(summary_parts, axis=1, keys=['Total', '%'])

##### Training Data

In [None]:
# Missing-value summary of the training frame before imputation
get_missing_data(train_df)

##### Testing Data

In [None]:
# Missing-value summary of the test frame before imputation
get_missing_data(test_df)

### EDA

In [None]:
# Univariate analysis: one count plot per categorical column on a 2x4 grid
fig, ax = plt.subplots(2, 4, figsize=(16, 10))
count_columns = [
    'Loan_Status', 'Gender', 'Married', 'Education',
    'Self_Employed', 'Property_Area', 'Credit_History', 'Dependents',
]
# ax.flat walks the grid row by row, matching the original ax[0][0] ... ax[1][3] order.
# The column is passed as x= because newer seaborn releases only accept `data` positionally.
for axis, column in zip(ax.flat, count_columns):
    sns.countplot(x=column, data=train_df, ax=axis)

In [None]:
# Explore 'Married' against the target variable 'Loan_Status'

f, ax = plt.subplots(1, 2, figsize=(8, 6))
# Left panel: share of each Loan_Status value as a pie chart
train_df['Loan_Status'].value_counts().plot.pie(ax=ax[0], explode=[0, 0.1], shadow=True, autopct='%1.1f%%')
ax[0].set_title('Loan_Status', fontsize=30)
ax[0].set_ylabel('Count')
sns.set(font="Verdana")
sns.set_style("ticks")
# Right panel: Loan_Status counts split by marital status.
# The column is passed as x= because newer seaborn releases only accept `data` positionally.
sns.countplot(x='Loan_Status', hue='Married', linewidth=2.5, edgecolor=".2", data=train_df, ax=ax[1])
# NOTE(review): ioff() only switches matplotlib's interactive mode off - confirm plt.show() was not intended.
plt.ioff()

In [None]:
# Point plot: 'Married' on the x axis against 'Loan_Status' (0/1) on the y axis
sns.catplot(
    data=train_df,
    x='Married',
    y='Loan_Status',
    kind='point',
)

# Observation: married applicants are approved more often (71.8%) than unmarried applicants (62.9%).

In [None]:
# Bar plot: 'Gender' vs 'Loan_Status', coloured by 'Education', one panel per 'Property_Area'
sns.catplot(
    data=train_df,
    x='Gender',
    y='Loan_Status',
    hue='Education',
    col='Property_Area',
    kind='bar',
)

In [None]:
# 'Education' against 'Loan_Status'
f, ax = plt.subplots(1, 2, figsize=(10, 8))
# Left panel: share of each Education value as a pie chart
train_df['Education'].value_counts().plot.pie(ax=ax[0], explode=[0, 0.1], shadow=True, autopct='%1.1f%%')
ax[0].set_title('Education', fontsize=30)
ax[0].set_ylabel('Count')
sns.set(font="Verdana")
sns.set_style("ticks")
# Right panel: Education counts split by loan outcome.
# The column is passed as x= because newer seaborn releases only accept `data` positionally.
sns.countplot(x='Education', hue='Loan_Status', linewidth=2.5, edgecolor=".2", data=train_df, ax=ax[1])
# NOTE(review): ioff() only switches matplotlib's interactive mode off - confirm plt.show() was not intended.
plt.ioff()

In [None]:
# Bar plot: 'Education' vs 'Loan_Status', coloured by 'Property_Area', one panel per 'Married' value
sns.catplot(
    data=train_df,
    x='Education',
    y='Loan_Status',
    hue='Property_Area',
    col='Married',
    kind='bar',
)

# Observation: urban + not graduate + not married applicants have a lower approval rate.

In [None]:
# 'Property_Area' against 'Loan_Status'
f, ax = plt.subplots(1, 2, figsize=(8, 6))
# Left panel: share of each Property_Area value as a pie chart
train_df['Property_Area'].value_counts().plot.pie(ax=ax[0], explode=[0, 0, 0.1], shadow=True, autopct='%1.1f%%')
ax[0].set_title('Property_Area', fontsize=30)
ax[0].set_ylabel('Count')
sns.set(font="Verdana")
sns.set_style("ticks")
# Right panel: Property_Area counts split by loan outcome.
# The column is passed as x= because newer seaborn releases only accept `data` positionally.
sns.countplot(x='Property_Area', hue='Loan_Status', linewidth=2.5, edgecolor=".2", data=train_df, ax=ax[1])
# NOTE(review): ioff() only switches matplotlib's interactive mode off - confirm plt.show() was not intended.
plt.ioff()

In [None]:
# Bar plot: 'Property_Area' vs 'Loan_Status'
sns.catplot(
    data=train_df,
    x='Property_Area',
    y='Loan_Status',
    kind='bar',
)

# Observation: semi-urban applicants are approved more often (76.8%) than urban (65.8%) and rural (61.5%) applicants.

In [None]:
# 'Gender' against 'Loan_Status'
f, ax = plt.subplots(1, 2, figsize=(10, 6))
# Left panel: share of each Gender value as a pie chart
train_df['Gender'].value_counts().plot.pie(ax=ax[0], explode=[0, 0.1], shadow=True, autopct='%1.1f%%')
ax[0].set_title('Gender', fontsize=30)
ax[0].set_ylabel('Count')
sns.set(font="Verdana")
sns.set_style("ticks")
# Right panel: Gender counts split by loan outcome.
# The column is passed as x= because newer seaborn releases only accept `data` positionally.
sns.countplot(x='Gender', hue='Loan_Status', linewidth=2.5, edgecolor=".2", data=train_df, ax=ax[1])
# NOTE(review): ioff() only switches matplotlib's interactive mode off - confirm plt.show() was not intended.
plt.ioff()

In [None]:
# Boxen plot: 'ApplicantIncome' by 'Gender', coloured by 'Loan_Status', one panel per 'Property_Area'
sns.catplot(
    data=train_df,
    x='Gender',
    y='ApplicantIncome',
    hue='Loan_Status',
    col='Property_Area',
    kind='boxen',
)

In [None]:
# Box plot: 'CoapplicantIncome' by 'Gender'
sns.catplot(
    data=train_df,
    x='Gender',
    y='CoapplicantIncome',
    kind='box',
)

# Observation: CoapplicantIncome of male applicants is slightly higher than that of female applicants.
# NOTE(review): a box plot shows medians and quartiles rather than means.

In [None]:
# Boxen plot: 'CoapplicantIncome' by 'Gender', coloured by 'Loan_Status', one panel per 'Property_Area'
sns.catplot(
    data=train_df,
    x='Gender',
    y='CoapplicantIncome',
    hue='Loan_Status',
    col='Property_Area',
    kind='boxen',
)

# Observation: male applicants have a higher co-applicant income than female applicants in all three property areas.

In [None]:
# 'Dependents' against 'Loan_Status'
f, ax = plt.subplots(1, 2, figsize=(8, 6))
# Left panel: share of each Dependents value as a pie chart
train_df['Dependents'].value_counts().plot.pie(ax=ax[0], shadow=True, autopct='%1.1f%%')
ax[0].set_title('Dependents', fontsize=30)
ax[0].set_ylabel('Count')
sns.set(font="Verdana")
sns.set_style("ticks")
# Right panel: Dependents counts split by loan outcome.
# The column is passed as x= because newer seaborn releases only accept `data` positionally.
sns.countplot(x='Dependents', hue='Loan_Status', linewidth=2.5, edgecolor=".2", data=train_df, ax=ax[1])
# NOTE(review): ioff() only switches matplotlib's interactive mode off - confirm plt.show() was not intended.
plt.ioff()

In [None]:
# 'Credit_History' against 'Loan_Status'
f, ax = plt.subplots(1, 2, figsize=(8, 6))
# Left panel: share of each Credit_History value as a pie chart
train_df['Credit_History'].value_counts().plot.pie(ax=ax[0], shadow=True, autopct='%1.1f%%')
ax[0].set_title('Credit_History', fontsize=30)
ax[0].set_ylabel('Count')
sns.set(font="Verdana")
sns.set_style("ticks")
# Right panel: Credit_History counts split by loan outcome.
# The column is passed as x= because newer seaborn releases only accept `data` positionally.
sns.countplot(x='Credit_History', hue='Loan_Status', linewidth=2.5, edgecolor=".2", data=train_df, ax=ax[1])
# NOTE(review): ioff() only switches matplotlib's interactive mode off - confirm plt.show() was not intended.
plt.ioff()

In [None]:
# Bar plot: 'Credit_History' vs 'Loan_Status'
sns.catplot(
    data=train_df,
    x='Credit_History',
    y='Loan_Status',
    kind='bar',
)

# Observation: with Credit_History = 1 the approval rate is high (79.6%), versus 7.9% for Credit_History = 0.

In [None]:
# Box plot: 'LoanAmount' by 'Gender'
sns.catplot(
    data=train_df,
    x='Gender',
    y='LoanAmount',
    kind='box',
)

# Observation: LoanAmount of male applicants is slightly higher than that of female applicants.
# NOTE(review): a box plot shows medians and quartiles rather than means.

In [None]:
# Box plot: 'LoanAmount' by 'Self_Employed'
sns.catplot(
    data=train_df,
    x='Self_Employed',
    y='LoanAmount',
    kind='box',
)

# Observation: self-employed applicants request higher loan amounts.

In [None]:
# Box plot: 'LoanAmount' by 'Gender', coloured by 'Loan_Status', one panel per 'Married' value
sns.catplot(
    data=train_df,
    x='Gender',
    y='LoanAmount',
    hue='Loan_Status',
    col='Married',
    kind='box',
)

# Observation: married applicants request slightly higher loan amounts than unmarried applicants.

In [None]:
# 'Loan_Amount_Term' against 'Loan_Status'
# The column is passed as x= because newer seaborn releases only accept `data` positionally.
sns.countplot(x='Loan_Amount_Term', hue='Loan_Status', linewidth=2.5, edgecolor=".2", data=train_df)

# Observation: most customers chose a 360-month term.

### DATA PROCESSING

#### Imputing missing values

In [None]:
#Column - Married. Fill null values with the most frequent value of each frame's Married column.
# The filled Series is assigned back instead of calling fillna(inplace=True) on a column
# selection, which is not guaranteed to modify the parent frame (pandas Copy-on-Write).
train_df['Married'] = train_df['Married'].fillna(train_df['Married'].value_counts().index[0])
test_df['Married'] = test_df['Married'].fillna(test_df['Married'].value_counts().index[0])

#Column - Dependents.
#If loan status is 1 then, dependent = 2 else dependent=1 for train
# NOTE(review): this imputes a feature from the target (Loan_Status), which leaks the label
# into the training features and cannot be reproduced at prediction time.
train_df.loc[(train_df.Dependents.isnull())&(train_df.Loan_Status==1),'Dependents'] = '2'
train_df.loc[(train_df.Dependents.isnull()),'Dependents'] = '1'
#If Credit_History is 1 then, dependent = 2 else dependent=1 for test
test_df.loc[(test_df.Dependents.isnull())&(test_df.Credit_History==1),'Dependents'] = '2'
test_df.loc[(test_df.Dependents.isnull()),'Dependents'] = '1'

#Column - Credit_History
# If loan status is 1 then, Credit_History = 1 else Credit_History=0
# NOTE(review): same target leakage as for Dependents; the test frame uses a different rule (mode).
train_df.loc[(train_df.Credit_History.isnull())&(train_df.Loan_Status==1),'Credit_History'] = 1
train_df.loc[(train_df.Credit_History.isnull()),'Credit_History'] = 0
# Fill null values with the most frequent Credit_History value for test
test_df['Credit_History'] = test_df['Credit_History'].fillna(test_df['Credit_History'].value_counts().index[0])
#In test data, for the user with income = 2733, it was decided to impute credit history as 0 based upon the Income to loan ratio
# NOTE(review): this overwrites Credit_History for every test row with ApplicantIncome == 2733,
# whether or not the value was missing.
test_df.loc[(test_df.ApplicantIncome == 2733),'Credit_History']  = 0

#Column - Gender
# Fill null values with the most frequent value of the Gender column
train_df['Gender'] = train_df['Gender'].fillna(train_df['Gender'].value_counts().index[0])
test_df['Gender'] = test_df['Gender'].fillna(test_df['Gender'].value_counts().index[0])

#Column - Self_Employed
# If Credit_History is 1 then, impute Self_Employed = No else Yes
train_df.loc[(train_df.Self_Employed.isnull())&(train_df.Credit_History==1),'Self_Employed'] ='No'
train_df.loc[(train_df.Self_Employed.isnull()),'Self_Employed'] = 'Yes'
# If Credit_History is 1 then, impute Self_Employed = No else Yes for test
test_df.loc[(test_df.Self_Employed.isnull())&(test_df.Credit_History==1),'Self_Employed'] ='No'
test_df.loc[(test_df.Self_Employed.isnull()),'Self_Employed'] = 'Yes'

#Column - Loan_Amount_Term.
# A missing term is replaced by the median term of the rows sharing the same Gender, Married,
# Education, Self_Employed and Dependents values; when no such row has a known term, the
# median of the whole Loan_Amount_Term column is used instead. Train and test are imputed
# independently, each from its own rows.
loan_term_keys = ['Gender', 'Married', 'Education', 'Self_Employed', 'Dependents']

for frame in (train_df, test_df):
    # Index labels of the rows whose term is missing; all access below is label-based (.loc)
    # so it stays correct even if the frame does not have a default 0..n-1 index.
    missing_labels = list(frame.loc[frame['Loan_Amount_Term'].isnull()].index)

    for label in missing_labels:
        # Recomputed on every pass: values filled earlier in this loop take part in the median.
        Loan_Amount_Term_med = frame['Loan_Amount_Term'].median()

        # Rows that match the current row on every grouping column
        same_profile = pd.Series(True, index=frame.index)
        for key in loan_term_keys:
            same_profile &= frame[key] == frame.loc[label, key]
        Loan_Amount_Term_pred = frame.loc[same_profile, 'Loan_Amount_Term'].median()

        if np.isnan(Loan_Amount_Term_pred):
            Loan_Amount_Term_pred = Loan_Amount_Term_med
        # Single .loc assignment writes into the frame itself (no chained df[col].iloc[i] = ...).
        frame.loc[label, 'Loan_Amount_Term'] = Loan_Amount_Term_pred

In [None]:
# Variable - LoanAmount - identify on which other columns LoanAmount depends
# Box plots of LoanAmount against the categorical variables. sns.catplot is used (as in the
# rest of this notebook) because factorplot is only a deprecated alias for it.
loan_amount_views = [
    {"x": "Credit_History", "hue": "Gender"},
    {"x": "Property_Area", "hue": "Dependents"},
    {"x": "Gender", "hue": "Married"},
    {"x": "Married"},
    {"x": "Education"},
    {"x": "Self_Employed"},
    {"x": "Dependents"},
]
for view in loan_amount_views:
    g = sns.catplot(y="LoanAmount", data=train_df, kind="box", **view)

In [None]:
#Column - LoanAmount.
# A missing amount is replaced by the median LoanAmount of the rows sharing the same
# Property_Area, Gender, Married, Education, Self_Employed and Dependents values; when no
# such row has a known amount, the median of the whole LoanAmount column is used instead.
# Train and test are imputed independently, each from its own rows.
loan_amount_keys = ['Property_Area', 'Gender', 'Married', 'Education', 'Self_Employed', 'Dependents']

for frame in (train_df, test_df):
    # Index labels of the rows whose amount is missing; all access below is label-based (.loc)
    # so it stays correct even if the frame does not have a default 0..n-1 index.
    missing_labels = list(frame.loc[frame['LoanAmount'].isnull()].index)

    for label in missing_labels:
        # Recomputed on every pass: values filled earlier in this loop take part in the median.
        LoanAmount_med = frame['LoanAmount'].median()

        # Rows that match the current row on every grouping column
        same_profile = pd.Series(True, index=frame.index)
        for key in loan_amount_keys:
            same_profile &= frame[key] == frame.loc[label, key]
        LoanAmount_pred = frame.loc[same_profile, 'LoanAmount'].median()

        if np.isnan(LoanAmount_pred):
            LoanAmount_pred = LoanAmount_med
        # Single .loc assignment writes into the frame itself (no chained df[col].iloc[i] = ...).
        frame.loc[label, 'LoanAmount'] = LoanAmount_pred

##### Check for any missing values

##### Training Data

In [None]:
# Missing-value summary of the training frame after imputation
get_missing_data(train_df)

##### Testing Data

In [None]:
# Missing-value summary of the test frame after imputation
get_missing_data(test_df)

#### Label encode categorical variables

In [None]:
# Integer codes for the two categorical columns; values missing from a mapping become NaN.
dependents_codes = {"0": 0, "1": 1, "2": 2, "3+": 3}
property_area_codes = {"Rural": 0, "Semiurban": 1, "Urban": 2}

# Apply the same encoding to the training and the test frame.
for frame in (train_df, test_df):
    frame["Dependents"] = frame["Dependents"].map(dependents_codes)
    frame["Property_Area"] = frame["Property_Area"].map(property_area_codes)

#### Since LoanAmount is in thousands, let's multiply the LoanAmount column by 1000

In [None]:
# Convert LoanAmount from thousands to units in both frames
for frame in (train_df, test_df):
    frame['LoanAmount'] = frame['LoanAmount'].mul(1000)

In [None]:
# Correlation heatmap of the numeric training columns
# Only numeric columns are correlated: the frame still holds string columns at this point
# (they are one-hot encoded later), and DataFrame.corr() in pandas >= 2.0 raises on them
# instead of silently dropping them.
numeric_corr = train_df.select_dtypes(include=[np.number]).corr()
sns.heatmap(numeric_corr, annot=True, cmap='RdYlGn', linewidths=0.2, annot_kws={'size': 12})
fig = plt.gcf()
fig.set_size_inches(10, 6)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.show()

In [None]:
# Stack the training rows on top of the test rows under a fresh 0..n-1 index
frames_to_stack = [train_df, test_df]
combine_set = pd.concat(frames_to_stack, ignore_index=True)
combine_set.shape

### FEATURE ENGINEERING

In [None]:
# NOTE(review): every group statistic and bin edge below is computed on the combined
# train + test frame, so test rows influence the training features.

# Total income = applicant income + co-applicant income
combine_set['Total_Income'] = combine_set['ApplicantIncome'] + combine_set['CoapplicantIncome']

# The two separate income columns are dropped once Total_Income exists
combine_set.drop(['ApplicantIncome','CoapplicantIncome'],axis=1,inplace=True)

# Per-category sums of numerical columns
combine_set['Credit_History_Income_Sum']=combine_set.groupby(['Credit_History'])['Total_Income'].transform('sum')
combine_set['Dependents_LoanAmount_Sum']=combine_set.groupby(['Dependents'])['LoanAmount'].transform('sum')

# EMI (equated monthly instalment)
# Assumed annual interest rate of 10%, i.e. monthly rate r = (10/12)/100 = 0.00833.
# Standard amortisation formula with principal P and n monthly payments:
#     EMI = P * r * (1 + r)**n / ((1 + r)**n - 1)
# The "- 1" must be subtracted from the power, not from the exponent: with (1 + r)**(n - 1)
# in the denominator the term cancels out and EMI collapses to P * r * (1 + r).
r = 0.00833
growth = (1 + r) ** combine_set['Loan_Amount_Term']
combine_set['EMI'] = combine_set['LoanAmount'] * r * growth / (growth - 1)

# Per-category mean of EMI
combine_set['Dependents_EMI_mean']=combine_set.groupby(['Dependents'])['EMI'].transform('mean')

# LoanAmount_per_Total_Income
combine_set['LoanAmount_per_Total_Income']=combine_set['LoanAmount']/combine_set['Total_Income']

# Loan_Amount_Term_per_Total_Income
combine_set['Loan_Amount_Term_per_Total_Income']=combine_set['Loan_Amount_Term']/combine_set['Total_Income']

# EMI_per_Loan_Amount_Term
combine_set['EMI_per_Loan_Amount_Term']=combine_set['EMI']/combine_set['Loan_Amount_Term']

# EMI_per_LoanAmount
combine_set['EMI_per_LoanAmount']=combine_set['EMI']/combine_set['LoanAmount']

# Per-category mean of LoanAmount_per_Total_Income
combine_set['Property_Area_LoanAmount_per_Total_Income_mean']=combine_set.groupby(['Property_Area'])['LoanAmount_per_Total_Income'].transform('mean')


################################# Bin formation ###############################################
# Each column is cut into 5 quantile bins encoded as ordinal bin numbers.

Loan_Amount_Term_discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
combine_set['Loan_Amount_Term_Bins'] = Loan_Amount_Term_discretizer.fit_transform(combine_set['Loan_Amount_Term'].values.reshape(-1,1)).astype(float)

Total_Income_discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
combine_set['Total_Income_Bins'] = Total_Income_discretizer.fit_transform(combine_set['Total_Income'].values.reshape(-1,1)).astype(float)

LoanAmount_per_Total_Income_discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
combine_set['LoanAmount_per_Total_Income_Bins'] = LoanAmount_per_Total_Income_discretizer.fit_transform(combine_set['LoanAmount_per_Total_Income'].values.reshape(-1,1)).astype(float)

In [None]:
# Remove the columns that were only needed to derive other features
helper_columns = ['EMI', 'Total_Income', 'LoanAmount_per_Total_Income']
combine_set = combine_set.drop(columns=helper_columns)

In [None]:
# Display the combined frame with the engineered features
combine_set

In [None]:
# Split the combined frame again: rows with a Loan_Status value form the training set,
# rows whose Loan_Status is null form the test set.
has_label = combine_set['Loan_Status'].notnull()
train_df = combine_set[has_label]
test_df = combine_set[~has_label]

In [None]:
# One-hot encode the remaining categorical columns using get_dummies().
# Both frames are encoded in one call and split again by position: get_dummies derives the
# dummy columns (and the category dropped by drop_first) from the values it sees, so encoding
# train and test separately can produce different columns when a category is absent from one.
n_train_rows = len(train_df)
encoded_frames = pd.get_dummies(pd.concat([train_df, test_df]), drop_first=True)
train_df = encoded_frames.iloc[:n_train_rows]
test_df = encoded_frames.iloc[n_train_rows:]

In [None]:
# Print the column dtypes and non-null counts of the training frame after encoding
train_df.info()

In [None]:
# Separate the target (y) from the features (X) of the labelled rows
target_column = 'Loan_Status'
y = train_df[target_column]
X = train_df.drop(columns=[target_column])

# The unlabelled test rows only carry an empty target column, which is removed as well
X_main_test = test_df.drop(columns=[target_column])

# 70/30 hold-out split of the labelled rows with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

In [None]:
def eval_metrics(actual, pred):
    """Return ``(rmse, mae, r2)`` for predictions ``pred`` against ``actual``.

    rmse is the square root of the mean squared error, mae the mean absolute
    error and r2 the R^2 score, all computed with scikit-learn's metric helpers.
    """
    mean_sq_error = mean_squared_error(actual, pred)
    return (
        np.sqrt(mean_sq_error),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

### Custom class for prediction and probability 

In [None]:
class SklearnModelWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper that returns both the predicted label and a probability.

    ``model`` must provide ``predict`` and ``predict_proba``.
    """

    def __init__(self, model):
        # Fitted estimator used for scoring
        self.model = model

    def predict(self, context, model_input):
        """Score ``model_input`` and return one dict per input row.

        Each dict has the keys ``"probability"`` (column 1 of ``predict_proba``;
        presumably the approval class for the 0/1 target used in this notebook)
        and ``"Loan_Status"`` (the label returned by ``predict``).
        ``context`` is not used.
        """
        probability = self.model.predict_proba(model_input)[:, 1]
        status = self.model.predict(model_input)

        return [
            {"probability": row_probability, "Loan_Status": row_status}
            for row_probability, row_status in zip(probability, status)
        ]

### ML

#### Random Forest

In [None]:
# Train a random forest on the training split, evaluate it on the held-out split and log
# the metrics plus the wrapped model to MLflow.
with mlflow.start_run(run_name='Random Forest Classifier', experiment_id=EXPERIMENT_ID):
    random_forest = RandomForestClassifier(n_estimators=100, max_depth=3, min_samples_leaf = 10)
    random_forest.fit(X_train, y_train)

    predicted_qualities = random_forest.predict(X_test)

    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)
    # Accuracy (in percent) is measured on the held-out split only: scoring on the full X, y
    # would include the training rows and inflate the metric used to rank the runs.
    accuracy = round(accuracy_score(y_test, predicted_qualities) * 100, 2)
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)
    print("  Accuracy: %s" % accuracy)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("accuracy", accuracy)

    # Log the estimator behind the custom pyfunc wrapper
    wrappedModel = SklearnModelWrapper(random_forest)
    mlflow.pyfunc.log_model("model", python_model=wrappedModel, conda_env=conda_env)

#### Logistic Regression

In [None]:
# Train a logistic regression on the training split, evaluate it on the held-out split and
# log the metrics plus the wrapped model to MLflow.
with mlflow.start_run(run_name='Logistic Regression', experiment_id=EXPERIMENT_ID):
    logreg = LogisticRegression(solver='lbfgs', max_iter=110)
    logreg.fit(X_train, y_train)

    predicted_qualities = logreg.predict(X_test)

    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)
    # Accuracy (in percent) is measured on the held-out split only: scoring on the full X, y
    # would include the training rows and inflate the metric used to rank the runs.
    accuracy = round(accuracy_score(y_test, predicted_qualities) * 100, 2)
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)
    print("  Accuracy: %s" % accuracy)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("accuracy", accuracy)

    # Log the estimator behind the custom pyfunc wrapper
    wrappedModel = SklearnModelWrapper(logreg)
    mlflow.pyfunc.log_model("model", python_model=wrappedModel, conda_env=conda_env)

#### Gaussian Naive Bayes

In [None]:
# Train a Gaussian naive Bayes model on the training split, evaluate it on the held-out
# split and log the metrics plus the wrapped model to MLflow.
with mlflow.start_run(run_name='Gaussian Naive Bayes', experiment_id=EXPERIMENT_ID):
    gaussian = GaussianNB()
    gaussian.fit(X_train, y_train)

    predicted_qualities = gaussian.predict(X_test)

    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)
    # Accuracy (in percent) is measured on the held-out split only: scoring on the full X, y
    # would include the training rows and inflate the metric used to rank the runs.
    accuracy = round(accuracy_score(y_test, predicted_qualities) * 100, 2)
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)
    print("  Accuracy: %s" % accuracy)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("accuracy", accuracy)
    # Log the estimator behind the custom pyfunc wrapper
    wrappedModel = SklearnModelWrapper(gaussian)
    mlflow.pyfunc.log_model("model", python_model=wrappedModel, conda_env=conda_env)

#### SVM

In [None]:
# Train a support vector classifier on the training split, evaluate it on the held-out split
# and log the metrics plus the wrapped model to MLflow.
# NOTE(review): the variable is called linear_svc but SVC() is created without a kernel
# argument, so it uses the library's default kernel rather than an explicitly linear one.
with mlflow.start_run(run_name='SVM', experiment_id=EXPERIMENT_ID):
    # probability=True is required because the wrapper calls predict_proba
    linear_svc = SVC(gamma='auto',probability=True)
    linear_svc.fit(X_train, y_train)

    predicted_qualities = linear_svc.predict(X_test)

    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)
    # Accuracy (in percent) is measured on the held-out split only: scoring on the full X, y
    # would include the training rows and inflate the metric used to rank the runs.
    accuracy = round(accuracy_score(y_test, predicted_qualities) * 100, 2)
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)
    print("  Accuracy: %s" % accuracy)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("accuracy", accuracy)
    # Log the estimator behind the custom pyfunc wrapper
    wrappedModel = SklearnModelWrapper(linear_svc)
    mlflow.pyfunc.log_model("model", python_model=wrappedModel, conda_env=conda_env)

#### Decision Tree

In [None]:
# Train a decision tree on the training split, evaluate it on the held-out split and log
# the metrics plus the wrapped model to MLflow.
with mlflow.start_run(run_name='Decision Tree', experiment_id=EXPERIMENT_ID):
    decision_tree = DecisionTreeClassifier(criterion="entropy", max_depth=5)
    decision_tree.fit(X_train, y_train)

    predicted_qualities = decision_tree.predict(X_test)

    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)
    # Accuracy (in percent) is measured on the held-out split only: scoring on the full X, y
    # would include the training rows and inflate the metric used to rank the runs.
    accuracy = round(accuracy_score(y_test, predicted_qualities) * 100, 2)
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)
    print("  Accuracy: %s" % accuracy)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("accuracy", accuracy)
    # Log the estimator behind the custom pyfunc wrapper
    wrappedModel = SklearnModelWrapper(decision_tree)
    mlflow.pyfunc.log_model("model", python_model=wrappedModel, conda_env=conda_env)

#### Linear Discriminant Analysis

In [None]:
# Train a linear discriminant analysis model on the training split, evaluate it on the
# held-out split and log the metrics plus the wrapped model to MLflow.
with mlflow.start_run(run_name='Linear Discriminant Analysis', experiment_id=EXPERIMENT_ID):
    lda = LinearDiscriminantAnalysis()
    lda.fit(X_train, y_train)

    predicted_qualities = lda.predict(X_test)

    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)
    # Accuracy (in percent) is measured on the held-out split only: scoring on the full X, y
    # would include the training rows and inflate the metric used to rank the runs.
    accuracy = round(accuracy_score(y_test, predicted_qualities) * 100, 2)
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)
    print("  Accuracy: %s" % accuracy)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("accuracy", accuracy)
    # Log the estimator behind the custom pyfunc wrapper
    wrappedModel = SklearnModelWrapper(lda)
    mlflow.pyfunc.log_model("model", python_model=wrappedModel, conda_env=conda_env)

#### KNN

In [None]:
# Train a 5-nearest-neighbours classifier on the training split, evaluate it on the held-out
# split and log the metrics plus the wrapped model to MLflow.
with mlflow.start_run(run_name='KNN', experiment_id=EXPERIMENT_ID):
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)

    predicted_qualities = knn.predict(X_test)

    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)
    # Accuracy (in percent) is measured on the held-out split only: scoring on the full X, y
    # would include the training rows and inflate the metric used to rank the runs.
    accuracy = round(accuracy_score(y_test, predicted_qualities) * 100, 2)
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)
    print("  Accuracy: %s" % accuracy)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("accuracy", accuracy)
    # Log the estimator behind the custom pyfunc wrapper
    wrappedModel = SklearnModelWrapper(knn)
    mlflow.pyfunc.log_model("model", python_model=wrappedModel, conda_env=conda_env)

#### AdaBoost

In [None]:
# Train an AdaBoost classifier on the training split, evaluate it on the held-out split and
# log the metrics plus the wrapped model to MLflow.
with mlflow.start_run(run_name='AdaBoost', experiment_id=EXPERIMENT_ID):
    adaboost = AdaBoostClassifier()
    adaboost.fit(X_train, y_train)

    predicted_qualities = adaboost.predict(X_test)

    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)
    # Accuracy (in percent) is measured on the held-out split only: scoring on the full X, y
    # would include the training rows and inflate the metric used to rank the runs.
    accuracy = round(accuracy_score(y_test, predicted_qualities) * 100, 2)
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)
    print("  Accuracy: %s" % accuracy)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("accuracy", accuracy)
    # Log the estimator behind the custom pyfunc wrapper
    wrappedModel = SklearnModelWrapper(adaboost)
    mlflow.pyfunc.log_model("model", python_model=wrappedModel, conda_env=conda_env)

## Results

In [None]:
# Fetch the single run of this experiment with the highest logged accuracy
best_run_df = mlflow.search_runs(
    experiment_ids=[EXPERIMENT_ID],
    order_by=['metrics.accuracy DESC'],
    max_results=1,
)
best_run_id = best_run_df.at[0, 'run_id']
best_run = mlflow.get_run(best_run_id)

# Load the model artifact that the run logged under "model"
best_model_uri = f"{best_run.info.artifact_uri}/model"
best_model = pyfunc.load_model(best_model_uri)

# Report which run was selected
best_accuracy = best_run.data.metrics['accuracy']
print("Best run info:")
print(f"Run id: {best_run.info.run_id}")
print("Run Accuracy:  = {:.4f}\n".format(best_accuracy))
print(f"Run model URI: {best_model_uri}")