In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import pickle, joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import Counter 
from imblearn.over_sampling import SMOTE


Exploraory data analysis 

In [4]:
df=pd.read_csv("data.csv")

In [6]:
df.head()

# df[["Loan_Amount_Term"]].value_counts()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [7]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [8]:
df['Gender'].dtype

dtype('O')

In [9]:
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [10]:
df=df.dropna(subset=["Married"])
# since the married column have only just 3 nan value

In [11]:
df.isna().sum()

Loan_ID               0
Gender               13
Married               0
Dependents           12
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           21
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [12]:
# filling na values with mean value in loanAmount column
mean=np.mean(df["LoanAmount"])
df["LoanAmount"]=df["LoanAmount"].fillna(mean)

In [13]:
# filling na values with mode value in LoanAmount column
mode=df["Loan_Amount_Term"].mode()
df["Loan_Amount_Term"]=df["Loan_Amount_Term"].fillna(360)

converting the dependents columns to integer

In [15]:
mode=df["Dependents"].mode()
df["Dependents"]=df["Dependents"].replace({"3+":3,np.nan:0}).copy()
df["Dependents"]=pd.to_numeric(df["Dependents"]).copy()
# df["Dependents"].unique()


In [16]:
# checking for outliers
cols_outliers =["ApplicantIncome","CoapplicantIncome", "LoanAmount","Loan_Amount_Term"]
        

In [17]:
# def outlier_detect(df,column):
#     outlier_column=[]
#     for x in  df[column]:
#         z=np.divide(x-np.mean(df[column]),np.std(df[column]))
#         if z > 3:
#             outlier_column.append(x)
#     print(outlier_column) 


In [18]:
# for column in cols_outliers:
#     outlier_detect(df,column)

In [19]:
# dropping Loan_ID since it no useful info
df=df.drop("Loan_ID", axis=1).copy()

# converting all the values in credit history to object
df["Credit_History"]=df["Credit_History"].replace({0:"No",1:"yes"}).copy()
df["Loan_Status"]=df["Loan_Status"].replace({"Y":1,"N":0}).copy()


  df["Loan_Status"]=df["Loan_Status"].replace({"Y":1,"N":0}).copy()


In [20]:
# independent variable
X=df.iloc[:,:-1]
# dependent variable
y=df.iloc[:,-1:]

In [21]:
scaler=MinMaxScaler()
X1=pd.get_dummies(X[["Gender","Married","Self_Employed","Education","Credit_History","Property_Area"]], dummy_na=True)
X1=X1.drop(["Education_nan","Married_nan","Property_Area_nan"],axis=1).copy()
X2=X[["Dependents","ApplicantIncome","CoapplicantIncome","LoanAmount","Loan_Amount_Term"]]
X=pd.concat([X2,X1],axis=1)
X=pd.DataFrame(scaler.fit_transform(X),columns=X.columns)
y=y.values.flatten()

In [22]:
counter=Counter(y)
counter

X

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,...,Self_Employed_Yes,Self_Employed_nan,Education_Graduate,Education_Not Graduate,Credit_History_No,Credit_History_yes,Credit_History_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0.000000,0.070489,0.000000,0.198798,0.743590,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.333333,0.054830,0.036192,0.172214,0.743590,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.000000,0.035250,0.000000,0.082489,0.743590,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.000000,0.030093,0.056592,0.160637,0.743590,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.000000,0.072356,0.000000,0.191027,0.743590,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.000000,0.034014,0.000000,0.089725,0.743590,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
607,1.000000,0.048930,0.000000,0.044863,0.358974,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
608,0.333333,0.097984,0.005760,0.353111,0.743590,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
609,0.666667,0.091936,0.000000,0.257598,0.743590,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [23]:
oversample= SMOTE(random_state=42)
X,y=oversample.fit_resample(X,y)
counter=Counter(y)
counter

Counter({1: 419, 0: 419})

In [24]:
# estimator=RandomForestClassifier(n_jobs=-1,)
# param_grid={"max_depth":[3,4,5],"min_samples_split":[2,3,4], "min_samples_leaf":[1,2,3]
#             ,"max_samples":[0.7]}

# grid_search=GridSearchCV(estimator=estimator,param_grid=param_grid, cv=5, scoring="f1")
# grid_search.fit(X,y)
# best_result=grid_search.cv_results_


In [25]:
# result=pd.DataFrame(best_result)
# result.iloc[25,:]

In [26]:
# X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.3, random_state=42)
# model=RandomForestClassifier(n_estimators=150,max_depth=7,min_samples_split=2,min_samples_leaf=2,max_samples=0.7,)
# # model=LogisticRegression()
# # model= SVC(kernel="poly")
# model.fit(X_train,y_train)
# y_pred=model.predict(X_test)

# reports=classification_report(y_test,y_pred, target_names=["minoriy class", "majoriy class"])

# reports


X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.3, random_state=42)

      pre      precision    recall  f1-score   support\n\nminority_class       0.92      0.41      0.56        54\nmajority_class       0.80      0.98      0.88       130\n\n      accuracy                           0.82       184\n     macro avg       0.86      0.70      0.72       184\n  weighted avg       0.83      0.82      0.79       184\n'

MODELING
cision    recall  f1-score   support\n\nminority_class       0.92      0.41      0.56        54\nmajority_class       0.80      0.98      0.88       130\n\n      accuracy                           0.82       184\n     macro avg       0.86      0.70      0.72       184\n  weighted avg       0.83      0.82      0.79       184\n'

MODELING


MODELING

In [30]:
# RandomForest=RandomForestClassifier(n_estimators=150, max_depth=7,min_samples_split=2, min_samples_leaf=2, max_samples=0.7,random_state=2)
# LogisticRegres=LogisticRegression()
# Svm=SVC(kernel='poly')
# models=[RandomForest,LogisticRegres,Svm]

# for model in models:
#     score=cross_val_score(model,X,y, scoring="f1",cv=5,n_jobs=-1)
    # print(np.mean(score))



In [31]:
test_data=pd.read_csv("loan-test.csv")
test_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113.0,360.0,1.0,Urban
363,LP002975,Male,Yes,0,Graduate,No,4158,709,115.0,360.0,1.0,Urban
364,LP002980,Male,No,0,Graduate,No,3250,1993,126.0,360.0,,Semiurban
365,LP002986,Male,Yes,0,Graduate,No,5000,2393,158.0,360.0,1.0,Rural


Cleaning and preparing the test data similar to the train data

In [33]:
test_df=test_data.iloc[:,1:].copy()
Loan_ID=test_data.iloc[:,0:1]

# filling the na values with the mean of the column
mean=np.mean(df["LoanAmount"])
test_df["LoanAmount"]=test_df["LoanAmount"].fillna(mean).copy()

# filling na values with mode value in LoanAmount column
mode=test_df["Loan_Amount_Term"].mode()
test_df["Loan_Amount_Term"]=test_df["Loan_Amount_Term"].fillna(360)

test_df["Dependents"]=test_df["Dependents"].replace({"3+":3,np.nan:0}).copy()
test_df["Dependents"]=pd.to_numeric(test_df["Dependents"]).copy()

test_df["Credit_History"]=test_df["Credit_History"].replace({0:"No",1:"yes"}).copy()

test_df2=test_df[["Dependents","ApplicantIncome","CoapplicantIncome","LoanAmount","Loan_Amount_Term"]]
test_df1=pd.get_dummies(test_df[["Gender","Married","Self_Employed","Education","Credit_History","Property_Area"]], dummy_na=True)
test_df1.drop(["Education_nan","Married_nan","Property_Area_nan",], axis=1, inplace=True)

test_df=pd.concat([test_df2,test_df1],axis=1)
test_df=pd.DataFrame(scaler.fit_transform(test_df),columns=test_df.columns)
test_df.head()


Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,...,Self_Employed_Yes,Self_Employed_nan,Education_Graduate,Education_Not Graduate,Credit_History_No,Credit_History_yes,Credit_History_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0.0,0.078865,0.0,0.157088,0.746835,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.333333,0.042411,0.0625,0.187739,0.746835,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.666667,0.068938,0.075,0.344828,0.746835,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.666667,0.032263,0.106083,0.137931,0.746835,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.045168,0.0,0.095785,0.746835,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


Predicting the test data

In [35]:
Model=RandomForestClassifier(n_estimators=200,max_depth=7,min_samples_split=2, 
                             min_samples_leaf=2,max_samples=0.7, random_state=2,n_jobs=-1)
Model.fit(X,y)
prediction=Model.predict(test_df)
result=pd.DataFrame({"Loan_ID":Loan_ID.values.flatten(),"prediction":prediction,})

result=result.replace({1:"Y", 0:"N"})
result

Unnamed: 0,Loan_ID,prediction
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,N
...,...,...
362,LP002971,Y
363,LP002975,Y
364,LP002980,Y
365,LP002986,Y


In [36]:
Counter(prediction)

Counter({1: 289, 0: 78})

In [37]:
pred=Model.predict(test_df.loc[[2]])
len(pred)
test_df.loc[[2]]
# if pred.item()==1:
#     print("yes")
# else:
#     print("no")
    

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,...,Self_Employed_Yes,Self_Employed_nan,Education_Graduate,Education_Not Graduate,Credit_History_No,Credit_History_yes,Credit_History_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
2,0.666667,0.068938,0.075,0.344828,0.746835,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


saving the model using pickle

In [39]:
# pickle.dump(Model,open("Model.pkl","wb"))
joblib.dump(Model,"Model.joblib")

['Model.joblib']

In [40]:
# Model=SVC(kernel='linear')
# Model.fit(X,y)

# def data_preps(df):
#     df1=df[["Dependents","ApplicantIncome","CoapplicantIncome","LoanAmount","Loan_Amount_Term",]]
#     df2=pd.get_dummies(test_df[["Gender","Married","Self_Employed","Education","Credit_History","Property_Area"]], dummy_na=True)
#     df2=df2.drop(["Education_nan","Property_nan"],axis=1)
#     df3=pd.concat([df1,df2],axis=1)
#     df3=pd.DataFrame(scaler.fit_transform(df3),columns=test_df3.columns)
    
#     Model.fit(X,y)
#     prediction=Model.predict(df)
#     return prediction
    

In [41]:
data=({
    "a":1, "b":2, "c":3
})
df=pd.DataFrame(data, index=(0,))

df                
                

Unnamed: 0,a,b,c
0,1,2,3


In [42]:
np.arange(0)

array([], dtype=int64)