### Loan Prediction Problem
##### Goutham 21BAI1007

In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import metrics
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

In [None]:
# Load the training and testing datasets from their CSV files
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_ctrUa4K.csv', 'test_lAUu6dG.csv')
)

# Display (rows, columns) of each frame
(train.shape, test.shape)

((614, 13), (367, 12))

In [None]:
# Start from the sample submission file and overwrite its Loan_ID column
# with the IDs found in the test set.
sample_submission = pd.read_csv('sample_submission_49d68Cx.csv')
submission = sample_submission.assign(Loan_ID=test['Loan_ID'])

### Data Preprocessing

In [None]:
# Discard the 'Loan_ID' column from both frames
train = train.drop(columns='Loan_ID')
test = test.drop(columns='Loan_ID')

# Preview the first rows of the training data
train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [None]:
# Count the missing values in every training column, largest count first
missing = train.isnull().sum().sort_values(ascending=False)
missing

Credit_History       50
Self_Employed        32
LoanAmount           22
Dependents           15
Loan_Amount_Term     14
Gender               13
Married               3
Education             0
ApplicantIncome       0
CoapplicantIncome     0
Property_Area         0
Loan_Status           0
dtype: int64

In [None]:
# Fill the missing values in the training data:
#   - fixed labels for Gender, Married and Self_Employed
#   - the column mean for LoanAmount
#   - the most frequent value (mode) for the remaining columns
train_fill_values = {
    'Gender': 'Male',
    'Married': 'Yes',
    'Dependents': train['Dependents'].mode()[0],
    'Self_Employed': 'No',
    'LoanAmount': train['LoanAmount'].mean(),
    'Loan_Amount_Term': train['Loan_Amount_Term'].mode()[0],
    'Credit_History': train['Credit_History'].mode()[0],
}
for column_name, fill_value in train_fill_values.items():
    train[column_name] = train[column_name].fillna(fill_value)

In [None]:
# Fill the missing values in the test data.
# The mean/mode used for imputation are taken from the TRAINING data so that
# train and test are transformed by the same rule; the test set's own
# statistics are not used to preprocess it. `train` has already been filled at
# this point, which leaves its column mean and mode unchanged.
test['Gender'] = test['Gender'].fillna('Male')
test['Married'] = test['Married'].fillna('Yes')
test['Dependents'] = test['Dependents'].fillna(train['Dependents'].mode()[0])
test['Self_Employed'] = test['Self_Employed'].fillna('No')
test['LoanAmount'] = test['LoanAmount'].fillna(train['LoanAmount'].mean())
test['Loan_Amount_Term'] = test['Loan_Amount_Term'].fillna(train['Loan_Amount_Term'].mode()[0])
test['Credit_History'] = test['Credit_History'].fillna(train['Credit_History'].mode()[0])

In [None]:
# Encoding categorical values (train and test together)
#
# Each column's encoder vocabulary is learned from the values of BOTH frames,
# so a given category always receives the same integer code in train and test.
# Fitting a fresh encoder on each frame separately gave Loan_Amount_Term
# 10 codes on train but 12 on test, i.e. the same loan term was encoded as
# different integers in the two frames.
label_encoder = preprocessing.LabelEncoder()

categorical_cols = [
    'Gender', 'Married', 'Dependents', 'Education',
    'Self_Employed', 'Property_Area', 'Loan_Amount_Term',
]
for col in categorical_cols:
    label_encoder.fit(pd.concat([train[col], test[col]]))
    train[col] = label_encoder.transform(train[col])
    test[col] = label_encoder.transform(test[col])

# The target column exists only in the training data.
# LabelEncoder assigns codes in sorted order of the labels.
train['Loan_Status'] = label_encoder.fit_transform(train['Loan_Status'])

# Show the codes present in each frame for the highest-cardinality column
train['Loan_Amount_Term'].unique(), test['Loan_Amount_Term'].unique()

array([10,  7,  6,  3, 11,  4,  1,  8,  9,  2,  5,  0])

In [None]:
# Check the column dtypes now that the categorical columns have been encoded
train.dtypes

Gender                 int64
Married                int64
Dependents             int64
Education              int64
Self_Employed          int64
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term       int64
Credit_History       float64
Property_Area          int64
Loan_Status            int64
dtype: object

In [None]:
# Masking outliers in the training data
#
# Values whose absolute z-score is 3 or more are replaced by NaN (rows are
# kept; the NaNs are filled in the next cell). Only the continuous columns are
# screened: z-scoring label-encoded categoricals or the Loan_Status target
# could blank out a rare but valid class.
continuous_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
threshold = 3

continuous = train[continuous_cols]
z_scores = (continuous - continuous.mean()) / continuous.std()

train = train.copy()
# Boolean-frame indexing keeps entries where the condition holds and sets the rest to NaN
train[continuous_cols] = continuous[z_scores.abs() < threshold]

# Number of values masked per column
train.isna().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       8
CoapplicantIncome     6
LoanAmount           15
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

In [None]:
# Replace the NaNs left by outlier masking with the mean of each column.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is chained assignment, which acts on a temporary object
# under pandas Copy-on-Write and would leave `train` unchanged.
refill_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
train[refill_cols] = train[refill_cols].fillna(train[refill_cols].mean())

In [None]:
# Masking outliers in the test data
#
# Values whose absolute z-score is 3 or more are replaced by NaN (rows are
# kept, since every test row needs a prediction; the NaNs are filled in the
# next cell). Only the continuous columns are screened, so label-encoded
# categoricals are never blanked out.
# NOTE(review): z-scores here are computed from the test set's own mean/std.
continuous_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
threshold = 3

continuous = test[continuous_cols]
z_scores = (continuous - continuous.mean()) / continuous.std()

test = test.copy()
# Boolean-frame indexing keeps entries where the condition holds and sets the rest to NaN
test[continuous_cols] = continuous[z_scores.abs() < threshold]

# Number of values masked per column
test.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      4
CoapplicantIncome    5
LoanAmount           6
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [None]:
# Replace the NaNs left by outlier masking with the mean of each column.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is chained assignment, which acts on a temporary object
# under pandas Copy-on-Write and would leave `test` unchanged.
refill_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
test[refill_cols] = test[refill_cols].fillna(test[refill_cols].mean())

### Creating the model

In [None]:
# Separate the target from the features; the test frame is used as-is
y = train['Loan_Status']
X = train.drop(columns=['Loan_Status'])
X_test = test

In [None]:
# Hold out 30% of the labelled training data for validation.
# The held-out labels are named y_val so they pair with X_val; X_test is a
# different frame (the test set loaded from test_lAUu6dG.csv) and has no
# labels of its own.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Backward-compatible alias: these validation labels were previously named y_test
y_test = y_val

In [None]:
# Fit a logistic-regression classifier on the training split.
# With the default iteration limit the solver stopped before converging on
# these unscaled features (ConvergenceWarning), so the limit is raised; the
# solver still stops as soon as it converges.
regressor = LogisticRegression(max_iter=5000)
regressor.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Inspect the fitted intercept and the coefficient of each feature column
(regressor.intercept_, regressor.coef_)

(array([-0.06977404]),
 array([[-2.99744003e-01,  4.08129086e-01,  1.76318176e-01,
         -4.03026112e-01, -5.73824900e-02,  3.52930062e-05,
          1.65891138e-04, -6.30644573e-03, -1.20386570e-01,
          2.98821228e+00, -1.50469039e-01]]))

In [None]:
# Calling the built in predict function
y_pred = regressor.predict(X_test)

In [None]:
# Validation accuracy (in percent) on the held-out split.
# y_test holds the labels that belong to X_val (both come from the
# train_test_split call), so the model is scored on X_val; y_pred is predicted
# from X_test, which has no labels and a different number of rows.
round(accuracy_score(y_test, regressor.predict(X_val)) * 100, 2)

In [None]:
# NOTE(review): left disabled. X_test is the unlabelled test set while y_test
# comes from the train/validation split, so they cannot be scored together
# (X_val is the matching feature frame). Also, for a classifier .score() is
# presumably mean accuracy rather than R^2 - confirm against the scikit-learn docs.
# print("R Square ", regressor.score(X_test, y_test))

In [None]:
# NOTE(review): left disabled. y_pred is predicted from X_test, whereas y_test
# holds the validation labels from the train/validation split, so the two do
# not describe the same rows. MAE is also a regression metric; the target here
# is a binary class label.
# print("MAE:", metrics.mean_absolute_error(y_pred, y_test))

In [None]:
# Translate the encoded predictions back to 'Y' / 'N' and write the submission file
status_labels = {1: 'Y', 0: 'N'}
predicted_status = pd.Series(y_pred, index=submission.index)
submission['Loan_Status'] = predicted_status.map(status_labels)
submission.to_csv('21BAI1007.csv', index=False)