In [13]:
# importing necessary libraries
import pandas as pd 
import numpy as np

from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline

from sklearn.metrics import r2_score

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings


# ===== for feature engineering ===== #
# For imputation
from feature_engine.imputation import (
    CategoricalImputer,
    AddMissingIndicator,
    MeanMedianImputer
)

# For data encoding
from feature_engine.encoding import (
    OneHotEncoder
)

# For Data transformation
from feature_engine.transformation import LogCpTransformer
# Install a global 'ignore' filter: no category, message or module is given,
# so every warning raised after this point is suppressed (including
# deprecation warnings from the libraries imported above).
warnings.filterwarnings('ignore')

In [2]:
# Read the raw training file (path is relative to the working directory)
# and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='loan-amount-train.csv')
data.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# Normalise column names to lower case so later cells can refer to them consistently
data.columns = [column_name.lower() for column_name in data.columns]

In [21]:
# Show the (lower-cased) column names as a plain Python list
data.columns.tolist()

['loan_id',
 'gender',
 'married',
 'dependents',
 'education',
 'self_employed',
 'applicantincome',
 'coapplicantincome',
 'loanamount',
 'loan_amount_term',
 'credit_history',
 'property_area',
 'loan_status']

In [8]:
# The identifier, gender and loan-status columns are treated as irrelevant
# to the model, so they are removed; `data` itself is left untouched.
df = data.drop(columns=['loan_id', 'gender', 'loan_status'])
print(f'original data shape: {data.shape} , new data shape:{df.shape} ')

original data shape: (614, 13) , new data shape:(614, 10) 


In [9]:
# Keep only the records that have a credit history value (df is modified in place)
df.dropna(axis=0, subset=['credit_history'], inplace=True)
# Cast the credit history flag to text so it is handled as a categorical variable
df['credit_history'] = df['credit_history'].astype(str)

# Configuration

In [15]:
# Continuous numerical predictors (log-transformed in the pipeline)
NUM_CONT_VARS = ['applicantincome', 'coapplicantincome']

# Numerical predictors that are imputed in the pipeline
NUM_VAR_NA = ['loan_amount_term']

# All numerical predictors: the continuous ones followed by the imputed one
NUM_VARS = [*NUM_CONT_VARS, *NUM_VAR_NA]

# Categorical predictors (one-hot encoded in the pipeline)
CAT_VARS = [
    'married',
    'dependents',
    'education',
    'self_employed',
    'credit_history',
    'property_area',
]

# Categorical predictors that are imputed in the pipeline
CAT_VARS_NA = ['married', 'dependents', 'self_employed']

# Split Data into Train and Validation Sets

In [11]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
# Features are every column except the target `loanamount`.
x_train, x_val, y_train, y_val = train_test_split(
    df.drop(columns=['loanamount']),
    df['loanamount'],
    test_size=.2,
    random_state=1,
)
# Display the shapes of the four resulting pieces
tuple(part.shape for part in (x_train, x_val, y_train, y_val))

((451, 9), (113, 9), (451,), (113,))

# Target

In [12]:
# Most frequent loan amount in the training split, used to fill missing targets.
# Series.mode() always returns a Series (with several entries when values tie),
# so pick its first entry explicitly; calling int() on the Series itself fails
# as soon as there is more than one mode.
mode = int(y_train.mode().iloc[0])

# Missing values in loanamount (our target variable) are replaced with the
# training-set mode in both splits, so nothing is estimated from validation data.
y_train = y_train.fillna(mode)
y_val = y_val.fillna(mode)

# Model the target on the log(1 + x) scale.
# NOTE: y_train / y_val are reassigned here, so running this cell a second
# time without re-running the split applies the transform twice.
y_train = np.log1p(y_train)
y_val = np.log1p(y_val)

# Pipeline

In [18]:
# End-to-end model: imputation -> encoding -> log transform -> scaling -> SGD regressor.
# The variable groups come from the configuration cell.
loan_amount_pipeline = Pipeline(steps=[
    # ----- imputation -----
    # flag missing values of the numerical variables with an indicator column
    ('missing_indicator', AddMissingIndicator(variables=NUM_VAR_NA)),
    # fill numerical gaps with the median
    ('median_imputation',
     MeanMedianImputer(variables=NUM_VAR_NA, imputation_method='median')),
    # fill categorical gaps using the 'missing' imputation method
    ('categorical_imputation',
     CategoricalImputer(variables=CAT_VARS_NA, imputation_method='missing')),

    # ----- categorical encoding -----
    # one-hot encode, dropping the last category of each variable (k-1 columns)
    ('categorical_encoder',
     OneHotEncoder(variables=CAT_VARS, drop_last=True)),

    # ----- variable transformation -----
    # log transform with constant C=1 on the continuous numerical variables
    ('log', LogCpTransformer(variables=NUM_CONT_VARS, C=1)),

    # ----- scaling -----
    ('scaler', StandardScaler()),

    # ----- final estimator -----
    ('SGD', SGDRegressor(alpha=0.0001, max_iter=1000, random_state=1)),
])

In [19]:
# Fit every pipeline step, ending with the regressor, on the training split
loan_amount_pipeline.fit(X=x_train, y=y_train)

In [20]:
# Evaluate the fitted pipeline with the coefficient of determination (R^2).
# r2_score is a regression metric, not a classification accuracy, so the
# printed labels say R^2. The scores are computed against y_train / y_val as
# they stand after the target cell, i.e. on the log1p scale.
# `class_` holds the regression predictions; the name is kept unchanged in
# case later cells read it.

# predictions and R^2 for the train set
class_ = loan_amount_pipeline.predict(x_train)

print('train R^2: {}'.format(r2_score(y_train, class_)))
print()

# predictions and R^2 for the validation set
class_ = loan_amount_pipeline.predict(x_val)

print('val R^2: {}'.format(r2_score(y_val, class_)))
print()

train accuracy: 0.3975448099119875

val accuracy: 0.38518664421723614

