In [1]:
# importing necessary libraries
import pandas as pd 
import numpy as np

from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import r2_score

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import warnings


# ===== for feature engineering ===== #
# For imputation
from feature_engine.imputation import (
    CategoricalImputer,
    AddMissingIndicator,
    MeanMedianImputer
)

# For data encoding
from feature_engine.encoding import (
    OrdinalEncoder
)

# For Data transformation
from feature_engine.transformation import LogCpTransformer
warnings.filterwarnings('ignore')

In [2]:
# Read the training file and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='loan-amount-train.csv')
data.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# Lower-case every column name so later cells can refer to columns consistently.
data.columns = [column_name.lower() for column_name in data.columns]

In [5]:
# Show the normalised column names as a plain Python list.
data.columns.tolist()

['loan_id',
 'gender',
 'married',
 'dependents',
 'education',
 'self_employed',
 'applicantincome',
 'coapplicantincome',
 'loanamount',
 'loan_amount_term',
 'credit_history',
 'property_area',
 'loan_status']

In [6]:
# Remove the columns that are not used as model inputs:
# the row identifier, gender and loan_status.
dropped_columns = ['loan_id', 'gender', 'loan_status']
df = data.drop(columns=dropped_columns)
print('original data shape: {} , new data shape:{} '.format(data.shape, df.shape))

original data shape: (614, 13) , new data shape:(614, 10) 


In [7]:
# Treat credit_history as a categorical variable by casting it to strings.
# Missing entries are expected to become the literal string 'nan', which is
# one of the keys of MAPPER - verify this on the pandas version in use.
df['credit_history'] = df['credit_history'].astype(str)

# Configuration

In [8]:
# Continuous numerical columns; the pipeline log-transforms these.
NUM_CONT_VARS = ['applicantincome', 'coapplicantincome']

# Numerical columns with missing values; the pipeline adds a missing
# indicator for them and imputes them with the median.
NUM_VAR_NA = ['loan_amount_term']

# All numerical feature columns (continuous ones first).
NUM_VARS = NUM_CONT_VARS + NUM_VAR_NA

# Columns handled as categorical and passed to the categorical encoder.
CAT_VARS = [
    'married',
    'dependents',
    'education',
    'self_employed',
    'credit_history',
    'property_area',
]

# Categorical columns with missing values; imputed by the pipeline.
CAT_VARS_NA = ['married', 'dependents', 'self_employed']

# Columns whose values are translated with MAPPER.
CREDIT_VAR_MAPPER = ['credit_history']

In [9]:
# Lookup from the stringified credit_history values to yes/no labels.
# The string 'nan' (a missing value after the string cast) is labelled 'no',
# the same as '0.0'.
MAPPER = dict([
    ('1.0', 'yes'),
    ('0.0', 'no'),
    ('nan', 'no'),
])

from sklearn.base import BaseEstimator, TransformerMixin

class Mapper(BaseEstimator, TransformerMixin):
    """Transformer that replaces the values of selected columns via a lookup dict.

    Parameters
    ----------
    variable : list of str
        Names of the columns whose values are remapped.
    mapping : dict
        Lookup handed to ``Series.map`` for each listed column.
    """

    def __init__(self, variable:list[str], mapping: dict):
        super().__init__()
        self.variable = variable
        self.mapping = mapping

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """Learn nothing from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every column in ``variable`` remapped.

        The input frame is left untouched. Values that are not keys of
        ``mapping`` come out of ``Series.map`` as NaN.
        """
        remapped = X.copy()
        for column in self.variable:
            remapped[column] = remapped[column].map(self.mapping)
        return remapped

# Split Data to Train and Val

In [10]:
# Separate the predictors from the target (loanamount), then hold out 20% of
# the rows for validation. random_state is fixed so the split is reproducible.
predictors = df.drop(columns=['loanamount'])
loan_amount = df['loanamount']
x_train, x_val, y_train, y_val = train_test_split(
    predictors,
    loan_amount,
    test_size=.2,
    random_state=1,
)
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((491, 9), (123, 9), (491,), (123,))

In [11]:
# Preview the first rows of the training features.
x_train.head()

Unnamed: 0,married,dependents,education,self_employed,applicantincome,coapplicantincome,loan_amount_term,credit_history,property_area
291,Yes,2,Graduate,No,4400,0.0,360.0,0.0,Semiurban
507,No,0,Graduate,No,3583,0.0,360.0,1.0,Urban
328,Yes,0,Graduate,No,4333,2451.0,360.0,1.0,Urban
609,No,0,Graduate,No,2900,0.0,360.0,1.0,Rural
69,No,0,Graduate,No,4300,0.0,360.0,0.0,Semiurban


# Target

In [12]:
# The target (loanamount) contains missing values. They are filled with the
# most frequent loan amount of the TRAINING split, so the validation split is
# imputed with a statistic learned from training data only.
# Series.mode() returns a Series (several entries when there is a tie), so the
# first entry is selected explicitly instead of calling int() on the Series.
mode = int(y_train.mode().iloc[0])

y_train = y_train.fillna(mode)
y_val = y_val.fillna(mode)

# The model is trained on log(1 + loanamount); predictions are converted back
# with np.expm1 in the prediction cell.
# NOTE: this cell reassigns y_train / y_val, so running it twice applies the
# log twice - re-run the split cell before re-running this one.
y_train = np.log1p(y_train)
y_val = np.log1p(y_val)

# Pipeline

In [13]:
# Pipeline, the feature_engine transformers, MinMaxScaler and LassoCV are all
# imported in the first cell; Mapper is defined in the configuration section.
loan_amount_pipeline = Pipeline(
    [
        # ===== IMPUTATION =====
        # add a missing-value indicator for the numerical variables with NA
        ('missing_indicator', AddMissingIndicator(variables=NUM_VAR_NA)),

        # impute those numerical variables with the median
        ('median_imputation', MeanMedianImputer(
            imputation_method='median', variables=NUM_VAR_NA)),

        # impute categorical variables with a dedicated "missing" category
        ('categorical_imputation', CategoricalImputer(
            imputation_method='missing', variables=CAT_VARS_NA)),

        # translate the stringified credit history values via MAPPER
        ('credit_history_mapping', Mapper(
            variable=CREDIT_VAR_MAPPER, mapping=MAPPER)),

        # ===== CATEGORICAL ENCODING =====
        # ordinal (integer) encoding with arbitrarily assigned codes;
        # this is NOT one-hot encoding
        ('categorical_encoder', OrdinalEncoder(
            encoding_method='arbitrary',
            variables=CAT_VARS)),

        # ===== VARIABLE TRANSFORMATION =====
        # log(x + C) with C=1 on the continuous income variables
        ('log', LogCpTransformer(variables=NUM_CONT_VARS, C=1)),

        # scale every feature with MinMaxScaler
        ('scaler', MinMaxScaler()),

        # ===== FINAL ESTIMATOR =====
        # The step is named 'SGD' but the estimator is a LassoCV with 5-fold
        # cross-validation; the name is kept so existing lookups by step name
        # keep working.
        ('SGD', LassoCV(
            cv=5,
            eps=0.0075,
            tol=0.0074,
            max_iter=100,
            random_state=1)),
    ]
)

In [14]:
# Fit every preprocessing step and the final estimator on the training split.
# y_train is the log1p-transformed loan amount prepared in the Target section.
loan_amount_pipeline.fit(x_train, y_train)

In [36]:
# Score the fitted pipeline on the training and validation splits.
# The number printed as "accuracy" is the R^2 returned by r2_score, computed
# on the log1p-transformed target (no MSE/RMSE is calculated here).
for report_line, split_features, split_target in (
    ('train accuracy: {}', x_train, y_train),
    ('val accuracy: {}', x_val, y_val),
):
    split_predictions = loan_amount_pipeline.predict(split_features)
    print(report_line.format(r2_score(split_target, split_predictions)))
    print()

train accuracy: 0.3890532461196545

val accuracy: 0.35281510610046607



In [47]:
# Most frequent value of the (log-transformed) training target.
# Series.mode takes a single `dropna` parameter, so the previous positional 0
# was read as dropna=False rather than as an axis. y_train has no missing
# values at this point, so the displayed result is unchanged.
y_train.mode()

0    4.70953
Name: loanamount, dtype: float64

# Predicting new data 

In [37]:
# Read the unseen test file and preview its first five rows.
test_data = pd.read_csv(filepath_or_buffer='loan-amount-test.csv')
test_data.head(n=5)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [38]:
# (rows, columns) of the raw test file.
test_data.shape

(367, 12)

In [39]:
# Lower-case the column names so they match the names used during training.
test_data.columns = test_data.columns.str.lower()

# Build the scoring frame in one chain:
#   1. drop the identifier, gender and loanamount columns,
#   2. drop records where credit history is not available,
#   3. cast credit_history to strings, as was done for the training data.
# NOTE(review): the training data keeps rows with a missing credit history
# (MAPPER labels the string 'nan' as 'no'), whereas they are removed here -
# confirm that this difference is intended.
test = (
    test_data
    .drop(columns=['loan_id', 'gender', 'loanamount'])
    .dropna(subset=['credit_history'])
    .assign(credit_history=lambda frame: frame['credit_history'].astype('str'))
)

In [40]:
# (rows, columns) left after dropping columns and rows without credit history.
test.shape

(338, 9)

In [41]:
# The pipeline predicts log1p(loanamount); np.expm1 converts the predictions
# back to loan amounts. Display the first one rounded to two decimals.
test_pred = loan_amount_pipeline.predict(test)
predicted_amounts = list(np.expm1(test_pred))
np.round(predicted_amounts[0], 2)

124.72

In [24]:
# a = data[1:2].to_dict(orient='records')
# a