In [65]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# imports for custom functions
from sklearn.preprocessing import FunctionTransformer  # wrapper to fit custom functions into a pipeline
    # Note: fully custom scikit-learn transformers are usually written as classes that inherit from
    # BaseEstimator and TransformerMixin; only FunctionTransformer is used here, as those classes don't seem to be needed

# import all the modeling stuff
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# import functions
import cleaning_functions as cf

# load the loan dataset that the preprocessing pipeline will clean
data_path = "../data/data.csv"
df = pd.read_csv(data_path)

In [66]:
# quick look at the raw frame: dimensions and column dtypes, then the first rows
print(df.shape, df.dtypes, sep="\n")
df.head()

(614, 13)
Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [67]:
# Re-encode the Education column with the helper from cleaning_functions.
# Kept outside the pipeline because it only applies to this one column.
education_encoded = cf.reverse_label_encoding(df["Education"])
df["Education"] = education_encoded

In [68]:
# define preprocessing pipeline
# Each step is a helper from cleaning_functions wrapped in a FunctionTransformer; steps run in the order listed.
# Label encoding is intentionally not a step: it did not work inside the pipeline and is applied afterwards.
_step_functions = [
    ("drop_loanamt_if_Nan", cf.drop_if_null),
    ("impute_values", cf.impute_values),
    ("hot_encode", cf.one_hot_encoding),
    ("transform_income", cf.total_logged_income),
]

preprocessing_pipeline = Pipeline(
    steps=[(step_name, FunctionTransformer(step_func)) for step_name, step_func in _step_functions]
)

In [69]:
# apply preprocessing_pipeline to df

# The pipeline is run on the entire dataset here, i.e. before the train-test split done in a later cell.
# Reasons for this ordering:
#   - the target column (Loan_Status) also needs preprocessing
#   - the feature columns for X can only be selected after preprocessing has created them
# NOTE(review): the split should normally happen BEFORE preprocessing to avoid leakage. train_test_split
# accepts any number of arrays, so the raw df could be split on its own first and X / y separated afterwards.
# TODO confirm whether any cf step (e.g. impute_values) derives statistics from the data; if so, this
# ordering lets test rows influence the values seen in training.

X_processed = preprocessing_pipeline.fit_transform(df)
X_processed.columns

Index(['Loan_ID', 'Gender', 'Married', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Loan_Status',
       'Property_Area_Rural', 'Property_Area_Semiurban', 'Property_Area_Urban',
       'Dependents_0', 'Dependents_1', 'Dependents_2', 'Dependents_3+',
       'TotalIncome', 'TotalIncome_log'],
      dtype='object')

In [70]:
# LabelEncoder outside the pipeline - for some reason, it doesn't work within it
#
# A separate encoder is fitted for every column and stored in `label_encoders`, so the
# class -> integer mapping of each column stays available afterwards (e.g. via
# label_encoders["Loan_Status"].classes_ or .inverse_transform). Refitting one shared
# encoder would overwrite the mapping of every column except the last one.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target values (y);
# an OrdinalEncoder may be the better fit for the feature columns - confirm before switching.
cols_to_encode = ['Married', 'Gender','Self_Employed', 'Credit_History', 'Loan_Status']
label_encoders = {}
for col in cols_to_encode:
    # `le` is rebound on each iteration; after the loop it is the encoder fitted on the
    # last column (Loan_Status), which is the same final state a single shared encoder had.
    le = LabelEncoder()
    X_processed[col] = le.fit_transform(X_processed[col])
    label_encoders[col] = le

X_processed.columns

Index(['Loan_ID', 'Gender', 'Married', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Loan_Status',
       'Property_Area_Rural', 'Property_Area_Semiurban', 'Property_Area_Urban',
       'Dependents_0', 'Dependents_1', 'Dependents_2', 'Dependents_3+',
       'TotalIncome', 'TotalIncome_log'],
      dtype='object')

In [71]:
# separate the target from the model features

# target: the encoded loan status
y = X_processed["Loan_Status"]

# features: only the columns listed here are passed on to the models
feature_columns = [
    'Gender',
    'Married',
    'Dependents_0',
    'Dependents_1',
    'Dependents_2',
    'Dependents_3+',
    'Education',
    'Self_Employed',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Property_Area_Rural',
    'Property_Area_Semiurban',
    'Property_Area_Urban',
    'TotalIncome_log',
]
X = X_processed[feature_columns]


In [72]:
# train-test split: hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_options = {"test_size": 0.2, "random_state": 55}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
