In [1]:
import pandas as pd 
import numpy as np  
from sklearn.compose import ColumnTransformer 
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, FunctionTransformer, OneHotEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import metrics

In [2]:
df=pd.read_csv('data.csv')
y = df['Loan_Status'].apply(lambda x:1 if x=='Y' else 0)
X=df.drop(columns=['Loan_Status'])

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2)

In [3]:
## Transform DataFrame to create a new column ('Total Income')
class DataframeFunctionTransformer():
    def __init__(self, func):
        self.func = func
    def transform(self, input_df, **transform_params):
        return self.func(input_df)
    def fit(self, X, y=None, **fit_params):
       # print(X) # used this for testing
        return self
# this function takes a dataframe as input and
# returns a modified version thereof
def process_dataframe(input_df):
    input_df["Total_Income"] = input_df["ApplicantIncome"]+input_df['CoapplicantIncome']
    
    return input_df


In [4]:
class FeatureSelector:
    def __init__(self, feats):
        self.feats = feats

    def fit(self, X, y=None):
        pass


    def transform(self, X, y=None):
        return X[self.feats]

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

# features used for modeling, drop the Gender, Married, and add a new column 'total Income'
feats=['Dependents',
 'Education',
 'Self_Employed',
  
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area', 'Total_Income']

In [5]:
num_transformer=Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())])
cat_transformer=Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder())])

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, ['LoanAmount', 'Loan_Amount_Term', 'Total_Income']),
        ('cat', cat_transformer, ['Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area'])])

In [9]:
pipe2=Pipeline(steps=[('Dataframe', DataframeFunctionTransformer(process_dataframe)), ('features',FeatureSelector(feats)), ('preprocessing', preprocessor), ('feature_selection', SelectKBest(k=5)),('model', RandomForestClassifier())])
pipe2.fit(X_train, y_train)
y_cfc=pipe2.predict(X_test)
print(metrics.f1_score(y_test, y_cfc))

0.8208092485549133


In [11]:
y_cfc=pipe2.predict(X_test)
print(f'The RandomForest has the precision score and recall score of {metrics.precision_score(y_test, y_cfc)}, {metrics.recall_score(y_test, y_cfc)}')


The RandomForest has the precision score and recall score of 0.7717391304347826, 0.8765432098765432


In [13]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


classifiers = [KNeighborsClassifier(3), SVC(kernel="rbf", C=0.025, probability=True),  DecisionTreeClassifier(), RandomForestClassifier(),GradientBoostingClassifier()]

for classifier in classifiers:
    pipe = Pipeline(steps=[('Dataframe', DataframeFunctionTransformer(process_dataframe)), ('features',FeatureSelector(feats)), ('preprocessing',               preprocessor), ('feature_selection', SelectKBest(k=5)), ('classifier', classifier)])
    pipe.fit(X_train, y_train)   
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))
    

KNeighborsClassifier(n_neighbors=3)
model score: 0.699
SVC(C=0.025, probability=True)
model score: 0.659
DecisionTreeClassifier()
model score: 0.675
RandomForestClassifier()
model score: 0.748
GradientBoostingClassifier()
model score: 0.724


In [16]:
rfc_pipe= Pipeline(steps=[('Dataframe', DataframeFunctionTransformer(process_dataframe)),
 ('features',FeatureSelector(feats)), ('preprocessing', preprocessor), ('feature_selection', SelectKBest()),('model', RandomForestClassifier())])
params={'feature_selection__k':[3,4,5,6], 'model__max_depth':[3,4,5,6]}
rfc_GS=GridSearchCV(rfc_pipe, param_grid=params)
rfc_GS.fit(X_train, y_train)
y_gs=rfc_GS.predict(X_test)
print(rfc_GS.score(X_test, y_test))
print(rfc_GS.best_params_)
print(f' The GridSearch for RandomForest has the precision score and recall score of {metrics.precision_score(y_test, y_gs)}, {metrics.recall_score(y_test, y_gs)}')


0.7723577235772358
{'feature_selection__k': 3, 'model__max_depth': 3}
 The GridSearch for RandomForest has the precision score and recall score of 0.7523809523809524, 0.9753086419753086


In [17]:
import pickle
pickle.dump(pipe2,open('pipe2.pkl','wb'))