In [8]:
import numpy as np
import pandas as pd
from sklearn import set_config
set_config(display="diagram")
from sklearn.pipeline import make_pipeline, make_union
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import StratifiedKFold

In [9]:
df = pd.read_csv('train_ctrUa4K.csv')
X = df.drop(['Loan_Status'], axis=1)
y = df['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

numeric_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

numeric_transformer = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore')
)
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_cols),
    (categorical_transformer, categorical_cols)
)
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])
pipe.fit(X_train, y_train);
y_pred = pipe.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))

Accuracy: 0.7967479674796748


In [10]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_num = 1
scores = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    scores.append(score)
    print(f"Accuracy for fold {fold_num}: ", score)
    fold_num += 1

Accuracy for fold 1:  0.7886178861788617
Accuracy for fold 2:  0.7967479674796748
Accuracy for fold 3:  0.7560975609756098
Accuracy for fold 4:  0.7235772357723578
Accuracy for fold 5:  0.7704918032786885


In [11]:
from sklearn.model_selection import GridSearchCV

params={
    'clf__n_estimators':[100, 200, 500],
    'clf__max_depth': [5, 6, 7, 8]
}
grid_pipe = GridSearchCV(pipe,
                         param_grid=params,
                         cv=5,
                         verbose=1)

grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_params_)
print(grid_pipe.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
{'clf__max_depth': 6, 'clf__n_estimators': 100}
0.8070294784580498
