In [11]:
import pandas as pd

# Import the dataset
dataset = pd.read_csv('Loan_Train.csv')

# Check if the dataset loaded properly
print(dataset.head())




    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

In [12]:
# Drop the column "Load_ID"
dataset.drop('Loan_ID', axis=1, inplace=True)

# Converting specific categorical values to numeric values- I decided to convert text values as well.
categorical_mapping = {
    'Gender': {'Male': 1, 'Female': 2},
    'Married': {'No': 0, 'Yes': 1},
    'Dependents': {'0': 0, '1': 1, '2': 2, '3+': 3},
    'Education': {'Not Graduate': 0, 'Graduate': 1},
    'Property_Area': {'Rural': 0, 'Urban': 1},
    'Self_Employed': {'No': 0, 'Yes': 1},
    'Loan_Status': {'N': 0, 'Y': 1}
}

for col, mapping in categorical_mapping.items():
    dataset[col] = dataset[col].map(mapping)

# Display the modified dataset
print(dataset.head())


   Gender  Married  Dependents  Education  Self_Employed  ApplicantIncome  \
0     1.0      0.0         0.0          1            0.0             5849   
1     1.0      1.0         1.0          1            0.0             4583   
2     1.0      1.0         0.0          1            1.0             3000   
3     1.0      1.0         0.0          0            0.0             2583   
4     1.0      0.0         0.0          1            0.0             6000   

   CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
0                0.0         NaN             360.0             1.0   
1             1508.0       128.0             360.0             1.0   
2                0.0        66.0             360.0             1.0   
3             2358.0       120.0             360.0             1.0   
4                0.0       141.0             360.0             1.0   

   Property_Area  Loan_Status  
0            1.0            1  
1            0.0            0  
2            1.0    

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Split the data into features (X) and target variable (y)
X = dataset.drop('Loan_Status', axis=1)
y = dataset['Loan_Status']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes
print("Training set - Features shape:", X_train.shape)
print("Training set - Target shape:", y_train.shape)
print("Test set - Features shape:", X_test.shape)
print("Test set - Target shape:", y_test.shape)


Training set - Features shape: (491, 11)
Training set - Target shape: (491,)
Test set - Features shape: (123, 11)
Test set - Target shape: (123,)


In [20]:
import warnings
warnings.filterwarnings("ignore")

In [21]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Separate features (X) and target variable (y)
X = dataset.drop('Loan_Status', axis=1)
y = dataset['Loan_Status']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with Min-Max scaler and KNN classifier
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', MinMaxScaler()),
    ('classifier', KNeighborsClassifier())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Evaluate the pipeline on the test data
accuracy = pipeline.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.7398373983739838


In [22]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer

# Create a pipeline with SimpleImputer, MinMaxScaler, and KNN classifier
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', MinMaxScaler()),
    ('classifier', KNeighborsClassifier())
])

# Define the search space - not sure if this method will work- may need to test in seperate code
search_space = [
    {'classifier': [KNeighborsClassifier()],
     'classifier__n_neighbors': range(1, 11)},
    {'classifier': [LogisticRegression()],
     'classifier__C': [0.1, 1.0, 10.0]},
    {'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': [100, 200, 300]}
]

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, search_space, cv=5)
grid_search.fit(X_train, y_train)

# Print the best model and hyperparameters
print("Best model:", grid_search.best_estimator_)
print("Best hyperparameters:", grid_search.best_params_)

# Evaluate the best model on the test data
accuracy = grid_search.score(X_test, y_test)
print("Accuracy:", accuracy)

Best model: Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('scaler', MinMaxScaler()),
                ('classifier', LogisticRegression())])
Best hyperparameters: {'classifier': LogisticRegression(), 'classifier__C': 1.0}
Accuracy: 0.7886178861788617


After analyzing the grid search results, it's apparent that the top-performing model is a pipeline. This successful pipeline comprises a SimpleImputer, a MinMaxScaler and a Logistic Regression classifier working collaboratively. Additionally, this effective model has specific, pre-established hyperparameters which were included in its construction. These hyperparameters are as follows: 'classifier': LogisticRegression(), 'classifier__C': 1.0.

Perhaps the most impressive aspect of this best-fit model is its accuracy while being tested|

We tackled the task of building a classification model for our dataset by crafting an efficient pipeline. This naturally included useful steps to correct for scaling and to initialize a KNN classifier as default. Performing this operation culminated in success, and we applied evaluation metrics on the produced pipeline against our testing data resulting in a preliminary accuracy of 0.74.

Going further, we deemed it necessary to improve upon this preliminary score through exhaustive search algorithms. 