In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
import pickle



In [None]:
# Load dataset
# Read the training CSV (path is relative to the notebook's working directory).
train_csv_path = 'train_ctrUa4K.csv'
train = pd.read_csv(train_csv_path)



In [None]:
# Handle missing values (imputation instead of dropping)
# Step 1: numeric columns take their column median.
numeric_medians = train.median(numeric_only=True)
train = train.fillna(numeric_medians)

# Step 2: any gap still left is filled with that column's most frequent value
# (first row of DataFrame.mode()).
column_modes = train.mode().iloc[0]
train = train.fillna(column_modes)




In [None]:
# Encode categorical variables
# Series.map turns every value that is absent from the mapping into NaN, so
# running this cell a second time (when the columns already hold 0/1) would
# wipe them out. Each column is therefore only mapped while it still contains
# raw labels, which makes the cell safe to re-run.
BINARY_ENCODINGS = {
    'Gender': {'Male': 0, 'Female': 1},
    'Married': {'No': 0, 'Yes': 1},
    'Loan_Status': {'N': 0, 'Y': 1},
}

for column, mapping in BINARY_ENCODINGS.items():
    values = train[column]
    if not values.isin(list(mapping)).any():
        # No raw labels present: the column was already encoded by an earlier run.
        continue

    encoded = values.map(mapping)

    # A non-null input that became NaN is a label the mapping does not know;
    # fail loudly instead of passing hidden NaNs on to the model.
    unmapped = values[encoded.isna() & values.notna()]
    if not unmapped.empty:
        raise ValueError(
            f"Unexpected values in column {column!r}: {sorted(map(str, unmapped.unique()))}"
        )

    train[column] = encoded



In [None]:
# Feature selection
# Predictors fed to the model and the label it learns to predict.
feature_columns = ['Gender', 'Married', 'ApplicantIncome', 'LoanAmount', 'Credit_History']
target_column = 'Loan_Status'
X = train[feature_columns]
y = train[target_column]

# Train-test split
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
x_train, x_cv, y_train, y_cv = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)



In [None]:
# Hyperparameter tuning using RandomizedSearchCV
# Candidate values the randomized search samples from for each forest setting.
param_grid = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Base estimator with a fixed seed.
rf = RandomForestClassifier(random_state=10)

# Try 20 sampled combinations, scoring each by accuracy with cv=5 on the
# training split; n_jobs=-1 runs the search in parallel.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
random_search.fit(x_train, y_train)



In [None]:
# Best parameters
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Model with best parameters
# RandomizedSearchCV refits by default (refit=True): after the search it trains a
# clone of the base estimator (same random_state=10) with best_params on the
# whole of x_train / y_train. Reuse that fitted model instead of training an
# identical forest a second time.
best_rf = random_search.best_estimator_
best_rf



In [None]:
# Predictions
# Predict labels for the held-out validation features.
y_pred = best_rf.predict(X=x_cv)



In [None]:
# Model evaluation
# Compare validation predictions with the true labels: a per-class
# precision/recall/F1 report plus the overall accuracy.
report = classification_report(y_true=y_cv, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_cv, y_pred=y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)



In [None]:
# Save the best model
# Serialise the fitted classifier to disk with pickle.
model_path = 'classifier.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(best_rf, model_file)