# **Import necessary libraries**

In [11]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# **Load the dataset**

In [12]:
# Read the loan-approval records from the CSV file in the working directory.
csv_path = 'loan_approval_dataset.csv'
df = pd.read_csv(csv_path)

# **Check the column names**

In [13]:
# Show the column labels exactly as loaded. Every label except 'loan_id' starts
# with a space (e.g. ' loan_status'), so later lookups must include that space.
print(df.columns)

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')


# **Encode categorical variables**

In [14]:
# Remove the 'loan_id' column so it is not carried along as a feature.
df = df.drop(columns='loan_id')

# Replace every text (object-dtype) column with integer codes.
# Each column gets its OWN fitted encoder, stored in `label_encoders` under the
# column name. Refitting one shared encoder would overwrite the mapping of every
# column but the last, making it impossible to decode predictions or to encode
# new records consistently, e.g.:
#     label_encoders[' loan_status'].inverse_transform(y_pred)
label_encoders = {}
le = LabelEncoder()  # stays bound (unfitted) even if no object column exists
for col in df.columns:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
# After the loop `le` is the encoder fitted on the last object column, matching
# what the name referred to before this change.

In [15]:
# Discard the dependents count (the label includes its leading space) so it is
# excluded from the feature set.
df.drop(columns=[' no_of_dependents'], inplace=True)

In [16]:
# Preview the first rows to check the frame after the encoding and column drops.
df.head()

Unnamed: 0,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,0,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,0
1,1,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,1
2,0,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,1
3,0,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,1
4,1,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,1


# **Split the data into training and testing sets**

In [17]:
# Separate the target from the features, then hold out 20% of the rows as a
# test set. The fixed random_state makes the split reproducible.
target_col = ' loan_status'
y = df[target_col]
X = df.drop(columns=[target_col])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# **Train a model**

In [18]:
# Baseline model: a 1000-tree random forest with a fixed seed, fitted on the
# training split only.
forest_params = {'n_estimators': 1000, 'random_state': 42}
rf_classifier = RandomForestClassifier(**forest_params)
rf_classifier.fit(X_train, y_train)

# **Evaluate the model**

In [19]:
# Score the baseline forest on the held-out test split.
y_pred = rf_classifier.predict(X_test)

# Compute all three summaries first, then print them in the same order.
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print('Accuracy:', test_accuracy)
print('Classification Report:', report_text, sep='\n')
print('Confusion Matrix:', conf_matrix, sep='\n')

Accuracy: 0.9754098360655737
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       536
           1       0.98      0.96      0.97       318

    accuracy                           0.98       854
   macro avg       0.98      0.97      0.97       854
weighted avg       0.98      0.98      0.98       854

Confusion Matrix:
[[529   7]
 [ 14 304]]


# **Perform hyperparameter tuning**

In [20]:
# Exhaustive search over forest size and maximum tree depth: 5 x 5 = 25
# candidate settings, each scored with 5-fold cross-validation on the training
# split only (the test split is not touched here).
param_grid = {'n_estimators': [100, 200, 300, 400, 500], 'max_depth': [5, 10, 15, 20, 25]}
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    # The 125 candidate fits are independent, so run them in parallel on all
    # available cores. Seeds and folds are fixed, so the selected parameters and
    # scores are the same as with a serial run.
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best candidate; it is
# multiplied by 100 to report it as a percentage.
print('Best Parameters:', grid_search.best_params_)
print('Best Score:', (grid_search.best_score_)*100)

Best Parameters: {'max_depth': 15, 'n_estimators': 400}
Best Score: 97.97950219619327


# **Exporting the trained model**

In [21]:
# Serialize the fitted GridSearchCV object (the whole search, not only its best
# estimator) to 'model.pkl' in the working directory.
import pickle

with open('model.pkl', 'wb') as model_file:
    pickle.dump(grid_search, model_file)