In [None]:

# 1. Setup Personal Details
# Placeholder author metadata; replace each value before submitting the notebook.
personal_details = dict(
    [
        ("Name", "Your Name"),
        ("ID", "Your ID"),
        ("Class/Batch", "Your Class/Batch"),
    ]
)
print(personal_details)


In [None]:

# 2. Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder


In [None]:

# 3. Data Loading
# Load the dataset (replace 'loan_prediction_dataset.csv' with the actual path of your dataset)
df = pd.read_csv('loan_prediction_dataset.csv')

# 4. Data Preprocessing
# Initial data exploration: dimensions, first rows, column dtypes / non-null
# counts, and summary statistics of the numeric columns.
print(df.shape)
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())

# Drop irrelevant columns (e.g., 'Loan_ID', assuming it's irrelevant)
df = df.drop(columns=['Loan_ID'])

# Handle missing values.
# Numeric columns are filled with their median. numeric_only=True is required
# because recent pandas versions raise a TypeError when median() is asked to
# aggregate text columns instead of silently skipping them.
df = df.fillna(df.median(numeric_only=True))

# Whatever is still missing after the median fill (the non-numeric columns)
# is filled with the column's most frequent value. Without this, NaN mixed
# with strings reaches LabelEncoder below, which rejects mixed-type input.
for column in df.columns[df.isna().any()]:
    most_frequent = df[column].mode(dropna=True)
    if not most_frequent.empty:  # an all-NaN column has no mode; leave it as is
        df[column] = df[column].fillna(most_frequent.iloc[0])

# Handle categorical data (using LabelEncoder for simplicity).
# The explicitly named columns are always encoded. Any other non-numeric column
# is encoded as well, because the estimators trained later need numeric input.
categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed',
                       'Property_Area', 'Loan_Status']
categorical_columns += [
    column for column in df.select_dtypes(exclude='number').columns
    if column not in categorical_columns
]

# A single encoder instance is re-fitted for every column, so after the loop it
# only remembers the classes of the last column it processed.
label_encoder = LabelEncoder()
for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column])


In [None]:

# 5. Feature Selection
# 'Loan_Status' is the target variable; every other remaining column is used
# as a feature. (The previous code referenced a leftover placeholder column,
# 'SF_Price', which is never created in this notebook.)
# NOTE(review): 'Loan_Status' is label-encoded into integer classes during
# preprocessing, while the models trained below are regressors - confirm that
# treating it as a regression target is intended.
target_column = 'Loan_Status'
if target_column not in df.columns:
    raise KeyError(
        f"Target column '{target_column}' not found; available columns: {list(df.columns)}"
    )

X = df.drop(columns=[target_column])
y = df[target_column]


In [None]:

# 6. Model Training
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train models (fit() returns the fitted estimator, so construction and
# training are chained into one expression per model).

# 6.1. Linear Regression
linear_reg_model = LinearRegression().fit(X_train, y_train)

# 6.2. Decision Tree Regressor
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# 6.3. Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# 6.4. KNN Regressor (for k = 3, 4, 5), fitted in increasing order of k
knn_model_3, knn_model_4, knn_model_5 = [
    KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    for k in (3, 4, 5)
]


In [None]:

# 7. Model Evaluation
# Predict on the held-out test data with every fitted model.
y_pred_lr = linear_reg_model.predict(X_test)
y_pred_dt = dt_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
y_pred_knn_3 = knn_model_3.predict(X_test)
y_pred_knn_4 = knn_model_4.predict(X_test)
y_pred_knn_5 = knn_model_5.predict(X_test)

# Display name -> predictions; insertion order fixes the row order of the table.
predictions_by_model = {
    'Linear Regression': y_pred_lr,
    'Decision Tree': y_pred_dt,
    'Random Forest': y_pred_rf,
    'KNN (k=3)': y_pred_knn_3,
    'KNN (k=4)': y_pred_knn_4,
    'KNN (k=5)': y_pred_knn_5,
}

# Calculate evaluation metrics, one list per metric, aligned with `models`.
models = list(predictions_by_model)
mae = [mean_absolute_error(y_test, predicted) for predicted in predictions_by_model.values()]
mse = [mean_squared_error(y_test, predicted) for predicted in predictions_by_model.values()]
rmse = [np.sqrt(squared_error) for squared_error in mse]  # RMSE is derived from MSE
r2 = [r2_score(y_test, predicted) for predicted in predictions_by_model.values()]

# Collect the metrics into one comparison table and display it.
metrics_df = pd.DataFrame(
    {'Model': models, 'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2}
)

print(metrics_df)


In [None]:

# 8. Visualization
# One horizontal bar chart per metric, side by side, comparing all models.
metric_panels = [
    ('MAE', 'Mean Absolute Error'),
    ('MSE', 'Mean Squared Error'),
    ('RMSE', 'Root Mean Squared Error'),
    ('R2', 'R-squared'),
]

fig, axes = plt.subplots(1, 4, figsize=(20, 5))

# Pair each subplot with the metric column it shows and the title it carries.
for panel_ax, (metric_column, panel_title) in zip(axes, metric_panels):
    sns.barplot(x=metric_column, y='Model', data=metrics_df, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()


In [None]:

# 9. Conclusion
# Report the model with the highest R-squared on the test set.
best_row_label = metrics_df['R2'].idxmax()  # index label of the top-scoring row
best_model = metrics_df.loc[best_row_label]
print(f"Best performing model: {best_model['Model']} with R²: {best_model['R2']}")
