In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Load the data
# Both CSV files are read first; the first rows of each frame are then printed
# under a heading line.
train_df = pd.read_csv('Training Dataset.csv')
test_df = pd.read_csv('Test Dataset.csv')
print("Training Dataset: ", train_df.head(), sep="\n")
print("Test Dataset: ", test_df.head(), sep="\n")

Training Dataset: 
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2

In [3]:
# Preprocessing
def preprocess_data(train_df, test_df):
    """Impute and encode the features of both frames and label-encode the target.

    Non-object columns are median-imputed; object-dtype columns are imputed with
    their most frequent value and then one-hot encoded. The preprocessor is fitted
    on the training frame only and then applied to the test frame.

    Only feature columns present in both frames are used. On a fresh run this is
    every feature column; it matters when ``train_df`` has gained extra columns
    (for example cluster labels assigned onto it elsewhere in the notebook) and
    this function is called again, which would otherwise make ``transform`` fail
    on the test frame because those columns are missing there.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data with 'Loan_ID', 'Loan_Status' and the feature columns.
    test_df : pandas.DataFrame
        Test data with 'Loan_ID' and the feature columns.

    Returns
    -------
    tuple
        ``(train_X_preprocessed, train_y, test_X_preprocessed, preprocessor, le)``:
        the transformed training features, the label-encoded target, the
        transformed test features, the fitted ``ColumnTransformer`` and the fitted
        ``LabelEncoder``.
    """
    train_y = train_df['Loan_Status']
    train_X = train_df.drop(['Loan_ID', 'Loan_Status'], axis=1)
    test_X = test_df.drop(['Loan_ID'], axis=1)

    # Restrict both frames to the shared features, keeping the training column order
    shared_cols = [col for col in train_X.columns if col in test_X.columns]
    train_X = train_X[shared_cols]
    test_X = test_X[shared_cols]

    # Categorical and numerical columns
    cat_cols = train_X.select_dtypes(include=['object']).columns
    num_cols = train_X.select_dtypes(exclude=['object']).columns

    # ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', SimpleImputer(strategy='median'), num_cols),
            ('cat', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                # Categories unseen during fit are encoded as all zeros instead of raising
                ('encoder', OneHotEncoder(handle_unknown='ignore'))
            ]), cat_cols)
        ]
    )

    # Fitting the preprocessor on the training data and transforming both train and test data
    train_X_preprocessed = preprocessor.fit_transform(train_X)
    test_X_preprocessed = preprocessor.transform(test_X)

    # Encoding the target variable
    le = LabelEncoder()
    train_y = le.fit_transform(train_y)

    return train_X_preprocessed, train_y, test_X_preprocessed, preprocessor, le

# Run the preprocessing; the fitted preprocessor and label encoder are returned
# alongside the transformed feature matrices and the encoded target.
(train_X, train_y, test_X,
 preprocessor, label_encoder) = preprocess_data(train_df=train_df, test_df=test_df)

In [4]:
# Standardize features
# The scaler is fitted on the training matrix only and then applied unchanged to
# both the training and the test matrix.
scaler = StandardScaler().fit(train_X)
train_X_scaled, test_X_scaled = (scaler.transform(split) for split in (train_X, test_X))

#### Clustering

In [6]:
# K-Means Clustering
# n_init is pinned explicitly: its default changed from 10 to 'auto' across
# scikit-learn releases, so relying on it makes the cluster labels depend on the
# installed version (and emits a FutureWarning on the releases in between).
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
train_df['KMeans_Cluster'] = kmeans.fit_predict(train_X_scaled)



In [7]:
# Hierarchical Clustering
# Agglomerative clustering into two groups; the labels found during fitting are
# stored as a new column on train_df.
hierarchical = AgglomerativeClustering(n_clusters=2)
hierarchical.fit(train_X_scaled)
train_df['Hierarchical_Cluster'] = hierarchical.labels_

In [8]:
# DBSCAN
# Density-based clustering on the standardized features; the per-row labels are
# stored as a new column on train_df.
# NOTE(review): DBSCAN labels noise points as -1; with eps=0.5 on this feature
# space many rows may end up as noise — check the label counts to confirm.
dbscan = DBSCAN(eps=0.5, min_samples=5).fit(train_X_scaled)
train_df['DBSCAN_Cluster'] = dbscan.labels_

In [9]:
# Gaussian Mixture Models
# Two-component mixture with a fixed seed; the predicted component index of each
# row is stored as a new column on train_df.
gmm = GaussianMixture(
    random_state=0,
    n_components=2,
)
train_df['GMM_Cluster'] = gmm.fit_predict(X=train_X_scaled)



#### Ensemble Methods

In [11]:
# Bagging
# Fifty bagged decision trees. The printed score is computed on the same rows the
# model was fitted on (train_X_scaled / train_y), so it is a training accuracy,
# not a held-out estimate.
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=50,
    random_state=0,
).fit(train_X_scaled, train_y)
y_pred_bagging = bagging.predict(test_X_scaled)
print("Bagging Accuracy:", accuracy_score(y_true=train_y, y_pred=bagging.predict(train_X_scaled)))

Bagging Accuracy: 0.998371335504886


In [12]:
# Boosting
# Gradient boosting with 50 stages. The printed score is measured on the training
# rows themselves, so it is an in-sample accuracy rather than a held-out one.
boosting = GradientBoostingClassifier(
    n_estimators=50,
    random_state=0,
).fit(train_X_scaled, train_y)
y_pred_boosting = boosting.predict(test_X_scaled)
print("Boosting Accuracy:", accuracy_score(y_true=train_y, y_pred=boosting.predict(train_X_scaled)))

Boosting Accuracy: 0.8387622149837134


In [13]:
# Stacking
# A random forest and a gradient-boosting model serve as base learners; a
# logistic regression combines their outputs. The printed score is measured on
# the training rows themselves (in-sample accuracy).
estimators = [
    ('rf', RandomForestClassifier(random_state=0, n_estimators=10)),
    ('gb', GradientBoostingClassifier(random_state=0, n_estimators=10)),
]
stacking = StackingClassifier(
    final_estimator=LogisticRegression(),
    estimators=estimators,
).fit(train_X_scaled, train_y)
y_pred_stacking = stacking.predict(test_X_scaled)
print("Stacking Accuracy:", accuracy_score(y_true=train_y, y_pred=stacking.predict(train_X_scaled)))

Stacking Accuracy: 0.9022801302931596


In [14]:
# Random Forest
# Forest of 100 trees. The printed score is computed on the rows the forest was
# fitted on, so it is a training accuracy and not a held-out estimate.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
).fit(train_X_scaled, train_y)
y_pred_rf = rf.predict(test_X_scaled)
print("Random Forest Accuracy:", accuracy_score(y_true=train_y, y_pred=rf.predict(train_X_scaled)))

Random Forest Accuracy: 1.0


In [15]:
# Gradient Boosting
# Gradient boosting with 100 stages. The printed score is measured on the
# training rows themselves (in-sample accuracy), not on held-out data.
gb = GradientBoostingClassifier(
    n_estimators=100,
    random_state=0,
).fit(train_X_scaled, train_y)
y_pred_gb = gb.predict(test_X_scaled)
print("Gradient Boosting Accuracy:", accuracy_score(y_true=train_y, y_pred=gb.predict(train_X_scaled)))

Gradient Boosting Accuracy: 0.8729641693811075


#### Regularization

In [17]:
# L1 Regularization (Lasso)
# The saga solver shuffles the data with a random number generator, so the seed
# is pinned (matching the random_state=0 used by the other seeded estimators in
# this notebook) to keep the fit reproducible between runs.
# The printed score is measured on the training rows themselves (in-sample).
lasso = LogisticRegression(penalty='l1', solver='saga', max_iter=5000, random_state=0)
lasso.fit(train_X_scaled, train_y)
y_pred_lasso = lasso.predict(test_X_scaled)
print("Lasso Accuracy:", accuracy_score(train_y, lasso.predict(train_X_scaled)))

Lasso Accuracy: 0.8127035830618893


In [18]:
# L2 Regularization (Ridge)
# The saga solver shuffles the data with a random number generator, so the seed
# is pinned (matching the random_state=0 used by the other seeded estimators in
# this notebook) to keep the fit reproducible between runs.
# The printed score is measured on the training rows themselves (in-sample).
ridge = LogisticRegression(penalty='l2', solver='saga', max_iter=5000, random_state=0)
ridge.fit(train_X_scaled, train_y)
y_pred_ridge = ridge.predict(test_X_scaled)
print("Ridge Accuracy:", accuracy_score(train_y, ridge.predict(train_X_scaled)))

Ridge Accuracy: 0.8127035830618893


In [19]:
# ElasticNet Regularization
# l1_ratio=0.5 weights the L1 and L2 penalties equally.
# The saga solver shuffles the data with a random number generator, so the seed
# is pinned (matching the random_state=0 used by the other seeded estimators in
# this notebook) to keep the fit reproducible between runs.
# The printed score is measured on the training rows themselves (in-sample).
elasticnet = LogisticRegression(penalty='elasticnet', l1_ratio=0.5, solver='saga', max_iter=5000, random_state=0)
elasticnet.fit(train_X_scaled, train_y)
y_pred_elasticnet = elasticnet.predict(test_X_scaled)
print("ElasticNet Accuracy:", accuracy_score(train_y, elasticnet.predict(train_X_scaled)))

ElasticNet Accuracy: 0.8127035830618893
