In [1]:
# Extract the uploaded dataset archive inside the Colab runtime
import zipfile
import os

# Location of the uploaded archive (update if the filename differs) and the
# directory it should be unpacked into.
zip_path = "/content/GDG TEST DATASETS-20250913T123913Z-1-001.zip"
extract_path = "/content/GDG TEST DATASETS"

# The context manager closes the archive even if extraction fails.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

# Show what landed in the extraction directory
os.listdir(extract_path)

['GDG TEST DATASETS']

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [4]:
import os

# Walk the Colab workspace and print the full path of every file whose name
# contains "loan_train", to discover where the archive was unpacked.
for folder, _subfolders, filenames in os.walk("/content"):
    for filename in filenames:
        if "loan_train" not in filename:
            continue
        print(os.path.join(folder, filename))

/content/GDG TEST DATASETS/GDG TEST DATASETS/loan_train.xlsx


In [5]:
# Read the train and test spreadsheets from the extracted archive.
train = pd.read_excel(
    "/content/GDG TEST DATASETS/GDG TEST DATASETS/loan_train.xlsx"
)
test = pd.read_excel(
    "/content/GDG TEST DATASETS/GDG TEST DATASETS/loan_test.xlsx"
)

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Reload both spreadsheets so this cell starts from the raw files.
train = pd.read_excel("/content/GDG TEST DATASETS/GDG TEST DATASETS/loan_train.xlsx")
test = pd.read_excel("/content/GDG TEST DATASETS/GDG TEST DATASETS/loan_test.xlsx")

# Apply the same imputation and ratio feature to each split.
for frame in (train, test):
    # Missing gender becomes its own explicit 'Unknown' category.
    frame['gender'] = frame['gender'].fillna('Unknown')
    # Requested loan amount relative to annual income.
    frame['debt_to_income'] = frame['loan_amount'] / frame['annual_income']

def bucket_credit(x):
    """Map a credit score to a coarse categorical bucket.

    Parameters
    ----------
    x : number, or a missing value (None / NaN)
        The credit score to classify.

    Returns
    -------
    str
        "Unknown" when the score is missing, "Low" for scores below 500,
        "Medium" for scores from 500 up to (but not including) 650, and
        "High" otherwise.
    """
    # A NaN fails every `<` comparison, so without this guard a missing score
    # would fall through to "High"; None would raise TypeError. Give missing
    # scores their own bucket, mirroring how missing gender is handled.
    if pd.isna(x):
        return "Unknown"
    if x < 500:
        return "Low"
    if x < 650:
        return "Medium"
    return "High"

# Derive the categorical credit bucket for both splits.
train['credit_bucket'] = train['credit_score'].apply(bucket_credit)
test['credit_bucket'] = test['credit_score'].apply(bucket_credit)


cat_cols = ['gender', 'home_ownership', 'purpose', 'credit_bucket']

# Pin every categorical column to the levels observed in the training data
# before one-hot encoding. Encoding the two frames independently with
# drop_first=True drops whichever level sorts first in *each* frame, so a level
# missing from the test file would shift the baseline and silently mis-encode
# the test rows. With a shared category list both frames get the same dummy
# columns and the same dropped baseline; test values never seen in training
# become missing and encode as all zeros.
for col in cat_cols:
    train_levels = pd.Categorical(train[col]).categories
    train[col] = pd.Categorical(train[col], categories=train_levels)
    test[col] = pd.Categorical(test[col], categories=train_levels)

train = pd.get_dummies(train, columns=cat_cols, drop_first=True)
test = pd.get_dummies(test, columns=cat_cols, drop_first=True)


# Separate features from the identifier and the label.
X = train.drop(columns=['id', 'target'])
y = train['target']
X_test_final = test.drop(columns=['id'])
# Safety net: force the test matrix onto the training column set and order.
X, X_test_final = X.align(X_test_final, join='left', axis=1, fill_value=0)


# Split BEFORE fitting the scaler so the validation rows do not contribute to
# the scaling statistics (avoids leaking validation data into preprocessing).
# The split depends only on the row count, y and random_state, so the same rows
# are selected as when splitting the already-scaled matrix.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
# Kept so any later cell that refers to the fully scaled training matrix still works.
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test_final)


# Hyper-parameter grid for the random forest: 3 * 4 * 2 * 2 * 2 = 96 candidates.
param_grid = {
    'n_estimators': [200, 300, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

# Exhaustive search with 3-fold cross-validation on the training split,
# selecting the candidate with the best mean accuracy.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
grid = GridSearchCV(rf, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

best_rf = grid.best_estimator_
# The status strings previously contained mojibake (UTF-8 emoji decoded with a
# single-byte codec); the intended characters are restored here.
print("✅ Best Parameters:", grid.best_params_)


# Score the selected model on the held-out validation split.
y_val_pred = best_rf.predict(X_val)
val_acc = accuracy_score(y_val, y_val_pred)
print("🎯 Validation Accuracy:", val_acc)


# Predict labels for the scaled test features with the tuned model.
final_preds = best_rf.predict(X_test_scaled)


# Pair each test id with its predicted target.
submission = pd.DataFrame({
    "id": test['id'],
    "target": final_preds
})

# Keep the output path in one place so the confirmation message always names
# the file that was actually written (it previously reported a placeholder
# filename that did not match the saved file).
submission_path = "Divyashree Patil_final_submission.csv"
submission.to_csv(submission_path, index=False)
print(f"📂 Submission saved: {submission_path}")
submission.head()

Fitting 3 folds for each of 96 candidates, totalling 288 fits
âœ… Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}
ðŸŽ¯ Validation Accuracy: 0.6583333333333333
ðŸ“‚ Submission saved: YourName_final_submission.csv


Unnamed: 0,id,target
0,6001,0
1,6002,1
2,6003,0
3,6004,0
4,6005,1
