In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load the raw train/test tables from the current working directory.
train_df = pd.read_csv('train_updated.csv')
test_df = pd.read_csv('test_updated.csv')

# Keep the label and the row identifiers aside before they are dropped below.
target = train_df['RiskFlag']
train_ids, test_ids = train_df['ProfileID'], test_df['ProfileID']

# Feature-only frames: label + id are removed from train, id from test.
train_features = train_df.drop(['RiskFlag', 'ProfileID'], axis=1)
test_features = test_df.drop('ProfileID', axis=1)

# Stack train rows on top of test rows so both halves go through the same
# preprocessing. Row labels are not reset, so index values repeat across the
# two halves; the later split back into train/test is positional.
combined_df = pd.concat([train_features, test_features])

# Feature Engineering
# The ratio features are built from the raw amounts, so they are added first;
# the log transform below then overwrites the two amount columns.
combined_df = combined_df.assign(
    # Requested amount relative to yearly income.
    Loan_to_Income=lambda d: d['RequestedSum'] / d['AnnualEarnings'],
    # Requested amount spread evenly over the repayment period.
    EMI_Approx=lambda d: d['RequestedSum'] / d['RepayPeriod'],
    # Per-period instalment relative to monthly income (annual / 12).
    EMI_to_Income=lambda d: d['EMI_Approx'] / (d['AnnualEarnings'] / 12),
)

# Replace each amount column with log(1 + x), keeping its position in the frame.
combined_df = combined_df.assign(
    **{
        amount_col: np.log1p(combined_df[amount_col])
        for amount_col in ('AnnualEarnings', 'RequestedSum')
    }
)

# Encoding
# Ordinal code for the education level and 0/1 code for the yes/no flags.
education_map = {
    "High School": 0,
    "Bachelor's": 1,
    "Master's": 2,
    "PhD": 3,
}
binary_map = {'Yes': 1, 'No': 0}

# Column -> lookup table, applied in this order.
label_encodings = {
    'QualificationLevel': education_map,
    'OwnsProperty': binary_map,
    'FamilyObligation': binary_map,
    'JointApplicant': binary_map,
}
for col, mapping in label_encodings.items():
    # Series.map yields NaN for any label that is missing from the mapping.
    combined_df[col] = combined_df[col].map(mapping)

# One-hot encode the remaining categorical columns; drop_first removes the
# first level of each to avoid a redundant indicator.
nominal_cols = ['WorkCategory', 'RelationshipStatus', 'FundUseCase']
combined_df = pd.get_dummies(combined_df, columns=nominal_cols, drop_first=True)

# Standardise the numeric columns. Only int64/float64 columns are selected;
# columns of any other dtype are left untouched.
numeric_cols = combined_df.select_dtypes(include=['int64', 'float64']).columns

# The first len(train_df) rows of combined_df are the training rows (train was
# concatenated first). The split is positional because the index repeats.
n_train = len(train_df)

# Fit the scaler on the training rows ONLY, then apply it to every row.
# Fitting on the concatenated train+test frame would let test-set statistics
# (mean / std) leak into the preprocessing.
scaler = StandardScaler()
scaler.fit(combined_df.iloc[:n_train][numeric_cols])
combined_df[numeric_cols] = scaler.transform(combined_df[numeric_cols])

# Split back into the labelled training matrix and the submission matrix.
X = combined_df.iloc[:n_train]
X_test_submission = combined_df.iloc[n_train:]
y = target

# Sanity checks: features must line up row-for-row with labels / ids.
assert len(X) == len(y), "training features and labels differ in length"
assert len(X_test_submission) == len(test_ids), "test features and ids differ in length"

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Hold out 20% of the labelled rows for validation. stratify=y keeps the
# class proportions of y the same in both partitions; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Random-forest hyperparameters, gathered in one place.
rf_params = dict(
    n_estimators=400,                   # number of trees
    max_depth=None,                     # no explicit depth cap
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',                # features considered per split
    n_jobs=-1,                          # use all available cores
    random_state=42,
    class_weight='balanced_subsample',  # reweight classes per bootstrap sample
)

rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)
# Hard class predictions for the held-out validation rows.
y_val_pred = rf.predict(X_val)

# NOTE(review): in the recorded run below, recall for class 1 is 0.08 while
# accuracy is 0.88 -- accuracy alone hides that most positives are missed.
val_confusion = confusion_matrix(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print("Confusion Matrix:", val_confusion, sep="\n")
print("\nClassification Report:", val_report, sep="\n")

# Predict the class for every unlabelled test row using the fitted forest.
test_pred = rf.predict(X_test_submission)

# Two-column submission table: the id column first, then the predicted flag.
rf_submission = pd.DataFrame({'ProfileID': test_ids}).assign(RiskFlag=test_pred)

# Write without the index so the file holds only the two columns above.
rf_submission.to_csv('rf_submission.csv', index=False)
print("Saved rf_submission.csv")

Confusion Matrix:
[[35749   356]
 [ 4360   391]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     36105
           1       0.52      0.08      0.14      4751

    accuracy                           0.88     40856
   macro avg       0.71      0.54      0.54     40856
weighted avg       0.85      0.88      0.85     40856

Saved rf_submission.csv
