In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

test_df = pd.read_csv('test_updated.csv')
train_df = pd.read_csv('train_updated.csv')

target = train_df['RiskFlag']
train_ids = train_df['ProfileID']
test_ids = test_df['ProfileID']

train_features = train_df.drop(columns=['RiskFlag', 'ProfileID'])
test_features = test_df.drop(columns=['ProfileID'])

combined_df = pd.concat([train_features, test_features], axis=0)

# Feature Engineering
combined_df['Loan_to_Income'] = combined_df['RequestedSum'] / combined_df['AnnualEarnings']
combined_df['EMI_Approx'] = combined_df['RequestedSum'] / combined_df['RepayPeriod']
combined_df['EMI_to_Income'] = combined_df['EMI_Approx'] / (combined_df['AnnualEarnings'] / 12)

for col in ['AnnualEarnings', 'RequestedSum']:
    combined_df[col] = np.log1p(combined_df[col])

# Encoding 
education_map = {"High School": 0, "Bachelor's": 1, "Master's": 2, "PhD": 3}
combined_df['QualificationLevel'] = combined_df['QualificationLevel'].map(education_map)

binary_map = {'Yes': 1, 'No': 0}
for col in ['OwnsProperty', 'FamilyObligation', 'JointApplicant']:
    combined_df[col] = combined_df[col].map(binary_map)

combined_df = pd.get_dummies(combined_df, columns=['WorkCategory', 'RelationshipStatus', 'FundUseCase'], drop_first=True)

numeric_cols = combined_df.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
combined_df[numeric_cols] = scaler.fit_transform(combined_df[numeric_cols])

X = combined_df[:len(train_df)]
X_test_submission = combined_df[len(train_df):]
y = target

In [3]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.utils import resample

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.8, random_state=42, stratify=y)
train_data = pd.concat([X_train, y_train], axis=1)
majority = train_data[train_data.RiskFlag == 0]
minority = train_data[train_data.RiskFlag == 1]

minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)
train_upsampled = pd.concat([majority, minority_upsampled])

X_train_up = train_upsampled.drop('RiskFlag', axis=1)
y_train_up = train_upsampled.RiskFlag

mlp = MLPClassifier(hidden_layer_sizes=(128, 64, 32), 
                    activation='relu', 
                    solver='adam', 
                    alpha=0.01,                 
                    batch_size=128,             
                    learning_rate='adaptive',   
                    max_iter=300, 
                    early_stopping=True,
                    random_state=42)

mlp.fit(X_train_up, y_train_up)

y_proba_val = mlp.predict_proba(X_val)[:, 1]

best_threshold = 0.5
best_f1 = 0

for t in np.arange(0.1, 1.0, 0.05):
    preds = (y_proba_val >= t).astype(int)
    score = f1_score(y_val, preds)
    if score > best_f1:
        best_f1 = score
        best_threshold = t

print(f"Best Threshold Found: {best_threshold}")
print(f"Best F1 Score: {best_f1}")

y_pred_optimized = (y_proba_val >= best_threshold).astype(int)
print("Optimized Classification Report")
print(classification_report(y_val, y_pred_optimized))

test_probs = mlp.predict_proba(X_test_submission)[:, 1]
test_predictions = (test_probs >= best_threshold).astype(int)

submission = pd.DataFrame({'ProfileID': test_ids, 'RiskFlag': test_predictions})
submission.to_csv('nn_submission.csv', index=False)

Best Threshold Found: 0.1
Best F1 Score: 0.2530605702378329
Optimized Classification Report
              precision    recall  f1-score   support

           0       0.90      0.83      0.87    144420
           1       0.20      0.33      0.25     19002

    accuracy                           0.77    163422
   macro avg       0.55      0.58      0.56    163422
weighted avg       0.82      0.77      0.79    163422



In [4]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.8, random_state=42, stratify=y)

svm_model = SVC(
    kernel='rbf',
    C=10,
    gamma=0.01,
    class_weight='balanced',  
    random_state=42,
    probability=True
)

svm_model.fit(X_train, y_train)

val_preds = svm_model.predict(X_val)
val_probs = svm_model.predict_proba(X_val)[:, 1]

print("Confusion Matrix:\n", confusion_matrix(y_val, val_preds))
print("\nClassification Report:\n", classification_report(y_val, val_preds))

test_preds = svm_model.predict(X_test_submission)

submission = pd.DataFrame({
    'ProfileID': test_ids,
    'RiskFlag': test_preds
})

submission.to_csv('svm_submission.csv', index=False)
print("\nSubmission saved!")

Confusion Matrix:
 [[103339  41081]
 [  6687  12315]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.72      0.81    144420
           1       0.23      0.65      0.34     19002

    accuracy                           0.71    163422
   macro avg       0.58      0.68      0.58    163422
weighted avg       0.86      0.71      0.76    163422


Submission saved!
