In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Read the raw train/test tables from the current working directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train_updated.csv', 'test_updated.csv')
)

# Keep the label and the identifier columns aside before they are dropped
# from the feature tables.
target = train_df.loc[:, 'RiskFlag']
train_ids = train_df.loc[:, 'ProfileID']
test_ids = test_df.loc[:, 'ProfileID']

# Feature-only tables: no identifier, and no label on the train side.
train_features = train_df.drop(columns=['ProfileID', 'RiskFlag'])
test_features = test_df.drop(columns=['ProfileID'])

# Stack train rows on top of test rows so both receive identical
# preprocessing. The original row labels are kept (no ignore_index), so the
# first len(train_df) rows are the training rows.
combined_df = pd.concat((train_features, test_features))

# Encoding
# QualificationLevel is turned into an ordinal code and the three Yes/No
# columns into 1/0 flags.
education_map = {"High School": 0, "Bachelor's": 1, "Master's": 2, "PhD": 3}
binary_map = {'Yes': 1, 'No': 0}

column_encodings = {
    'QualificationLevel': education_map,
    'OwnsProperty': binary_map,
    'FamilyObligation': binary_map,
    'JointApplicant': binary_map,
}

# Series.map yields NaN for any value that is not a key of its mapping.
combined_df = combined_df.assign(
    **{name: combined_df[name].map(codes) for name, codes in column_encodings.items()}
)


# Feature Engineering
#
# Derives ratio, interaction, log and binned features on combined_df, then
# one-hot encodes the binned and nominal columns. The ratio features are
# built from the raw monetary columns; AnnualEarnings and RequestedSum are
# log-transformed only afterwards.

# Added to every denominator so that a zero divisor cannot produce inf,
# which would make the later scaling step fail.
EPS = 1e-6

# Financial ratios
monthly_income = combined_df['AnnualEarnings'] / 12
combined_df['Loan_to_Income'] = combined_df['RequestedSum'] / (combined_df['AnnualEarnings'] + EPS)
combined_df['EMI_Approx'] = combined_df['RequestedSum'] / (combined_df['RepayPeriod'] + EPS)
combined_df['EMI_to_Income'] = combined_df['EMI_Approx'] / (monthly_income + EPS)
combined_df['Debt_to_Loan'] = combined_df['DebtFactor'] / (combined_df['RequestedSum'] + EPS)
combined_df['Earnings_per_Account'] = combined_df['AnnualEarnings'] / (combined_df['ActiveAccounts'] + EPS)
combined_df['MonthlyIncome'] = monthly_income
# NOTE: algebraically this is the same quantity as EMI_to_Income
# (12 * RequestedSum / (AnnualEarnings * RepayPeriod)), so the two columns
# are near-identical. It is kept so the feature set stays unchanged.
combined_df['Repayment_to_Earnings'] = combined_df['RequestedSum'] / (combined_df['AnnualEarnings'] * combined_df['RepayPeriod'] / 12 + EPS)
combined_df['Margin_after_Debt'] = combined_df['MonthlyIncome'] - (combined_df['DebtFactor'] * combined_df['MonthlyIncome'])

# Interaction features (QualificationLevel is already the ordinal code here)
combined_df['Trust_Qualification'] = combined_df['TrustMetric'] * combined_df['QualificationLevel']
combined_df['WorkDuration_Qualification'] = combined_df['WorkDuration'] * combined_df['QualificationLevel']

# Temporal and count features
combined_df['Log_WorkDuration'] = np.log1p(combined_df['WorkDuration'])
combined_df['Log_ActiveAccounts'] = np.log1p(combined_df['ActiveAccounts'])
# Three WorkDuration buckets: <=12, (12, 60], >60
combined_df['WorkDuration_Bin'] = pd.cut(combined_df['WorkDuration'], bins=[-np.inf, 12, 60, np.inf], labels=[0, 1, 2])

# Age group bins: (0, 30] young, (30, 50] mid, above 50 senior
combined_df['Age_Group'] = pd.cut(combined_df['ApplicantYears'], bins=[0, 30, 50, np.inf], labels=[0, 1, 2])

# Property and JointApplicant interaction (1 only when both flags are 1)
combined_df['Property_JointApplicant'] = combined_df['OwnsProperty'] * combined_df['JointApplicant']

# One-hot encode the two binned columns; drop_first removes the first level
# of each so it acts as the reference category.
combined_df = pd.get_dummies(combined_df, columns=['WorkDuration_Bin', 'Age_Group'], drop_first=True)

# Log-compress the monetary columns now that every ratio above has been
# computed from their raw values.
for col in ['AnnualEarnings', 'RequestedSum']:
    combined_df[col] = np.log1p(combined_df[col])

# One-hot encode the remaining nominal columns, again dropping the first level.
combined_df = pd.get_dummies(combined_df, columns=['WorkCategory', 'RelationshipStatus', 'FundUseCase'], drop_first=True)

# Scaling
# Standardise the int64/float64 columns; columns of any other dtype are left
# untouched.
n_train = len(train_df)  # the first n_train rows of combined_df are the training rows
numeric_cols = combined_df.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
# Learn mean/std from the training rows only, so that statistics of the
# unlabeled test rows do not leak into preprocessing, then apply the same
# transform to every row.
# NOTE: the validation rows split off later are still part of this fit.
scaler.fit(combined_df.iloc[:n_train][numeric_cols])
combined_df[numeric_cols] = scaler.transform(combined_df[numeric_cols])

# Polynomial Features 
# poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
# combined_poly = poly.fit_transform(combined_df)

# Split back by position into the labelled training matrix and the
# submission matrix.
X = combined_df.iloc[:n_train]
X_test_submission = combined_df.iloc[n_train:]
y = target

In [7]:
# Hold out 20% of the labelled rows for validation, stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Class-weighted logistic regression; fit() returns the estimator itself.
log_reg = LogisticRegression(
    max_iter=1000, class_weight='balanced', random_state=42
).fit(X_train, y_train)

# Validation scores taken from column index 1 of predict_proba.
y_proba = log_reg.predict_proba(X_val)[:, 1]

# Threshold notes recorded while tuning (scores as originally noted):
#   0.5  (with basic pre-processing) -> bad score on test data
#   0.65 -> 0.82
#   0.8  -> 0.881
#   0.85 -> 0.886
threshold = 0.9  # Tuned threshold
y_pred = (y_proba >= threshold).astype(int)

val_report = classification_report(y_val, y_pred)
print(val_report)

# Score the submission rows with the same model and the same threshold.
test_probs = log_reg.predict_proba(X_test_submission)[:, 1]
test_predictions = (test_probs >= threshold).astype(int)

submission = pd.DataFrame({'ProfileID': test_ids, 'RiskFlag': test_predictions})
submission.to_csv('submission.csv', index=False)

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     36105
           1       0.64      0.06      0.11      4751

    accuracy                           0.89     40856
   macro avg       0.76      0.53      0.52     40856
weighted avg       0.86      0.89      0.84     40856



In [8]:
# Sweep candidate decision thresholds over the validation probabilities and
# print the full classification report for each one.
thresholds = [0.5, 0.6, 0.7, 0.75, 0.80, 0.85, 0.90, 0.92, 0.95]
for thr in thresholds:
    is_flagged = y_proba >= thr
    y_val_pred = is_flagged.astype(int)
    print(f"\nThreshold: {thr}")
    sweep_report = classification_report(y_val, y_val_pred)
    print(sweep_report)


Threshold: 0.5
              precision    recall  f1-score   support

           0       0.95      0.69      0.80     36105
           1       0.23      0.70      0.34      4751

    accuracy                           0.69     40856
   macro avg       0.59      0.69      0.57     40856
weighted avg       0.86      0.69      0.74     40856


Threshold: 0.6
              precision    recall  f1-score   support

           0       0.93      0.82      0.87     36105
           1       0.28      0.54      0.37      4751

    accuracy                           0.79     40856
   macro avg       0.61      0.68      0.62     40856
weighted avg       0.86      0.79      0.81     40856


Threshold: 0.7
              precision    recall  f1-score   support

           0       0.91      0.91      0.91     36105
           1       0.35      0.36      0.35      4751

    accuracy                           0.85     40856
   macro avg       0.63      0.63      0.63     40856
weighted avg       0.85   

In [9]:
import joblib
# Persist the fitted classifier next to the notebook. dump() returns the list
# of written file names, which the notebook shows as the cell output.
joblib.dump(value=log_reg, filename="logistic_regression_model_nb.pkl")

['logistic_regression_model_nb.pkl']

In [10]:
import joblib
from sklearn.metrics import accuracy_score, classification_report

# Round-trip check: reload the saved classifier and re-score the validation
# split. Relies on X_val, y_val and threshold defined in an earlier cell.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
model = joblib.load("logistic_regression_model_nb.pkl")

y_proba = model.predict_proba(X_val)[:, 1]
y_pred = (y_proba >= threshold).astype(int)

val_accuracy = accuracy_score(y_val, y_pred)
val_report = classification_report(y_val, y_pred)

print("Accuracy:", val_accuracy)
print("\nClassification Report:\n", val_report)

Accuracy: 0.8866996279616213

Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94     36105
           1       0.64      0.06      0.11      4751

    accuracy                           0.89     40856
   macro avg       0.76      0.53      0.52     40856
weighted avg       0.86      0.89      0.84     40856

