# 🧠 Loan Default Classification Project

## Step 1: Load and Explore Data

In [1]:

import pandas as pd

# Load dataset
df = pd.read_csv('loan_data.csv')

# Explore the dataset
print(df.head())
print(df.info())
print(df.describe())
print(df['TARGET'].value_counts(normalize=True))  # Check class imbalance


   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
0      100002       1         Cash loans           M            N   
1      100003       0         Cash loans           F            N   
2      100004       0    Revolving loans           M            Y   
3      100006       0         Cash loans           F            N   
4      100007       0         Cash loans           M            N   

  FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0               Y             0          202500.0    406597.5      24700.5   
1               N             0          270000.0   1293502.5      35698.5   
2               Y             0           67500.0    135000.0       6750.0   
3               Y             0          135000.0    312682.5      29686.5   
4               Y             0          121500.0    513000.0      21865.5   

   AMT_GOODS_PRICE  
0         351000.0  
1        1129500.0  
2         135000.0  
3         297000.0  
4         5

## Step 2: Clean and Encode Data

In [2]:

# Example cleaning: drop irrelevant columns
# df = df.drop(['ID', 'Name'], axis=1)  # Uncomment and customize as needed

# Handle categorical variables
df = pd.get_dummies(df, drop_first=True)

# Fill or drop missing values
df = df.dropna()

# Separate features and target
X = df.drop("TARGET", axis=1)
y = df["TARGET"]


## Step 3: Train-Test Split

In [3]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Step 4: Logistic Regression with p-values

In [10]:
import pandas as pd
import statsmodels.api as sm

# Force X and y to numeric, drop any non-numeric columns and rows with NaNs
X_train_cleaned = X_train.copy().apply(pd.to_numeric, errors='coerce')
y_train_cleaned = pd.to_numeric(y_train, errors='coerce')

# Drop columns in X with any NaNs (likely from failed coercion)
X_train_cleaned = X_train_cleaned.dropna(axis=1)

# Drop rows with NaNs in X or y
combined = pd.concat([X_train_cleaned, y_train_cleaned], axis=1).dropna()

# Separate cleaned X and y again
X_final = combined.drop(columns=['TARGET']) if 'TARGET' in combined.columns else combined.iloc[:, :-1]
y_final = combined['TARGET'] if 'TARGET' in combined.columns else combined.iloc[:, -1]

# Add constant for intercept
X_final_sm = sm.add_constant(X_final)

# Fit logistic regression
logit_model = sm.Logit(y_final, X_final_sm)
result = logit_model.fit()

# Show model summary
print(result.summary())

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

## Step 5: Fit Additional Models (Optional)

In [None]:

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)


## Step 6: Evaluate Performance

In [None]:

from sklearn.metrics import accuracy_score, classification_report

y_pred = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Baseline comparison
baseline = max(y.value_counts()) / len(y)
print("Baseline Accuracy (Majority Class):", baseline)


## Step 7: Summary and Analysis


Write a short analysis here covering:

- Preprocessing steps you applied
- Insights from logistic regression p-values
- Model performance and how it compares to the baseline
- Which features seem most/least useful
- Any other insights you gained
