Step 1: Load and Explore the Data
Objective: Load the datasets, check their structure, and get an initial understanding.

In [None]:
import pandas as pd

# Load the two main datasets using pandas' default (UTF-8) decoding.
application_data = pd.read_csv('application_data.csv')
previous_application = pd.read_csv('previous_application.csv')

# The column dictionary may not be UTF-8 encoded. Latin-1 maps every possible
# byte to a character, so this single fallback cannot raise UnicodeDecodeError;
# any further fallback (e.g. iso-8859-1, an alias of latin1) would be unreachable.
try:
    columns_description = pd.read_csv('columns_description.csv', encoding='utf-8')
except UnicodeDecodeError:
    columns_description = pd.read_csv('columns_description.csv', encoding='latin1')

# Check the structure of the datasets.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
application_data.info()
previous_application.info()

# Display first few rows
print(application_data.head())
print(previous_application.head())


Step 2: Data Cleaning (Handling Missing Values & Data Types)
Objective: Identify and handle missing values, fix data types, and drop irrelevant columns.

In [None]:
# Count missing values per column and list only the columns that have any,
# most affected first.
missing_values = application_data.isnull().sum().sort_values(ascending=False)
print(missing_values[missing_values > 0])

# Drop columns with excessive missing values. `thresh` is the minimum number of
# non-missing entries a column needs in order to be kept, so columns that are
# more than 50% empty are removed.
threshold = 0.5 * len(application_data)
# .copy() makes the cleaned frame an independent object, so the column
# assignments below modify it unambiguously (no chained-assignment warnings)
# and never touch the raw `application_data`.
application_data_cleaned = application_data.dropna(thresh=threshold, axis=1).copy()

# Fill missing values with the per-column median for numerical columns
num_cols = application_data_cleaned.select_dtypes(include=['number']).columns
application_data_cleaned[num_cols] = application_data_cleaned[num_cols].fillna(
    application_data_cleaned[num_cols].median()
)

# Fill missing values in categorical (object) columns with an explicit
# "Unknown" category. The actual encoding happens in a later step.
cat_cols = application_data_cleaned.select_dtypes(include=['object']).columns
application_data_cleaned[cat_cols] = application_data_cleaned[cat_cols].fillna("Unknown")

# Display cleaned data. info() prints on its own and returns None, so it is
# not wrapped in print().
application_data_cleaned.info()


Step 3: Handle Categorical Variables (Encoding)
Objective: Convert categorical variables into numerical values for analysis.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Split the categorical columns by cardinality: columns with exactly two
# distinct values are label-encoded in place, all others are one-hot encoded.
binary_cols = []
multi_cols = []
for col in cat_cols:
    if application_data_cleaned[col].nunique() == 2:  # Binary categorical
        binary_cols.append(col)
    else:
        multi_cols.append(col)

# Label encoding turns each binary column into a single 0/1 integer column.
for col in binary_cols:
    application_data_cleaned[col] = LabelEncoder().fit_transform(application_data_cleaned[col])

# One-hot encode every remaining categorical column in a single call, which
# avoids copying the whole DataFrame once per column. The dummy columns are
# appended after the untouched columns, in the order given in `multi_cols`.
# dtype='uint8' keeps the dummies numeric: newer pandas versions default to
# bool, and bool columns are excluded by number-only dtype selection.
if multi_cols:
    application_data_cleaned = pd.get_dummies(
        application_data_cleaned,
        columns=multi_cols,
        drop_first=True,
        dtype='uint8',
    )

print(application_data_cleaned.head())


Step 4: Identify Outliers
Objective: Detect outliers in numerical columns.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of total income; points drawn beyond the whiskers are the
# candidate outliers.
plt.figure(figsize=(10, 5))
income_values = application_data_cleaned['AMT_INCOME_TOTAL']
income_box_ax = sns.boxplot(x=income_values)
income_box_ax.set_title("Box Plot of Income")
plt.show()


Step 5: Data Imbalance Check
Objective: Check if default cases (TARGET variable) are imbalanced.

In [None]:
# Bar chart with the number of rows in each TARGET class
target_count_ax = sns.countplot(data=application_data_cleaned, x='TARGET')
target_count_ax.set_title("Target Variable Distribution")
plt.show()

# Share of rows per TARGET class (normalize=True yields fractions, not counts)
target_column = application_data_cleaned['TARGET']
default_ratio = target_column.value_counts(normalize=True)
print("Class Distribution:\n", default_ratio)


Step 6: Univariate & Bivariate Analysis
Objective: Analyze the relationship of features with loan default.

In [None]:
# Income histogram with a KDE overlay, coloured by TARGET class
plt.figure(figsize=(10, 5))
income_hist_ax = sns.histplot(
    data=application_data_cleaned,
    x="AMT_INCOME_TOTAL",
    hue="TARGET",
    kde=True,
)
income_hist_ax.set_title("Income Distribution by Loan Default")
plt.show()


Step 7: Correlation Analysis
Objective: Identify the features most strongly correlated with default (correlation alone does not establish that a feature causes default).

In [None]:
import numpy as np

# Keep only the numeric and boolean columns for the correlation; nothing is
# converted here, any other column is simply left out. Boolean columns are
# included so that one-hot dummies stored as bool still take part.
application_data_numeric = application_data_cleaned.select_dtypes(include=[np.number, 'bool'])

# Compute the pairwise correlation matrix (DataFrame.corr defaults to Pearson)
correlation_matrix = application_data_numeric.corr()

# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=False)
plt.title("Feature Correlation Heatmap")
plt.show()

# Absolute correlation of every column with TARGET, strongest first.
# `top_corr` still contains TARGET itself (self-correlation of 1.0).
top_corr = correlation_matrix['TARGET'].abs().sort_values(ascending=False)
# Exclude the TARGET row from the report so that ten actual features are
# listed rather than TARGET plus nine features.
print("Top Correlated Features with Default:\n", top_corr.drop('TARGET').head(10))


Step 8: Feature Selection
Objective: Select the most important variables for analysis.

In [None]:
# Take the labels of the first ten entries of `top_corr` and keep just those
# columns of the cleaned data.
# NOTE(review): `top_corr` is ranked by absolute correlation with TARGET, so
# TARGET itself is presumably among these ten labels - confirm that is intended.
selected_features = top_corr.iloc[:10].index
application_data_selected = application_data_cleaned.loc[:, selected_features]

print(application_data_selected.head())


Step 9: Summarizing Insights
Objective: Summarize key findings and their business impact.

Key Questions to Answer:
What are the strongest predictors of loan default?
How does income level affect default risk?
Do certain loan types have a higher default rate?
Are previous loan rejections linked to higher default rates?
Are married individuals less likely to default than single individuals?