In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path

# Plot style: switch seaborn to its "whitegrid" style once, up front, so the
# figures drawn later in the notebook share the same look.
sns.set(style="whitegrid")

# Data location: folder that holds the input CSV files. The path is relative,
# so it is resolved against the kernel's current working directory.
DATA_DIR = Path("../../data")

## 1. Load Data

In [None]:
# Read the training table from DATA_DIR into a DataFrame.
app_train = pd.read_csv(DATA_DIR / "application_train.csv")
# Report (rows, columns) so a wrong or truncated file is noticed right away.
print(f"Training data shape: {app_train.shape}")
# Last expression of the cell: the first five rows become the cell output.
app_train.head()

## 2. Target Distribution
Check the balance of the target variable (0: Repaid, 1: Default).

In [None]:
# Bar chart with one bar per TARGET class, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=app_train, x='TARGET', ax=ax)
ax.set_title('Target Variable Distribution')
plt.show()

# Same information as numbers: normalize=True turns the per-class counts
# into fractions of all rows.
target_shares = app_train['TARGET'].value_counts(normalize=True)
print(target_shares)

## 3. Missing Values
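Analyze the columns with a high percentage of missing values. Note that the `Percent` column in the table below is a fraction between 0 and 1, not a value out of 100.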

In [None]:
# Count the nulls per column ONCE. Building the boolean null mask walks the
# whole frame, so the counts are reused below instead of being recomputed.
null_counts = app_train.isnull().sum()

# Columns ordered from most to fewest missing values.
missing = null_counts.sort_values(ascending=False)

# Fraction of rows that are missing in each column (0.0 - 1.0, not 0 - 100).
# Every column has len(app_train) rows, so that is the common denominator.
missing_percent = (null_counts / len(app_train)).sort_values(ascending=False)

# Side-by-side summary, aligned on column name: absolute count and fraction.
missing_data = pd.concat([missing, missing_percent], axis=1, keys=['Total', 'Percent'])

# Show the 20 columns with the most missing values.
missing_data.head(20)

## 4. Key Features Analysis
Analyze a few features known to be important: the `EXT_SOURCE_1`, `EXT_SOURCE_2` and `EXT_SOURCE_3` columns, and `DAYS_BIRTH`.

In [None]:
# Data: the three EXT_SOURCE columns plus TARGET, and their pairwise
# correlation matrix (DataFrame.corr with its default settings).
ext_sources = app_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'TARGET']]
ext_sources_corr = ext_sources.corr()

# Figure: heatmap of that matrix with each coefficient written in its cell.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(ext_sources_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation of EXT_SOURCE features with TARGET')
plt.show()

In [None]:
# Overlay one age-density curve per TARGET class on a single Axes.
fig, ax = plt.subplots(figsize=(10, 6))
for target_value, curve_label in ((0, 'Target == 0'), (1, 'Target == 1')):
    # Dividing DAYS_BIRTH by -365 rescales it to years and flips its sign.
    # NOTE(review): presumably DAYS_BIRTH is stored as a negative day count,
    # which would make the result a positive age - confirm against the data.
    age_years = app_train.loc[app_train['TARGET'] == target_value, 'DAYS_BIRTH'] / -365
    sns.kdeplot(age_years, label=curve_label, ax=ax)

ax.set_xlabel('Age (years)')
ax.set_ylabel('Density')
ax.set_title('Distribution of Ages')
ax.legend()
plt.show()