In [1]:
import sys
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Make the parent of the current working directory importable so that the
# `src` package used by the imports below can be resolved. The membership
# check keeps the cell idempotent: re-running it no longer appends a
# duplicate entry to sys.path each time.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data_loader import load_data
from src.config import (
    RANDOM_SEED,
    FIGURES_DIR,
    RESULTS_DIR,
    NUMERICAL_FEATURES,
    CATEGORICAL_FEATURES,
    ORDINAL_FEATURES,
    TARGET_FEATURE,
    ALL_FEATURES
)

# Seed NumPy's global random state with the project-wide seed.
np.random.seed(RANDOM_SEED)

# Plot appearance: seaborn grid style, 100 dpi for inline figures and
# 300 dpi for figures written to disk.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.dpi': 100,
    'savefig.dpi': 300,
})

In [2]:
# Load both splits through the project loader, then report their
# dimensions and the training column names.
train_df, test_df = load_data()

print(
    f"Train shape: {train_df.shape}",
    f"Test shape: {test_df.shape}",
    f"\nTrain columns: {list(train_df.columns)}",
    sep="\n",
)

Loading training data from /Users/kryspin/personal/playground/recruitment/challengING_DS/ing_task/data/train.parquet
Loading test data from /Users/kryspin/personal/playground/recruitment/challengING_DS/ing_task/data/test.parquet
Train shape: (20000, 11)
Test shape: (5000, 11)
Train shape: (20000, 11)
Test shape: (5000, 11)

Train columns: ['Age', 'Income', 'CreditScore', 'LoanAmount', 'EmploymentYears', 'NumDependents', 'DebtToIncome', 'EducationLevel', 'FavoriteColor', 'Hobby', 'Default']


In [3]:
# Build one boolean mask per column so the counts and the example filter
# below are guaranteed to agree.
income_negative = train_df['Income'] < 0
loan_negative = train_df['LoanAmount'] < 0

print("Negative values BEFORE fix:")
print(f"Income < 0: {income_negative.sum()} records")
print(f"LoanAmount < 0: {loan_negative.sum()} records")

# Show examples whenever EITHER column contains a negative value. The guard
# previously tested Income only, so rows with a negative LoanAmount would
# not have been shown if Income happened to be clean.
any_negative = income_negative | loan_negative
if any_negative.any():
    print("\nExample negative values:")
    negative_records = train_df[any_negative]
    print(negative_records[['Income', 'LoanAmount', 'Default']].head())

Negative values BEFORE fix:
Income < 0: 11 records
LoanAmount < 0: 2 records

Example negative values:
            Income    LoanAmount  Default
2286  -4528.003413  24682.121752        0
3477  19346.924692   -788.669387        1
5338    -38.523242  13657.715739        0
6798  -7845.629939  19616.180081        1
6976    -23.390659  16819.988884        0


In [5]:
def fix_negative_values(df, columns=('Income', 'LoanAmount')):
    """
    Replace negative values with their absolute value in the given columns.

    Based on Phase 1 EDA: 13 records had accidental negative signs in
    Income / LoanAmount. Magnitudes are plausible when converted to positive.

    Args:
        df: Input DataFrame. It is not modified; the fix is applied to a copy.
        columns: Column name, or iterable of column names, to convert to
            absolute values. Defaults to ('Income', 'LoanAmount'), which is
            the original hard-coded behaviour.

    Returns:
        A new DataFrame in which every listed column holds absolute values.
        All other columns are left as they were.

    Raises:
        KeyError: If a listed column is not present in ``df``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = (columns,)

    fixed = df.copy()
    for column in columns:
        fixed[column] = fixed[column].abs()
    return fixed

# Run the sign correction over both splits and rebind the original names,
# which the following cells keep using.
train_df, test_df = map(fix_negative_values, (train_df, test_df))

print("Quality fixes applied")

Quality fixes applied


In [6]:
# Re-count negatives in the training split now that the fix has been applied.
print("Negative values AFTER fix:")
print(f"Income < 0: {train_df['Income'].lt(0).sum()} records")
print(f"LoanAmount < 0: {train_df['LoanAmount'].lt(0).sum()} records")

# Same counts for the test split.
print(f"\nTest set - Income < 0: {test_df['Income'].lt(0).sum()} records")
print(f"Test set - LoanAmount < 0: {test_df['LoanAmount'].lt(0).sum()} records")

# Hard stop if any negative value survived in either split.
assert not train_df['Income'].lt(0).any(), "Still have negative Income values!"
assert not train_df['LoanAmount'].lt(0).any(), "Still have negative LoanAmount values!"
assert not test_df['Income'].lt(0).any(), "Test set has negative Income values!"
assert not test_df['LoanAmount'].lt(0).any(), "Test set has negative LoanAmount values!"

print("\nAll assertions passed - no negative values remain")

Negative values AFTER fix:
Income < 0: 0 records
LoanAmount < 0: 0 records

Test set - Income < 0: 0 records
Test set - LoanAmount < 0: 0 records

All assertions passed - no negative values remain


In [7]:
# Separate the target column from the predictors for each split.
y_train = train_df[TARGET_FEATURE]
X_train = train_df.drop(columns=[TARGET_FEATURE])

y_test = test_df[TARGET_FEATURE]
X_test = test_df.drop(columns=[TARGET_FEATURE])

# Report the resulting dimensions, with a blank line between the two splits.
print(
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    f"\nX_test shape: {X_test.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

# Relative frequency of each target class in the training split.
print("\ny_train distribution:")
print(y_train.value_counts(normalize=True))

X_train shape: (20000, 10)
y_train shape: (20000,)

X_test shape: (5000, 10)
y_test shape: (5000,)

y_train distribution:
Default
0    0.65685
1    0.34315
Name: proportion, dtype: float64


In [10]:
# Per-column count of missing entries; only columns with at least one
# missing value are printed.
print("Missing values in X_train:")
missing_train = X_train.isna().sum()
print(missing_train.loc[missing_train > 0])

# CreditScore missing count, taken from the per-column totals computed above
cs_sum = missing_train['CreditScore']

print(f"\nCreditScore missing: {cs_sum} / {len(X_train)} ({cs_sum / len(X_train) * 100:.2f}%)")

Missing values in X_train:
CreditScore    2383
dtype: int64

CreditScore missing: 2383 / 20000 (11.92%)
