In [1]:
import sys
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Make the parent directory importable so the `src` package below resolves.
# NOTE(review): '..' is resolved against the current working directory, so this
# presumably expects the kernel to be started one level below the project root — confirm.
_project_root = os.path.abspath('..')
if _project_root not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(_project_root)

from src.data_loader import load_data
from src.config import (
    RANDOM_SEED, FIGURES_DIR, RESULTS_DIR,
    NUMERICAL_FEATURES, CATEGORICAL_FEATURES, ORDINAL_FEATURES,
    TARGET_FEATURE, ALL_FEATURES
)

In [7]:
# Seed NumPy's global RNG with the project-wide seed.
np.random.seed(RANDOM_SEED)

# Plot appearance: seaborn style and context first, then matplotlib defaults.
sns.set_style('whitegrid')
sns.set_context('notebook', font_scale=1.1)
plt.rcParams.update({
    'figure.dpi': 100,          # resolution of figures rendered in the notebook
    'savefig.dpi': 300,         # resolution of figures written to disk
    'figure.figsize': (10, 6),  # default figure size (width, height) in inches
})

In [4]:
# Unpack the pair returned by load_data (imported from src.data_loader) into the train and test frames.
train_df, test_df = load_data()

Loading training data from /Users/kryspin/personal/playground/recruitment/challengING_DS/ing_task/data/train.parquet
Loading test data from /Users/kryspin/personal/playground/recruitment/challengING_DS/ing_task/data/test.parquet
Train shape: (20000, 11)
Test shape: (5000, 11)


In [5]:
# Preview the first rows of the training frame.
display(train_df.head())

Unnamed: 0,Age,Income,CreditScore,LoanAmount,EmploymentYears,NumDependents,DebtToIncome,EducationLevel,FavoriteColor,Hobby,Default
0,59,16785.977212,604.0,19416.230574,3,2,0.912395,3,Green,Traveling,0
1,49,62263.017648,613.0,16902.153253,33,2,0.455917,1,Blue,Traveling,0
2,35,62414.862646,,21639.847552,1,1,0.81757,3,Blue,Traveling,0
3,63,62653.254368,363.0,17216.445061,3,0,0.519046,3,Green,Reading,1
4,28,54601.849437,607.0,22955.590991,22,4,0.197581,1,Yellow,Reading,0


In [6]:
# Preview the first rows of the test frame.
display(test_df.head())

Unnamed: 0,Age,Income,CreditScore,LoanAmount,EmploymentYears,NumDependents,DebtToIncome,EducationLevel,FavoriteColor,Hobby,Default
20000,65,46268.170006,799.0,21006.486795,28,2,0.408134,4,Yellow,Sports,0
20001,60,53019.204347,532.0,16096.908958,13,0,0.689294,1,Blue,Sports,0
20002,68,36639.270263,490.0,21399.505528,18,0,0.440931,3,Blue,Reading,1
20003,44,32326.717615,726.0,23690.410875,6,0,0.981147,4,Red,Reading,0
20004,49,50226.96616,798.0,21018.127774,24,4,0.796241,1,Red,Sports,0


In [7]:
# Row counts of both splits, followed by the share of defaults in the training split.
print(f"Train: {len(train_df)}, Test: {len(test_df)}")
# The output recorded for this cell also reports the train default rate;
# print it here so the code reproduces that output on a fresh run.
print(f"Default rate in train: {train_df['Default'].mean():.2%}")


Train: 20000, Test: 5000
Default rate in train: 34.31%


In [8]:
# Structural overview of both splits: column dtypes, shape, class balance and missing values.

print(">>> TRAIN")
# info() writes its report itself and returns None, so it is called bare:
# wrapping it in print() emitted a stray "None" after the report.
train_df.info()
print(f"Shape: {train_df.shape}")
# The Series repr spans several lines; start it on its own line so the label
# does not run into the first row.
print(f"Default rate:\n{train_df['Default'].value_counts(normalize=True)}")
print(f"Default rate in train: {train_df['Default'].mean():.2%}")
print(f"Missing values:\n{train_df.isnull().sum()}")

print("\n>>> TEST ")
test_df.info()
print(f"Shape: {test_df.shape}")
print(f"Has Default? {'Default' in test_df.columns}")
# Only report the class balance when the test frame actually carries the target column.
if 'Default' in test_df.columns:
    print(f"Default rate:\n{test_df['Default'].value_counts(normalize=True)}")
    print(f"Default rate in test: {test_df['Default'].mean():.2%}")
# Same missing-value report as for the training split.
print(f"Missing values:\n{test_df.isnull().sum()}")

>>> TRAIN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              20000 non-null  int64  
 1   Income           20000 non-null  float64
 2   CreditScore      17617 non-null  float64
 3   LoanAmount       20000 non-null  float64
 4   EmploymentYears  20000 non-null  int64  
 5   NumDependents    20000 non-null  int64  
 6   DebtToIncome     20000 non-null  float64
 7   EducationLevel   20000 non-null  int64  
 8   FavoriteColor    20000 non-null  object 
 9   Hobby            20000 non-null  object 
 10  Default          20000 non-null  int64  
dtypes: float64(4), int64(5), object(2)
memory usage: 1.7+ MB
None
Shape: (20000, 11)
Default rate: Default
0    0.65685
1    0.34315
Name: proportion, dtype: float64
Default rate in train: 34.31%
Missing values:
Age                   0
Income                0
CreditScore        238