In [8]:
import os, json, pathlib as pl
import numpy as np
import pandas as pd

# Locations of the raw Home Credit application tables (relative to the working directory).
RAW = pl.Path("data", "raw", "home_credit")
TRAIN = RAW.joinpath("application_train.csv")
TEST = RAW.joinpath("application_test.csv")

In [10]:
# Load the train and test application tables (read in that order) into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN, TEST))

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Class imbalance check.
# TARGET holds 0/1 labels, so its mean is the share of positive (TARGET == 1) rows.
pos_rate = train["TARGET"].mean()
(
    train["TARGET"].value_counts(dropna=False),  # per-class row counts, NaN included
    pos_rate,
)

(TARGET
 0    282686
 1     24825
 Name: count, dtype: int64,
 np.float64(0.08072881945686496))

In [13]:
# Duplicates and overlap check
ID_COL = "SK_ID_CURR"
# Rows in train whose ID already appeared in an earlier row.
dups = train[ID_COL].duplicated().sum()
# Number of distinct IDs that occur in both train and test.
overlap = np.intersect1d(train[ID_COL].to_numpy(), test[ID_COL].to_numpy()).size
dups, overlap

(np.int64(0), 0)

In [15]:
# Number of numeric vs. categorical feature columns.
# The ID column and the label are not features, so both lists exclude them.
target_and_id = {ID_COL, "TARGET"}
# Index.difference returns a sorted Index, so num_cols is in alphabetical order.
num_cols = train.select_dtypes(include=[np.number]).columns.difference(target_and_id)
# Build the exclusion set once instead of rebuilding it for every column in the comprehension.
non_cat_cols = set(num_cols).union(target_and_id)
# Everything that is neither numeric nor ID/label; kept in the frame's original column order.
cat_cols = [c for c in train.columns if c not in non_cat_cols]
len(num_cols), len(cat_cols), cat_cols[:15]

(104,
 16,
 ['NAME_CONTRACT_TYPE',
  'CODE_GENDER',
  'FLAG_OWN_CAR',
  'FLAG_OWN_REALTY',
  'NAME_TYPE_SUITE',
  'NAME_INCOME_TYPE',
  'NAME_EDUCATION_TYPE',
  'NAME_FAMILY_STATUS',
  'NAME_HOUSING_TYPE',
  'OCCUPATION_TYPE',
  'WEEKDAY_APPR_PROCESS_START',
  'ORGANIZATION_TYPE',
  'FONDKAPREMONT_MODE',
  'HOUSETYPE_MODE',
  'WALLSMATERIAL_MODE'])

In [16]:
# Cardinality of the categorical features: distinct values per column, with NaN
# counted as its own value (dropna=False), largest first.
card = (
    train.loc[:, cat_cols]
    .nunique(dropna=False)
    .sort_values(ascending=False)
)
card.head(20)

ORGANIZATION_TYPE             58
OCCUPATION_TYPE               19
NAME_INCOME_TYPE               8
NAME_TYPE_SUITE                8
WALLSMATERIAL_MODE             8
WEEKDAY_APPR_PROCESS_START     7
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
NAME_EDUCATION_TYPE            5
FONDKAPREMONT_MODE             5
HOUSETYPE_MODE                 4
CODE_GENDER                    3
EMERGENCYSTATE_MODE            3
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_CONTRACT_TYPE             2
dtype: int64

In [17]:
# Missing rates: fraction of NaN cells per column, highest first.
missing_pct = train.isna().mean().sort_values(ascending=False)
# Keep only the columns that actually have missing values, as a one-column frame.
missing_summary = missing_pct[missing_pct > 0].to_frame("missing_rate")
missing_summary.head(20)

Unnamed: 0,missing_rate
COMMONAREA_AVG,0.698723
COMMONAREA_MODE,0.698723
COMMONAREA_MEDI,0.698723
NONLIVINGAPARTMENTS_MEDI,0.69433
NONLIVINGAPARTMENTS_MODE,0.69433
NONLIVINGAPARTMENTS_AVG,0.69433
FONDKAPREMONT_MODE,0.683862
LIVINGAPARTMENTS_AVG,0.68355
LIVINGAPARTMENTS_MEDI,0.68355
LIVINGAPARTMENTS_MODE,0.68355


In [18]:
# Constant features check: columns with at most one distinct value.
# nunique() ignores NaN by default, so an all-NaN column (0 distinct values) is flagged too.
nunique = train.nunique()
constant_cols = nunique.loc[nunique.le(1)].index.tolist()
len(constant_cols), constant_cols[:10]

(0, [])

In [19]:
# K-folds: build a seeded, stratified 5-fold split and persist it for later experiments.
from sklearn.model_selection import StratifiedKFold

X = train.drop(columns=[ID_COL, "TARGET"])
y = train["TARGET"].values

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2501)

# A comprehension keeps the loop names scoped to it. A top-level `for` loop that uses `_`
# as the throwaway name rebinds `_` in the kernel namespace, after which IPython stops
# refreshing its `_` / `__` / `___` last-output shortcuts.
# Only the second element of each split (the validation indices) is stored. These are row
# positions, so they stay valid only while `train` keeps this exact row order.
folds = [
    {"fold": fold_no, "val_index": val_rows.tolist()}
    for fold_no, (_train_rows, val_rows) in enumerate(cv.split(X, y))
]

os.makedirs("experiments/cv", exist_ok=True)          # experiments/ is gitignored
with open("experiments/cv/home_credit_folds.json", "w") as f:
    json.dump(folds, f)

# Number of folds and the validation-set size of each one.
len(folds), [len(entry["val_index"]) for entry in folds]

(5, [61503, 61502, 61502, 61502, 61502])

Metrics: ROC-AUC and PR-AUC (because of the class imbalance: only ~8% of the training rows are positive)