# Import Libraries

In [20]:
# %load_ext autoreload
%reload_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from IPython import display

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Run configuration for this notebook.
#
# The commented-out dictionary below is the configuration of the
# feature-extraction stage ("main_feat01"); swap it in to regenerate data.npz.
# params_cfg = {
#     "action"  : "main_feat01",
#     "seed"    : 42, # Set random seed
#     "exp_dir" : os.path.abspath('../../exps'),
#     'exp_name': 'featbase_251028',
#     "data_dir": os.path.abspath("../../data/titanic"),
#     "verbose" : True,
# }
params_cfg = dict(
    action="train_feat01",
    feat_path="../../exps/featbase_251028/data.npz",
    seed=42,  # random seed
    exp_dir=os.path.abspath('../../exps'),
    exp_name='trainbase_251028',
    data_dir=os.path.abspath("../../data/titanic"),
    verbose=True,
)
# Derived entry: every artefact of this experiment goes to <exp_dir>/<exp_name>.
params_cfg["save_dir"] = os.path.abspath(f'{params_cfg["exp_dir"]}/{params_cfg["exp_name"]}')

# Echo the effective configuration, one "+ key: value" line per entry.
for cfg_key, cfg_value in params_cfg.items():
    print(f'+ {cfg_key}: {cfg_value}')

# Expose every configuration entry as a module-level variable
# (later cells read `data_dir`, `save_dir`, ... directly).
globals().update(params_cfg)

+ action: main_feat01
+ seed: 42
+ exp_dir: h:\My Drive\DNTAI\2025\teaching\sgu25k2_machine_learning_basic\ws\exps
+ exp_name: featbase_251028
+ data_dir: h:\My Drive\DNTAI\2025\teaching\sgu25k2_machine_learning_basic\ws\data\titanic
+ verbose: True
+ save_dir: h:\My Drive\DNTAI\2025\teaching\sgu25k2_machine_learning_basic\ws\exps\featbase_251028


# Data Load

In [149]:
# Read the raw train/test tables from the configured data directory.
df_train = pd.read_csv(f'{data_dir}/train.csv')
df_test = pd.read_csv(f'{data_dir}/test.csv')

if params_cfg["verbose"]:
    # Compare the column sets of the two tables.
    train_cols = set(df_train.columns)
    test_cols = set(df_test.columns)
    print("-"*10, "information", "-"*10)
    print(f'train-col: {train_cols}')
    print(f'test-col: {test_cols}')
    # Columns present in both tables (this is an intersection, not a union).
    print("Intersection:", train_cols.intersection(test_cols))
    # Columns present in train but absent from test.
    print("Difference:", train_cols.difference(test_cols))

---------- information ----------
train-col: {'Parch', 'Sex', 'Cabin', 'Age', 'Embarked', 'PassengerId', 'Ticket', 'Survived', 'Fare', 'Name', 'Pclass', 'SibSp'}
test-col: {'Parch', 'Sex', 'Cabin', 'Age', 'Embarked', 'PassengerId', 'Ticket', 'Fare', 'Name', 'Pclass', 'SibSp'}
Union: {'Parch', 'Cabin', 'Sex', 'SibSp', 'Age', 'Embarked', 'PassengerId', 'Ticket', 'Name', 'Pclass', 'Fare'}
Difference: {'Survived'}


# Preprocessing

In [152]:
def preprocessing_feature_01(df_data, is_train = True, is_debug = True, **kwargs):
    """Encode a raw passenger table into a purely numeric feature frame.

    The output columns are produced in this fixed order:
    ``Sex``, ``Age``, ``Fare``, ``Pclass``, ``SibSp``, ``Parch``, ``Cabin``,
    ``Embarked``, ``Surname`` and, only when ``is_train`` is true, ``Output``.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Input table. Must contain the columns ``Sex``, ``Age``, ``Fare``,
        ``Pclass``, ``SibSp``, ``Parch``, ``Cabin``, ``Embarked``, ``Name``
        and, when ``is_train`` is true, ``Survived``.
    is_train : bool, default True
        When true, ``Survived`` is copied into the ``Output`` column.
    is_debug : bool, default True
        When true, print/display diagnostics about ``df_data`` and copy all
        local variables of this call into the module globals.
    **kwargs
        Accepted for call-site compatibility; not used.

    Returns
    -------
    tuple
        ``(df_output, None)`` where ``df_output`` is the encoded DataFrame.

    Raises
    ------
    KeyError
        If ``Sex``, the first letter of ``Cabin`` or ``Embarked`` holds a
        value that is missing from the corresponding lookup table.
    """
    # Code assigned to any title that is not in the lookup table below.
    unknown_title_code = -1

    df_output = pd.DataFrame()

    # Sex: gender, female -> 0, male -> 1
    cls_sex = {'female': 0, 'male' : 1}
    df_output["Sex"] = df_data["Sex"].apply(lambda x: cls_sex[x])
    # Age: missing values are replaced by the median of this table
    df_output["Age"] = df_data["Age"].fillna(df_data["Age"].median())
    # Fare, Pclass, SibSp, Parch: copied as they are
    for name in ['Fare', 'Pclass', 'SibSp', 'Parch']:
        df_output[name] = df_data[name]
    # Fare: impute missing values the same way as Age so that no NaN reaches
    # the model (a no-op when the column has no missing value)
    df_output["Fare"] = df_output["Fare"].fillna(df_data["Fare"].median())
    # Cabin: encoded by its first letter; a missing cabin uses the 'Z' code (0)
    cls_cabin = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'T':8, 'Z':0}
    df_output["Cabin"] = df_data['Cabin'].apply(lambda x: cls_cabin['Z'] if pd.isna(x) else cls_cabin[x[0]])
    # Embarked: a missing value uses the '0' code (0)
    cls_embarked = {'0': 0, 'C':1, 'Q':2, 'S':3}
    df_output["Embarked"] =  df_data['Embarked'].apply(lambda x: cls_embarked['0'] if pd.isna(x) else cls_embarked[x])
    # Surname: despite the column name, this encodes the title that follows
    # the comma in "Last, Title. First ..." (e.g. "Mr.", "Miss.")
    surnames = ['Capt.', 'Col.', 'Don.', 'Dr.', 'Jonkheer.', 'Lady.', 'Major.',
            'Master.', 'Miss.', 'Mlle.', 'Mme.', 'Mr.', 'Mrs.', 'Ms.', 'Rev.', 'Sir.', 'the']
    cls_surnames = dict(zip(surnames, range(len(surnames))))

    def title_code(full_name):
        # First whitespace-separated token after the first comma; names
        # without a comma or with an unlisted title get unknown_title_code
        # instead of raising.
        _, _, after_comma = str(full_name).partition(',')
        tokens = after_comma.split()
        if not tokens:
            return unknown_title_code
        return cls_surnames.get(tokens[0], unknown_title_code)

    df_output["Surname"] = df_data['Name'].apply(title_code)

    if is_train:
        df_output["Output"] = df_data["Survived"]

    if is_debug:
        print("head(5)")
        display.display(df_data.head(5))
        print("isna")
        display.display(df_data.isna().sum())
        # Sex: gender value counts
        print("sex")
        display.display(np.unique(df_data['Sex'], return_counts=True))
        # Age: number of missing values and the median used for imputation
        print(f'Age IsNa: {df_data["Age"].isna().sum()}')
        print(f"Age Median: {df_data['Age'].median()}")
        # Fare
        display.display(df_data["Fare"].describe())
        # Cabin: value counts, missing cabins shown as 'Z0'
        print("-"*10, "Cabin")
        display.display(np.unique(df_data['Cabin'].apply(
            lambda x: 'Z0' if pd.isna(x) else x), return_counts=True))
        # Embarked: value counts, missing values shown as '0'
        display.display(
            np.unique(df_data['Embarked'].apply(lambda x: '0' if pd.isna(x) else x), return_counts=True)
        )
        # Debug aid: publish every local of this call as a module-level name
        # so it can be inspected from other cells. This overwrites any
        # same-named global (e.g. `name`, `df_output`).
        globals().update(**locals())

    return df_output, None

# df_train = pd.read_csv(f'{data_dir}/train.csv')
# preprocessing_feature_01(df_train)

# Train

# Main

In [155]:
def main_feat01(**kwargs):
    """Build the encoded train/test features and save them to ``data.npz``.

    Reads ``train.csv`` and ``test.csv`` from the module-level ``data_dir``,
    encodes each with ``preprocessing_feature_01`` and writes both results to
    ``<save_dir>/data.npz`` under the keys ``train`` and ``test``.

    Parameters
    ----------
    **kwargs
        ``global_cfg`` (optional): a dict that receives every local variable
        of this call once the file has been written. The call below passes
        ``globals()`` so the intermediate frames become notebook variables.
    """
    # load data
    df_train = pd.read_csv(f'{data_dir}/train.csv')
    df_test = pd.read_csv(f'{data_dir}/test.csv')
    # preprocessing: the test features must come from the test table
    # (the training table was previously passed here by mistake)
    df_output_train, _ = preprocessing_feature_01(df_train, is_train = True, is_debug = False)
    df_output_test, _ = preprocessing_feature_01(df_test, is_train = False, is_debug = False)

    # saving
    os.makedirs(save_dir, exist_ok = True)

    np.savez(f'{save_dir}/data.npz', train=df_output_train, test=df_output_test)

    # Hand all locals (frames, kwargs, ...) back to the caller-supplied dict.
    kwargs.get('global_cfg', {}).update(**locals())

# Only run the feature-extraction stage when the configuration asks for it.
if params_cfg["action"] == "main_feat01":
    print("Runing ... [main_feat01]")
    main_feat01(global_cfg = globals())

Runing ... [main_feat01]


# End