In [1]:
# An advance over prep_04: adds the SibSp and Parch features to the model.

In [2]:
import pandas as pd
from dotenv import load_dotenv
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime
import pickle
from sklearn.impute import SimpleImputer

# Load the variables declared in the project's .env file into the environment.
load_dotenv()

# Switch to the project root before anything else runs. The paths read from
# the environment later on look relative to it (presumably the `src` import
# below relies on this too -- TODO confirm).
# os.environ[...] is used instead of os.getenv(...) so that a missing ROOT_DIR
# fails with a KeyError naming the variable, rather than the opaque TypeError
# that os.chdir(None) would raise.
os.chdir(os.environ['ROOT_DIR'])

from src import utils

In [3]:
# --- PARAMS --- #
# Location of the raw training CSV, taken from the DF_TRAIN_PATH variable
# (None when the variable is not set).
train_path = os.environ.get('DF_TRAIN_PATH')
print(train_path)

data/raw/train.csv


In [4]:
# --- LOAD DATA --- #
# Read the raw training CSV found at `train_path` and preview its first two rows.
train_data = pd.read_csv(filepath_or_buffer=train_path)
train_data.head(n=2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [5]:
# --- TRAIN TEST SPLIT --- #
# Columns used as model inputs and the column to predict.
features = ['Pclass', 'Sex', 'Age', 'Name', 'SibSp', 'Parch']
target = ['Survived']

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    train_data[features],
    train_data[target],
    test_size=0.2,
    random_state=0,
)

# Report the shape of each of the four resulting frames on one line.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(712, 6) (179, 6) (712, 1) (179, 1)


In [6]:
# --- SEX FEATURE PREPROCESSING --- #

# One-hot encode 'Sex'; the encoder is fitted on the training split only.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
sex_train_values = x_train['Sex'].to_numpy().reshape(-1, 1)
encoder_sex_model = encoder.fit(sex_train_values)

# Persist the fitted encoder in the artifact directory for this notebook
# (ARTIFACTS_PATH joined with whatever utils.get_nb_name() returns).
artifact_path = os.path.join(os.getenv('ARTIFACTS_PATH'), utils.get_nb_name())
os.makedirs(artifact_path, exist_ok=True)
encoder_sex_file = os.path.join(artifact_path, 'encoder_sex_model.pkl')

with open(encoder_sex_file, 'wb') as f:
    pickle.dump(encoder_sex_model, f)

print(f'Encoder sex model saved at {artifact_path}')

# Reload the pickled encoder so the transform below runs on the saved artifact.
with open(encoder_sex_file, 'rb') as f:
    encoder_sex_model = pickle.load(f)

# Same treatment for both splits: append the encoded columns, then drop the
# raw 'Sex' column in place.
for split in (x_train, x_test):
    split[encoder_sex_model.get_feature_names_out(['Sex'])] = encoder_sex_model.transform(
        split['Sex'].to_numpy().reshape(-1, 1)
    )
    split.drop('Sex', axis=1, inplace=True)

x_train.head(2)

Encoder sex model saved at artifacts/prep_05


Unnamed: 0,Pclass,Age,Name,SibSp,Parch,Sex_female,Sex_male
140,3,,"Boulos, Mrs. Joseph (Sultana)",0,2,1.0,0.0
439,2,31.0,"Kvillner, Mr. Johan Henrik Johannesson",0,0,0.0,1.0


In [7]:
# --- PCLASS FEATURE PREPROCESSING --- #

# One-hot encode 'Pclass'; the encoder is fitted on the training split only.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
pclass_train_values = x_train['Pclass'].to_numpy().reshape(-1, 1)
encoder_pclass_model = encoder.fit(pclass_train_values)

# Persist the fitted encoder in the artifact directory for this notebook
# (ARTIFACTS_PATH joined with whatever utils.get_nb_name() returns).
artifact_path = os.path.join(os.getenv('ARTIFACTS_PATH'), utils.get_nb_name())
os.makedirs(artifact_path, exist_ok=True)
encoder_pclass_file = os.path.join(artifact_path, 'encoder_pclass_model.pkl')

with open(encoder_pclass_file, 'wb') as f:
    pickle.dump(encoder_pclass_model, f)

print(f'Encoder pclass model saved at {artifact_path}')

# Reload the pickled encoder so the transform below runs on the saved artifact.
with open(encoder_pclass_file, 'rb') as f:
    encoder_pclass_model = pickle.load(f)

# Same treatment for both splits: append the encoded columns, then drop the
# raw 'Pclass' column in place.
for split in (x_train, x_test):
    split[encoder_pclass_model.get_feature_names_out(['Pclass'])] = encoder_pclass_model.transform(
        split['Pclass'].to_numpy().reshape(-1, 1)
    )
    split.drop('Pclass', axis=1, inplace=True)

x_train.head(2)

Encoder pclass model saved at artifacts/prep_05


Unnamed: 0,Age,Name,SibSp,Parch,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3
140,,"Boulos, Mrs. Joseph (Sultana)",0,2,1.0,0.0,0.0,0.0,1.0
439,31.0,"Kvillner, Mr. Johan Henrik Johannesson",0,0,0.0,1.0,0.0,1.0,0.0


In [8]:
# --- AGE FEATURE PREPROCESSING --- #

# Fill missing ages using a mean-strategy imputer fitted on the training
# split only.
imputer_age_mean = SimpleImputer(strategy='mean')
age_train_values = x_train['Age'].values.reshape(-1, 1)
imputer_age_mean.fit(age_train_values)

# Persist the fitted imputer in the artifact directory for this notebook.
artifact_path = os.path.join(os.getenv('ARTIFACTS_PATH'), utils.get_nb_name())
os.makedirs(artifact_path, exist_ok=True)
imputer_age_file = os.path.join(artifact_path, 'imputer_age_mean.pkl')

with open(imputer_age_file, 'wb') as f:
    pickle.dump(imputer_age_mean, f)

# Overwrite 'Age' in both splits with the imputed values.
for split in (x_train, x_test):
    split['Age'] = imputer_age_mean.transform(split['Age'].values.reshape(-1, 1))

x_train.head(2)


Unnamed: 0,Age,Name,SibSp,Parch,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3
140,29.745184,"Boulos, Mrs. Joseph (Sultana)",0,2,1.0,0.0,0.0,0.0,1.0
439,31.0,"Kvillner, Mr. Johan Henrik Johannesson",0,0,0.0,1.0,0.0,1.0,0.0


In [9]:
# --- NAME FEATURE PREPROCESSING --- #

# Captures the first run of word characters that follows the comma in 'Name'.
regex_title = r',\s*(\w*)'

# Collapses rare or variant titles into a smaller set of groups.
# NOTE(review): 'the' is presumably what the regex captures from a multi-word
# title such as "the Countess" -- verify against the data.
title_mapping = {
    'Mme': 'Mrs',
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Don': 'Sir',
    'Jonkheer': 'Sir',
    'Major': 'Military',
    'Capt': 'Military',
    'the': 'Mrs',
    'Lady': 'Mrs',
    'Col': 'Military',
}

# Derive a normalised 'Title' column in both splits.
for split in (x_train, x_test):
    split['Title'] = split['Name'].str.extract(regex_title)
    split['Title'] = split['Title'].replace(title_mapping)

# One-hot encode 'Title'; the encoder is fitted on the training split only.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
title_train_values = x_train['Title'].to_numpy().reshape(-1, 1)
encoder_title_model = encoder.fit(title_train_values)

# Persist the fitted encoder in the artifact directory for this notebook.
artifact_path = os.path.join(os.getenv('ARTIFACTS_PATH'), utils.get_nb_name())
os.makedirs(artifact_path, exist_ok=True)
encoder_title_file = os.path.join(artifact_path, 'encoder_title_model.pkl')

with open(encoder_title_file, 'wb') as f:
    pickle.dump(encoder_title_model, f)

print(f'Encoder title model saved at {artifact_path}')

# Reload the pickled encoder so the transform below runs on the saved artifact.
with open(encoder_title_file, 'rb') as f:
    encoder_title_model = pickle.load(f)

# Same treatment for both splits: append the encoded columns, then drop the
# intermediate 'Title' column and the raw 'Name' column in place.
for split in (x_train, x_test):
    split[encoder_title_model.get_feature_names_out(['Title'])] = encoder_title_model.transform(
        split['Title'].to_numpy().reshape(-1, 1)
    )
    split.drop(['Title', 'Name'], axis=1, inplace=True)

Encoder title model saved at artifacts/prep_05


In [10]:
# Preview the first two rows of the fully preprocessed training features.
x_train.head(n=2)

Unnamed: 0,Age,SibSp,Parch,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Title_Dr,Title_Master,Title_Military,Title_Miss,Title_Mr,Title_Mrs,Title_Rev,Title_Sir
140,29.745184,0,2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
439,31.0,0,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [11]:
# --- SAVE DATA --- #
# Hand the four preprocessed frames to the project's save helper.
utils.save_prep_data_4(x_train, x_test, y_train, y_test)

# Log when this run finished.
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f'Timestamp: {timestamp}')

Data saved in data/processed/prep_05
Timestamp: 2024-10-23 06:54:23
