In [26]:
# Basic preprocessing, for example purposes only.

In [27]:
import pandas as pd
from dotenv import load_dotenv
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing	import OneHotEncoder
from datetime import datetime
import pickle

# Pull variables from the .env file into the process environment, then make
# ROOT_DIR the working directory (presumably so that relative data paths and
# the `src` import below resolve from the project root -- confirm).
load_dotenv()
project_root = os.environ.get("ROOT_DIR")
os.chdir(project_root)

from src import utils


In [28]:
# --- PARAMS --- #
# Location of the raw training CSV, taken from the environment
# (None when DF_TRAIN_PATH is not set).
df_train_path = os.environ.get("DF_TRAIN_PATH")

print(df_train_path)

data/raw/train.csv


In [29]:
# --- LOAD DATA --- #
# Read the raw training set and preview its first two rows.
df_train = pd.read_csv(filepath_or_buffer=df_train_path)
df_train.head(n=2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [30]:
# --- TRAIN TEST SPLIT --- #
features = ['Pclass', 'Sex']
target = ['Survived']

# 80/20 split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_train[features],
    df_train[target],
    test_size=0.2,
    random_state=0,
)

# Display the shape of each of the four resulting frames.
tuple(split.shape for split in (x_train, x_test, y_train, y_test))

((712, 2), (179, 2), (712, 1), (179, 1))

In [31]:
# Inspect dtypes and non-null counts of the training features.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 140 to 684
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Pclass  712 non-null    int64 
 1   Sex     712 non-null    object
dtypes: int64(1), object(1)
memory usage: 16.7+ KB


In [32]:
# Inspect dtypes and non-null counts of the test features.
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 179 entries, 495 to 372
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Pclass  179 non-null    int64 
 1   Sex     179 non-null    object
dtypes: int64(1), object(1)
memory usage: 4.2+ KB


In [33]:
# Preview the first rows of the test features before encoding.
x_test.head()

Unnamed: 0,Pclass,Sex
495,3,male
648,3,male
278,3,male
31,1,female
255,3,female


In [34]:
# --- SEX feature preprocessing ---
# Fit the one-hot encoder on the training split only, so nothing from the
# test split influences the learned categories.
encoder = OneHotEncoder(
    categories='auto',
    handle_unknown='ignore',
    sparse_output=False,
)
encoder_sex_model = encoder.fit(x_train[['Sex']])

# Persist the fitted encoder under ARTIFACTS_PATH, in a sub-folder named by
# utils.get_nb_name().
artifact_path = os.path.join(os.getenv("ARTIFACTS_PATH"), utils.get_nb_name())
os.makedirs(artifact_path, exist_ok=True)

encoder_pkl_path = os.path.join(artifact_path, 'encoder_sex_model.pkl')
with open(encoder_pkl_path, 'wb') as pkl_file:
    pickle.dump(encoder_sex_model, pkl_file)

print(f"Encoder model saved at {encoder_pkl_path}")

Encoder model saved at artifacts/prep_01/encoder_sex_model.pkl


In [35]:
# Reload the encoder from disk so the transform below uses the persisted
# artifact. NOTE: pickle.load executes arbitrary code from the file; only
# load artifacts produced by this project.
encoder_pkl_file = os.path.join(artifact_path, 'encoder_sex_model.pkl')
with open(encoder_pkl_file, 'rb') as pkl_in:
    encoder_sex_model = pickle.load(pkl_in)

In [36]:
def encode_sex(frame, encoder):
    """Return a new frame with 'Sex' replaced by its one-hot encoded columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature frame that contains a 'Sex' column.
    encoder : fitted encoder
        Object exposing ``transform`` and ``get_feature_names_out``
        (here, the OneHotEncoder fitted on the training split).

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``frame`` followed by the encoded columns.
        ``frame`` itself is not modified.

    Raises
    ------
    KeyError
        If ``frame`` has no 'Sex' column (e.g. the cell is re-run on frames
        that were already encoded).
    """
    encoded = pd.DataFrame(
        encoder.transform(frame[['Sex']]),
        columns=encoder.get_feature_names_out(),
        index=frame.index,  # keep the split's row labels so concat aligns rows
    )
    return pd.concat([frame.drop(columns='Sex'), encoded], axis=1)


# Apply the same transformation to both splits, rebinding the names instead of
# mutating the frames returned by train_test_split in place.
x_train = encode_sex(x_train, encoder_sex_model)
x_test = encode_sex(x_test, encoder_sex_model)

In [37]:
# --- SAVE DATA --- #
# Hand the four prepared frames to the project helper for persistence, then
# log when this run finished.
utils.save_prep_data_4(x_train, x_test, y_train, y_test)

run_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f'Timestamp: {run_timestamp}')

Data saved in data/processed/prep_01
Timestamp: 2024-10-10 20:20:40
