In [1]:
import data_sourcing
import data_splitting
import data_preprocessing
import feature_engineering
import monitoring
import modeling

In [2]:
df = data_sourcing.get()
df = data_preprocessing.clean(df)
df = data_preprocessing.normalize(df)
train, valid, test = data_splitting.split(df)

# Categorical
cat_cols = [
    "sex",
]

train_hot, valid_hot, test_hot = feature_engineering.one_hot_encoding(
    train=train,
    valid=valid,
    test=test,
    cols=cat_cols,
)

train = train.join(train_hot)
valid = valid.join(valid_hot)
test = test.join(test_hot)

# Numerical
num_cols = [
    "bill_length_mm",
    "bill_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
]
   
train_imp, valid_imp, test_imp = feature_engineering.numerical_missing_imputation(
        train=train,
        valid=valid,
        test=test,
        cols=num_cols,
        imputation_method="median",
)

train = train.join(train_imp, rsuffix="_imputed")
valid = valid.join(valid_imp, rsuffix="_imputed")
test = test.join(test_imp, rsuffix="_imputed")

In [3]:
train

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species,sex_male,sex_female,sex_na,bill_length_mm_imputed,bill_depth_mm_imputed,flipper_length_mm_imputed,body_mass_g_imputed
141,40.6,17.2,187.0,3475.0,male,Adelie,1,0,0,40.6,17.2,187.0,3475.0
6,38.9,17.8,181.0,3625.0,female,Adelie,0,1,0,38.9,17.8,181.0,3625.0
60,35.7,16.9,185.0,3150.0,female,Adelie,0,1,0,35.7,16.9,185.0,3150.0
249,46.9,14.6,222.0,4875.0,female,Gentoo,0,1,0,46.9,14.6,222.0,4875.0
54,34.5,18.1,187.0,2900.0,female,Adelie,0,1,0,34.5,18.1,187.0,2900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,36.0,17.8,195.0,3450.0,female,Adelie,0,1,0,36.0,17.8,195.0,3450.0
143,40.7,17.0,190.0,3725.0,male,Adelie,1,0,0,40.7,17.0,190.0,3725.0
180,48.2,14.3,210.0,4600.0,female,Gentoo,0,1,0,48.2,14.3,210.0,4600.0
131,43.1,19.2,197.0,3500.0,male,Adelie,1,0,0,43.1,19.2,197.0,3500.0


In [6]:
train_out, test_out, valid_out = modeling.fit_transform(
    train,
    valid,
    test,
    y_col=['species'],
    x_col=["sex_male",
           "sex_female",
           "sex_na",
           "bill_length_mm_imputed",
           "bill_depth_mm_imputed",
           "flipper_length_mm_imputed",
           "body_mass_g_imputed"
    ]
)

  return f(*args, **kwargs)


In [9]:
list(train_out)

[(array([0.88307123, 0.10987803, 0.00705074]), 0),
 (array([0.87956056, 0.11662439, 0.00381506]), 0),
 (array([0.89956792, 0.09717511, 0.00325698]), 0),
 (array([0.00300028, 0.00560036, 0.99139936]), 2),
 (array([0.88625317, 0.11075319, 0.00299364]), 0),
 (array([0.32098904, 0.64867173, 0.03033923]), 1),
 (array([0.87241916, 0.12394886, 0.00363198]), 0),
 (array([0.00403583, 0.02740437, 0.96855979]), 2),
 (array([0.00300028, 0.00568166, 0.99131806]), 2),
 (array([0.86513333, 0.12975994, 0.00510673]), 0),
 (array([0.7259461 , 0.21749647, 0.05655743]), 0),
 (array([0.00992871, 0.01200151, 0.97806978]), 2),
 (array([0.85095957, 0.07326035, 0.07578008]), 0),
 (array([0.35227085, 0.61190793, 0.03582122]), 1),
 (array([0.00818858, 0.00857681, 0.98323461]), 2),
 (array([0.83378404, 0.15956048, 0.00665548]), 0),
 (array([0.89397191, 0.10277111, 0.00325698]), 0),
 (array([0.00447845, 0.00714956, 0.98837199]), 2),
 (array([0.88384964, 0.11238784, 0.00376252]), 0),
 (array([0.87839778, 0.11708753