In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder

# Custom import
from src.data_manager import load_data
from src.metrics import evaluate_regression_model

# pandas display settings: raise the limits on rows, columns and cell width
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)
# Notebook extensions (not pandas settings)
# lab_black: presumably auto-formats cells with `black` — confirm against the extension's docs
%load_ext lab_black

# autoreload mode 2: re-import modules before each cell is executed
%load_ext autoreload
%autoreload 2

In [2]:
# Read the semicolon-delimited student CSV
data = load_data(
    filename="../../data/student-por.csv",
    sep=";",
)

# Preview the first rows
data.head()

Shape of data: (649, 33)

Duration: 0.01 seconds


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,2,1,2,5,0,11,13,13


### Column Transformer

```python
from sklearn.compose import ColumnTransformer
```

In [3]:
# Naive approach: hand the whole frame to the encoder, so *every* column
# (numeric ones included) is treated as categorical.
ohe = OneHotEncoder(handle_unknown="ignore", dtype=int)
transf_array = ohe.fit_transform(data)

# Convert the encoded result to a dense array for inspection
transf_array.toarray()

array([[1, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [4]:
# Compare the column names the encoder received with the ones it produced
print(
    f"Input features: \n{ohe.feature_names_in_}\n",
    f"Output features: \n{ohe.get_feature_names_out()}\n",
    sep="\n",
)

Input features: 
['school' 'sex' 'age' 'address' 'famsize' 'Pstatus' 'Medu' 'Fedu' 'Mjob'
 'Fjob' 'reason' 'guardian' 'traveltime' 'studytime' 'failures'
 'schoolsup' 'famsup' 'paid' 'activities' 'nursery' 'higher' 'internet'
 'romantic' 'famrel' 'freetime' 'goout' 'Dalc' 'Walc' 'health' 'absences'
 'G1' 'G2' 'G3']

Output features: 
['school_GP' 'school_MS' 'sex_F' 'sex_M' 'age_15' 'age_16' 'age_17'
 'age_18' 'age_19' 'age_20' 'age_21' 'age_22' 'address_R' 'address_U'
 'famsize_GT3' 'famsize_LE3' 'Pstatus_A' 'Pstatus_T' 'Medu_0' 'Medu_1'
 'Medu_2' 'Medu_3' 'Medu_4' 'Fedu_0' 'Fedu_1' 'Fedu_2' 'Fedu_3' 'Fedu_4'
 'Mjob_at_home' 'Mjob_health' 'Mjob_other' 'Mjob_services' 'Mjob_teacher'
 'Fjob_at_home' 'Fjob_health' 'Fjob_other' 'Fjob_services' 'Fjob_teacher'
 'reason_course' 'reason_home' 'reason_other' 'reason_reputation'
 'guardian_father' 'guardian_mother' 'guardian_other' 'traveltime_1'
 'traveltime_2' 'traveltime_3' 'traveltime_4' 'studytime_1' 'studytime_2'
 'studytime_3' 'studytime

In [5]:
# Every feature was one-hot encoded, including the numeric columns,
# which is not the behaviour we want.
# Reuse the array produced when `ohe` was fitted above rather than
# fitting the encoder on the same data a second time.
df = pd.DataFrame(
    transf_array.toarray(),
    columns=ohe.get_feature_names_out(),
)

df.head()

Unnamed: 0,school_GP,school_MS,sex_F,sex_M,age_15,age_16,age_17,age_18,age_19,age_20,age_21,age_22,address_R,address_U,famsize_GT3,famsize_LE3,Pstatus_A,Pstatus_T,Medu_0,Medu_1,Medu_2,Medu_3,Medu_4,Fedu_0,Fedu_1,Fedu_2,Fedu_3,Fedu_4,Mjob_at_home,Mjob_health,Mjob_other,Mjob_services,Mjob_teacher,Fjob_at_home,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_other,reason_reputation,guardian_father,guardian_mother,guardian_other,traveltime_1,traveltime_2,traveltime_3,traveltime_4,studytime_1,studytime_2,studytime_3,studytime_4,failures_0,failures_1,failures_2,failures_3,schoolsup_no,schoolsup_yes,famsup_no,famsup_yes,paid_no,paid_yes,activities_no,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes,famrel_1,famrel_2,famrel_3,famrel_4,famrel_5,freetime_1,freetime_2,freetime_3,freetime_4,freetime_5,goout_1,goout_2,goout_3,goout_4,goout_5,Dalc_1,Dalc_2,Dalc_3,Dalc_4,Dalc_5,Walc_1,Walc_2,Walc_3,Walc_4,Walc_5,health_1,health_2,health_3,health_4,health_5,absences_0,absences_1,absences_2,absences_3,absences_4,absences_5,absences_6,absences_7,absences_8,absences_9,absences_10,absences_11,absences_12,absences_13,absences_14,absences_15,absences_16,absences_18,absences_21,absences_22,absences_24,absences_26,absences_30,absences_32,G1_0,G1_4,G1_5,G1_6,G1_7,G1_8,G1_9,G1_10,G1_11,G1_12,G1_13,G1_14,G1_15,G1_16,G1_17,G1_18,G1_19,G2_0,G2_5,G2_6,G2_7,G2_8,G2_9,G2_10,G2_11,G2_12,G2_13,G2_14,G2_15,G2_16,G2_17,G2_18,G2_19,G3_0,G3_1,G3_5,G3_6,G3_7,G3_8,G3_9,G3_10,G3_11,G3_12,G3_13,G3_14,G3_15,G3_16,G3_17,G3_18,G3_19
0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,1,0,1,0,1,0,0,1,0,1,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0,1,0,1,0,0,1,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,1,0,1,0,0,1,0,1,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,1,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,1,0,0,1,0,1,0,1,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,1,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [6]:
# ColumnTransformer applies the encoder only to the selected (object-dtype)
# columns; all remaining columns are passed through unchanged.
cols_to_enc = data.select_dtypes(include=["O"]).columns

ohe_transformer = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(dtype=int, handle_unknown="ignore"), cols_to_enc)
    ],
    remainder="passthrough",
)


cat_data_transf = pd.DataFrame(
    data=ohe_transformer.fit_transform(data),
    columns=ohe_transformer.get_feature_names_out(),
)
# Clean the column names: strip the "<transformer name>__" prefix that
# ColumnTransformer puts in front of every output feature. The pattern is
# anchored with ^ so only the leading prefix is removed, never a matching
# substring that happens to appear inside a feature name.
pattern = r"^(ohe__|remainder__)"
cat_data_transf.columns = cat_data_transf.columns.str.replace(pattern, "", regex=True)
cat_data_transf.head()

Unnamed: 0,school_GP,school_MS,sex_F,sex_M,address_R,address_U,famsize_GT3,famsize_LE3,Pstatus_A,Pstatus_T,Mjob_at_home,Mjob_health,Mjob_other,Mjob_services,Mjob_teacher,Fjob_at_home,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_other,reason_reputation,guardian_father,guardian_mother,guardian_other,schoolsup_no,schoolsup_yes,famsup_no,famsup_yes,paid_no,paid_yes,activities_no,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,1,0,1,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,1,0,1,1,0,1,0,18,4,4,2,2,0,4,3,4,1,1,3,4,0,11,11
1,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,1,0,1,0,1,0,0,1,0,1,1,0,17,1,1,1,2,0,5,3,3,1,1,3,2,9,11,11
2,1,0,1,0,0,1,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,1,0,1,0,0,1,0,1,0,1,1,0,15,1,1,1,2,0,4,3,2,2,3,3,6,12,13,12
3,1,0,1,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0,1,0,1,0,1,15,4,2,1,3,0,3,2,2,1,1,5,0,14,14,14
4,1,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,1,0,1,0,0,1,0,1,1,0,1,0,16,3,3,1,2,0,4,3,2,1,2,5,0,11,13,13


<br>

### FunctionTransformer

> This is useful for stateless transformations such as taking the log of frequencies, doing custom scaling, etc.

* An example of how to get the column names from a `FunctionTransformer` can be found [here](https://github.com/scikit-learn/scikit-learn/pull/21569).

In [7]:
# Quick look at the first two rows of the raw data
data.head(2)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11


In [8]:
def apply_log(data: pd.DataFrame) -> pd.DataFrame:
    """Apply an element-wise ``log(1 + x)`` transformation to ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric features to transform. Values should be greater than -1;
        otherwise NumPy yields ``nan``/``-inf`` for those entries.

    Returns
    -------
    pd.DataFrame
        A new frame with the same index and columns as ``data`` holding the
        transformed values. The input frame is not modified.
    """
    # np.log1p returns a newly allocated object, so the caller's frame stays
    # untouched without making an explicit defensive copy first.
    return np.log1p(data)

In [9]:
# `train_test_split` is imported in the notebook's top import cell.
TEST_SIZE = 0.2
RANDOM_STATE = 123

# G3 is the target; every other column is a candidate feature
X = data.drop(columns=["G3"])
y = data["G3"]

# Fixed random_state keeps the split reproducible across runs
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
X_train.shape, X_validation.shape

((519, 32), (130, 32))

In [10]:
def feature_names_out(transformer: FunctionTransformer, input_features: list) -> list:
    """Build the output feature names for the log ``FunctionTransformer``.

    Passed as the ``feature_names_out`` callable of a ``FunctionTransformer``
    so that ``.get_feature_names_out()`` reports ``log_``-prefixed names.

    Parameters
    ----------
    transformer : FunctionTransformer
        The transformer whose output names are requested.
    input_features : list
        Names of the input features (any iterable of strings works).

    Returns
    -------
    list
        ``"log_<name>"`` for every name in ``input_features`` that the
        transformer was fitted on, in the order given.
    """
    # A transformer fitted on data without column names (e.g. a NumPy array)
    # has no `feature_names_in_`; in that case treat every input feature as
    # transformed instead of raising AttributeError.
    fitted_names = getattr(transformer, "feature_names_in_", input_features)
    # A set gives constant-time membership checks instead of scanning the
    # array of fitted names once per input feature.
    transformed = set(fitted_names)
    return [f"log_{name}" for name in input_features if name in transformed]


# Wrap `apply_log` in a transformer; `feature_names_out` supplies the names
# reported by `.get_feature_names_out()`.
log_transformer = FunctionTransformer(
    apply_log,
    feature_names_out=feature_names_out,
)

# Try it out on two of the grade columns
log_transformer.fit_transform(data.loc[:, ["G1", "G2"]]).head()

Unnamed: 0,G1,G2
0,0.0,2.484907
1,2.302585,2.484907
2,2.564949,2.639057
3,2.70805,2.70805
4,2.484907,2.639057


In [11]:
# Names of the features the transformer saw during fit
log_transformer.feature_names_in_

array(['G1', 'G2'], dtype=object)

In [12]:
# Output names, as built by the custom `feature_names_out` callable
log_transformer.get_feature_names_out()

array(['log_G1', 'log_G2'], dtype=object)

#### Putting It All Together

In [13]:
# Route each group of columns to its own transformer:
#   * object-dtype columns -> one-hot encoding
#   * grade columns        -> log transform
#   * everything else      -> passed through unchanged
grades = ["G1", "G2", "G3"]
cols_to_enc = data.select_dtypes(include=["O"]).columns

col_transformer = ColumnTransformer(
    [
        ("ohe", OneHotEncoder(handle_unknown="ignore", dtype=int), cols_to_enc),
        ("log_transform", log_transformer, grades),
    ],
    remainder="passthrough",
)

# Single-step pipeline that wraps the column transformer
pipe = Pipeline([("col_transformer", col_transformer)])

pipe

In [14]:
# Fit the pipeline on the full frame and label the result with the
# pipeline's own output feature names
pd.DataFrame(pipe.fit_transform(data), columns=pipe.get_feature_names_out()).head()

Unnamed: 0,ohe__school_GP,ohe__school_MS,ohe__sex_F,ohe__sex_M,ohe__address_R,ohe__address_U,ohe__famsize_GT3,ohe__famsize_LE3,ohe__Pstatus_A,ohe__Pstatus_T,ohe__Mjob_at_home,ohe__Mjob_health,ohe__Mjob_other,ohe__Mjob_services,ohe__Mjob_teacher,ohe__Fjob_at_home,ohe__Fjob_health,ohe__Fjob_other,ohe__Fjob_services,ohe__Fjob_teacher,ohe__reason_course,ohe__reason_home,ohe__reason_other,ohe__reason_reputation,ohe__guardian_father,ohe__guardian_mother,ohe__guardian_other,ohe__schoolsup_no,ohe__schoolsup_yes,ohe__famsup_no,ohe__famsup_yes,ohe__paid_no,ohe__paid_yes,ohe__activities_no,ohe__activities_yes,ohe__nursery_no,ohe__nursery_yes,ohe__higher_no,ohe__higher_yes,ohe__internet_no,ohe__internet_yes,ohe__romantic_no,ohe__romantic_yes,log_transform__log_G1,log_transform__log_G2,log_transform__log_G3,remainder__age,remainder__Medu,remainder__Fedu,remainder__traveltime,remainder__studytime,remainder__failures,remainder__famrel,remainder__freetime,remainder__goout,remainder__Dalc,remainder__Walc,remainder__health,remainder__absences
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,2.484907,2.484907,18.0,4.0,4.0,2.0,2.0,0.0,4.0,3.0,4.0,1.0,1.0,3.0,4.0
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,2.302585,2.484907,2.484907,17.0,1.0,1.0,1.0,2.0,0.0,5.0,3.0,3.0,1.0,1.0,3.0,2.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,2.564949,2.639057,2.564949,15.0,1.0,1.0,1.0,2.0,0.0,4.0,3.0,2.0,2.0,3.0,3.0,6.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,2.70805,2.70805,2.70805,15.0,4.0,2.0,1.0,3.0,0.0,3.0,2.0,2.0,1.0,1.0,5.0,0.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,2.484907,2.639057,2.639057,16.0,3.0,3.0,1.0,2.0,0.0,4.0,3.0,2.0,1.0,2.0,5.0,0.0


In [15]:
# Build regression model
# (`LinearRegression` is imported in the notebook's top import cell.)
cols_to_enc = data.select_dtypes(include=["O"]).columns
# Only G1 and G2 are log-transformed here: G3 is the target and was dropped
# from the feature matrix before the train/validation split.
grades = ["G1", "G2"]  # apply log transform

col_transformer = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(dtype=int, handle_unknown="ignore"), cols_to_enc),
        ("log_transform", log_transformer, grades),
    ],
    remainder="passthrough",
)

l_pipe = Pipeline(
    steps=[
        # Apply column transformer
        ("col_transformer", col_transformer),
        # Fit a linear model on the transformed features
        ("linear_model", LinearRegression()),
    ]
)

l_pipe

In [16]:
# Train model (preprocessing and regression are fitted together)
l_pipe.fit(X_train, y_train)

# Make predictions on the held-out validation split
y_pred = l_pipe.predict(X_validation)

# Evaluate
# (`evaluate_regression_model` is imported in the notebook's top import cell.)
print(evaluate_regression_model(y_true=y_validation, y_pred=y_pred))

Mean Squared Error (Lower is better!!): 2.219
Root Mean Squared Error (Lower is better!!): 1.49
Mean Absolute Error (Lower is better!!): 1.048
R Squared (Higher is better!!): 0.795 


In [17]:
# Retrieve a single step of the fitted pipeline by name (here the column transformer)
l_pipe.named_steps.get("col_transformer")