In [188]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

from pathlib import Path

In [2]:
# Cleaned student-math dataset, stored as parquet.
# NOTE(review): this is a machine-specific absolute path; the cell only runs
# where that file exists.
path = r'C:\Users\Anjelito\Documents\personal project\regression - student grade\data\intermediate\student-mat-cleaned.parquet'
df = pd.read_parquet(path=path)

In [3]:
# Separate the target column from the predictor columns.
label = 'score'
y = df[label].copy()
X = df.drop(columns=label).copy()

# Hold out 33% of the rows for testing; the fixed random_state makes the
# shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=8,
    shuffle=True,
)

In [4]:
def _columns_of_dtype(kind):
    """Return the names of the X_train columns whose dtype matches `kind`."""
    return X_train.select_dtypes(include=kind).columns.to_list()


# Numeric columns, minus 'absences', which is kept in its own group below.
num_features = _columns_of_dtype('number')
num_features.remove('absences')

# Numeric column(s) routed through the square-root pipeline.
num_sqrt_features = ['absences']

cat_features = _columns_of_dtype('category')
bool_features = _columns_of_dtype('bool')

# make sure that all cols have been included: the displayed value is the number
# of X_train columns not accounted for by the four groups (expected: 0)
feature_groups = (num_features, num_sqrt_features, cat_features, bool_features)
len(X_train.columns) - sum(len(group) for group in feature_groups)

0

In [5]:
# define square-root transformer
class SquareRootScaler(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies an element-wise square root."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``np.sqrt(X)``; ``y`` is accepted but not used."""
        return np.sqrt(X)
        
# define bool transformer
class BoolConverter(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns boolean features into 0/1 numbers.

    Multiplying a boolean array by 1 yields integers (True -> 1, False -> 0),
    which lets a numeric step such as a scaler consume the result.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a NumPy array multiplied by 1.

        Parameters
        ----------
        X : DataFrame or array-like
            Boolean feature values.
        y : ignored

        Returns
        -------
        numpy.ndarray
            Array with the same shape as ``X``.
        """
        # np.asarray (rather than X.values) so that plain ndarrays and lists
        # are accepted as well as pandas objects.
        return np.asarray(X) * 1

In [6]:
# numerical pipeline: standardise only
num_pipe = Pipeline(steps=[('standard_scaler', StandardScaler())])

# numerical square-root pipeline: take the square root, then standardise
num_sqrt_pipe = Pipeline(steps=[
    ('sqrt_scaler', SquareRootScaler()),
    ('standard_scaler', StandardScaler()),
])

# boolean pipeline: convert True/False to 1/0, then standardise
bool_pipe = Pipeline(steps=[
    ('bool_converter', BoolConverter()),
    ('standard_scaler', StandardScaler()),
])

# categorical pipeline: one-hot encode, dropping the first level of each feature
cat_pipe = Pipeline(steps=[('one_hot', OneHotEncoder(drop='first'))])

In [21]:
# Route each feature group through its own pipeline.
# sparse_threshold=0 forces a dense result: the one-hot step emits sparse
# output by default, and ColumnTransformer would otherwise return a sparse
# matrix whenever the stacked density is low, while the DataFrame construction
# below expects a dense array.
preprocessing= ColumnTransformer([
    ('num', num_pipe, num_features),
    ('num_sqrt', num_sqrt_pipe, num_sqrt_features),
    ('cat', cat_pipe, cat_features),
    ('bool', bool_pipe, bool_features),
], sparse_threshold= 0)

# Fit on the training split only; the test split is transformed with the
# already-fitted preprocessing.
X_train_array= preprocessing.fit_transform(X_train)
X_test_array= preprocessing.transform(X_test)


# convert to df to check the result

# Output columns are stacked in the order the transformers are listed above,
# so the names are assembled in that same order. Only the one-hot step changes
# the column names; look its fitted pipeline up by name rather than position.
num_col_names= num_features
num_sqrt_col_names= num_sqrt_features
cat_col_names= list(preprocessing.named_transformers_['cat'].get_feature_names_out())
bool_col_names= bool_features

col_names= num_col_names + num_sqrt_col_names + cat_col_names + bool_col_names

X_train_preprocessed= pd.DataFrame(X_train_array, columns= col_names)
X_test_preprocessed= pd.DataFrame(X_test_array, columns= col_names)
X_train_preprocessed.head()

Unnamed: 0,age,failures,famrel,freetime,goout,dalc,walc,health,absences,school_MS,...,studytime_3,studytime_4,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic
0,-0.573042,-0.445697,0.052827,-0.247625,-1.004776,-0.527089,-0.984821,1.06483,-1.191346,0.0,...,0.0,1.0,-0.377964,0.806226,1.046536,-0.977525,0.471405,0.218218,0.434959,-0.713142
1,-0.573042,-0.445697,0.052827,1.763851,-1.004776,-0.527089,-0.984821,1.06483,1.646643,0.0,...,0.0,0.0,-0.377964,-1.240347,-0.955533,-0.977525,0.471405,0.218218,0.434959,1.402245
2,1.803131,0.906763,0.052827,-0.247625,-1.004776,-0.527089,0.5814,1.06483,-1.191346,1.0,...,0.0,0.0,-0.377964,-1.240347,-0.955533,-0.977525,0.471405,0.218218,-2.299068,-0.713142
3,0.219016,-0.445697,-2.271563,-2.259101,-1.871641,-0.527089,-0.984821,-0.365781,-0.293895,0.0,...,0.0,0.0,-0.377964,-1.240347,-0.955533,1.022992,0.471405,0.218218,-2.299068,-0.713142
4,0.219016,-0.445697,0.052827,-1.253363,-1.871641,-0.527089,-0.984821,0.349524,-1.191346,0.0,...,0.0,0.0,2.645751,0.806226,-0.955533,-0.977525,0.471405,0.218218,-2.299068,1.402245


In [196]:
# Output directory for the preprocessed splits.
# NOTE(review): machine-specific absolute path.
path= Path(r'C:\Users\Anjelito\Documents\personal project\regression - student grade\data\dataset')

# Create the directory (and any missing parents) so the writes below do not
# fail when it does not exist yet; a no-op if it is already there.
path.mkdir(parents= True, exist_ok= True)

# index=False on every file: the feature frames carry a fresh default index
# while the targets keep the original one, so rows are matched by position.
X_train_preprocessed.to_csv(path/'X_train.csv', index= False)
X_test_preprocessed.to_csv(path/'X_test.csv', index= False)

y_train.to_csv(path/'y_train.csv', index= False)
y_test.to_csv(path/'y_test.csv', index= False)