In [1]:
import sys
import warnings
import os
# Make the parent directory importable so the `src.*` imports in the next
# cell resolve. The membership guard keeps a re-run of this cell from
# appending the same entry again.
if "../" not in sys.path:
    sys.path.append("../")

# NOTE: this silences every warning for the rest of the session, including
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# Connection settings for the MLflow tracking server and its S3-compatible
# artifact store. The strings are placeholders only: `setdefault` leaves any
# value that is already exported in the environment untouched, so real
# credentials are never clobbered by placeholder text and never need to be
# pasted into the notebook.
for env_name, placeholder in (
    ('MLFLOW_TRACKING_URI', 'your mlflow traking uri'),
    ('AWS_ACCESS_KEY_ID', 'your aws access key id'),
    ('AWS_SECRET_ACCESS_KEY', 'your secret access key'),
    ('MLFLOW_S3_ENDPOINT_URL', 'your mlflow s3 endpoint url'),
):
    os.environ.setdefault(env_name, placeholder)

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
import optuna
from src.data_loading.object_storage import get_data
from src.data_processing.transform_data import transform
from src.models.training import catboost_training, catboost_optuna_training, ridge_training, ridge_optuna_training

In [3]:
# Load the training table through the project's object-storage helper.
# presumably the arguments are (bucket name, object key) — verify against
# src.data_loading.object_storage.get_data.
df = get_data('dcs-bucket','train.csv')

In [4]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Run the project's transform step on the loaded table.
# NOTE(review): `df` is rebound to the transformed result, so re-running this
# cell on its own passes already-transformed data back into `transform`;
# re-run the loading cell first to start from the raw table.
df = transform(df)

In [6]:
# Preview the first rows after the transform step.
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [7]:
# Partition the column labels by dtype: object columns are handled as
# categorical features, int/float columns as numerical features.
categorical_columns = df.select_dtypes(include=object).columns

# 'SalePrice' is the target, so it is removed from the numerical feature
# labels in the same expression.
numerical_columns = (
    df.select_dtypes(include=[int, float])
    .columns
    .drop('SalePrice')
)

In [8]:
# Target vector, then the feature frame with the target column removed.
y = df['SalePrice']
X = df.drop(columns='SalePrice')

CatBoost without tuning

In [9]:
# Numerical gaps are filled with the per-column median.
numerical_transformer = SimpleImputer(strategy='median')

# Categorical gaps are filled with the most frequent value of each column.
# `np.nan` is the canonical spelling: the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
categorical_transformer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Imputation only (no scaling or encoding). The transformed output lists the
# numerical columns first and the categorical columns second, following the
# order of the transformers below.
preprocessing = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns)
])

In [10]:
# Fit the imputers on the feature frame and apply them in a single pass.
# NOTE(review): this fit happens before the train/test split performed in the
# following cell, so the imputation statistics also see rows that end up in
# the test set.
df_transformed_train = preprocessing.fit_transform(X)

# Re-attach column labels in the order the ColumnTransformer emits them:
# numerical columns first, categorical columns second.
X_processed = pd.DataFrame(
    data=df_transformed_train,
    columns=[*numerical_columns, *categorical_columns],
)

In [11]:
# Hold out a quarter of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    train_size=0.75,
    random_state=42,
)

# CatBoost is told which columns are categorical; per-iteration logging is
# switched off.
model = CatBoostRegressor(
    cat_features=categorical_columns.tolist(),
    silent=True,
)

In [12]:
# Label passed to the project's training helper for this run.
run_name = 'second_attempt'

# Train and evaluate the untuned CatBoost model on the split above.
# Judging by the logged output, the helper also registers the model in MLflow.
catboost_training(
    model,
    X_train, X_test,
    y_train, y_test,
    run_name,
)

Registered model 'basic catboost' already exists. Creating a new version of this model...
2024/06/14 18:42:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: basic catboost, version 10
Created version '10' of model 'basic catboost'.


CatBoost with Optuna tuning

In [13]:
# Restrict Optuna's logger to errors so per-trial messages are not printed.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Hyper-parameter search for CatBoost on the same split as the untuned run.
best_catboost = catboost_optuna_training(
    X_train, X_test,
    y_train, y_test,
    categorical_columns,
    2,  # presumably the number of Optuna trials — TODO confirm against the helper
)

Registered model 'catboost optuna tuning' already exists. Creating a new version of this model...
2024/06/14 18:43:06 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: catboost optuna tuning, version 12
Created version '12' of model 'catboost optuna tuning'.


Ridge without tuning

In [14]:
# Split the raw feature frame `X` (not the imputed `X_processed` used for
# CatBoost); imputation, scaling and encoding for Ridge are done inside the
# pipeline defined in the next cell.
# NOTE(review): this rebinds X_train/X_test/y_train/y_test from the CatBoost
# section, so re-running the CatBoost cells after this one uses the raw split.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42)

In [15]:
# Numerical branch: fill gaps with the column median, then standardise.
num_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale',StandardScaler())
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time instead
# of raising. `np.nan` replaces the `np.NaN` alias, which was removed in
# NumPy 2.0 and raises AttributeError there.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its branch. This rebinds `preprocessing`,
# replacing the imputation-only transformer used in the CatBoost section.
preprocessing = ColumnTransformer(transformers=[
    ('num', num_transformer, numerical_columns),
    ('cat', cat_transformer, categorical_columns)
])

# Ridge regression with default hyper-parameters; rebinds `model`.
model = Ridge()

# Full pipeline: preprocessing is fitted together with the estimator, so it
# only ever learns from the data passed to `fit`.
lr_pipeline = Pipeline(steps=[
    ('preprocess', preprocessing),
    ('model', model)
])


In [16]:
# Label passed to the project's training helper for this run.
run_name = 'second_attempt'

# Fit and evaluate the untuned Ridge pipeline on the raw-feature split.
# Judging by the logged output, the helper also registers the model in MLflow.
ridge_training(
    lr_pipeline,
    X_train, X_test,
    y_train, y_test,
    run_name,
)

Registered model 'basic ridge' already exists. Creating a new version of this model...
2024/06/14 18:43:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: basic ridge, version 5
Created version '5' of model 'basic ridge'.


Ridge with Optuna tuning

In [17]:
# Restrict Optuna's logger to errors so per-trial messages are not printed.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Hyper-parameter search for Ridge; the helper receives the preprocessing
# transformer rather than the assembled pipeline.
best_ridge = ridge_optuna_training(
    preprocessing,
    X_train, X_test,
    y_train, y_test,
    2,  # presumably the number of Optuna trials — TODO confirm against the helper
)

Registered model 'ridge optuna tuning' already exists. Creating a new version of this model...
2024/06/14 18:43:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ridge optuna tuning, version 6
Created version '6' of model 'ridge optuna tuning'.
