In [None]:
import numpy as np
import pandas as pd
from splitter import splitter

# Location of the training CSV handed to the project-local `splitter` helper.
TRAIN_CSV_PATH = ".//topic21_v9_train.csv"

# NOTE(review): presumably `splitter` returns a train/test partition of the
# features and target in this order — confirm against its implementation.
X_train, X_test, y_train, y_test = splitter(TRAIN_CSV_PATH)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import FunctionTransformer
from sklearn.experimental import enable_iterative_imputer  # needed to enable
from sklearn.impute import IterativeImputer # for the actual model
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import IsolationForest 
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import HistGradientBoostingRegressor

In [None]:
# baseline

def drop(df):
    """Return ``df`` without its ``transmission_type`` column.

    Intended as the callable wrapped by a ``FunctionTransformer`` so the
    column is removed before encoding. The input frame is left unmodified.

    Raises:
        KeyError: if ``df`` has no ``transmission_type`` column.
    """
    excluded = ['transmission_type']
    return df.drop(columns=excluded)

# Baseline: the object-dtype columns of X_train go through `drop` (which
# removes 'transmission_type') and are then one-hot encoded; every other
# column is passed through unchanged to the gradient-boosting regressor.
baseline_cat_columns = X_train.select_dtypes(include="object").columns.tolist()

cat_processor = Pipeline(steps=[
    ("drop", FunctionTransformer(func=drop)),
    # Categories unseen during fit are encoded as all-zero rows, not errors.
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[('cat', cat_processor, baseline_cat_columns)],
    remainder='passthrough',
)

baseline_model = HistGradientBoostingRegressor(
    learning_rate=0.05,
    max_iter=300,
    max_leaf_nodes=31,
    l2_regularization=1.0,
    random_state=42,
)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", baseline_model),
])

# 5-fold cross-validation scored with R^2; train scores are requested too so
# the gap between train and held-out folds can be inspected.
cv = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring='r2',
    cv=5,
    return_train_score=True,
)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score'])

In [None]:
# Hyperparameters after running a grid search over each parameter.
# Oddly enough, this actually gave worse performance.

# Tuned configuration: the object-dtype columns of X_train are one-hot
# encoded and every other column is passed through unchanged.
# NOTE(review): unlike the baseline cell, no `drop` step runs before the
# encoder here, so 'transmission_type' is not removed and the two cells
# differ in features as well as hyperparameters — confirm this is intended.
tuned_cat_columns = X_train.select_dtypes(include="object").columns.tolist()

cat_processor = Pipeline(steps=[
    # Categories unseen during fit are encoded as all-zero rows, not errors.
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[('cat', cat_processor, tuned_cat_columns)],
    remainder='passthrough',
)

# Hyperparameter values chosen by the per-parameter grid search.
tuned_model = HistGradientBoostingRegressor(
    learning_rate=0.1,
    max_iter=250,
    max_leaf_nodes=35,
    max_bins=85,
    l2_regularization=1.0,
    random_state=42,
)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", tuned_model),
])

# 5-fold cross-validation scored with R^2; train scores are requested too so
# the gap between train and held-out folds can be inspected.
cv = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring='r2',
    cv=5,
    return_train_score=True,
)

print("Cross-validation results:")
print("Train R^2 scores:", cv['train_score'])
print("Test R^2 scores:", cv['test_score'])