## Top Models
- Train many quick and dirty models
- Measure and compare performance using K-Fold cross-validation
- Shortlist the top four most promising models

In [1]:
import sys
sys.path.append("..")

In [2]:
import numpy as np
from itertools import product
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from src.load import load_diamonds
from src.conf import CATEGORIES, CATEGORIES_DEPTH_TABLE
from src.splitters import split_train_test, split_X_y
from src.transformers import AttributeAdder, CatEncoder

In [3]:
# Load and split dataset
diamonds = load_diamonds()
diamonds, _ = split_train_test(diamonds)
X_train, y_train = split_X_y(diamonds)

In [5]:
# Define different pipelines for the categories
num_pipe = {
    "adder": Pipeline([
        ("attr_adder", AttributeAdder()),
        ("std_scaler", StandardScaler())
    ]),

    "no_adder": StandardScaler()
}

depth_pipe = {
    "ord_enc": Pipeline([
        ("cat_enc", CatEncoder("depth")),
        ("ordinal_enc", OrdinalEncoder(categories=CATEGORIES_DEPTH_TABLE)),
        ("std_scaler", StandardScaler())
    ]),

    "1_hot_enc": Pipeline([
        ("cat_enc", CatEncoder("depth")),
        ("1_hot_enc", OneHotEncoder(drop="first"))
    ])
}

table_pipe = {
    "ord_enc": Pipeline([
        ("cat_enc", CatEncoder("table")),
        ("ordinal_enc", OrdinalEncoder(categories=CATEGORIES_DEPTH_TABLE)),
        ("std_scaler", StandardScaler())
    ]),

    "1_hot_enc": Pipeline([
        ("cat_enc", CatEncoder("table")),
        ("1_hot_enc", OneHotEncoder(drop="first"))
    ])
}

other_cat_pipe = {
    "ord_enc": Pipeline([
        ("ordinal_enc", OrdinalEncoder(categories=CATEGORIES)),
        ("std_scaler", StandardScaler())
    ]),

    "1_hot_enc": OneHotEncoder(drop="first")
}

In [6]:
# Define different preprocessors
preprocessors = {
    "no_depth_no_table": ColumnTransformer([
        ("num", StandardScaler(), ["carat", "x", "y", "z"]),
        ("cat", other_cat_pipe["1_hot_enc"], ["cut", "color", "clarity"]),
    ])
}


for add, enc in product(num_pipe.keys(), depth_pipe.keys()):
    k = "_".join([add, enc])
    preprocessors[k] = ColumnTransformer([
        ("num", num_pipe[add], ["carat", "x", "y", "z"]),
        ("depth", depth_pipe[enc], "depth"),
        ("table", table_pipe[enc], "table"),
        ("cat", other_cat_pipe[enc], ["cut", "color", "clarity"]),
    ])

In [8]:
# Short test with LinearRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

linear_models = {}
for k in preprocessors.keys():
    name = f"Linear Regression with preprocessor {k}"
    linear_models[name] = Pipeline([
        ("preprocessor", preprocessors[k]),
        ("regressor", LinearRegression())
    ])

print("RMSE".center(80, "-"))
for model_name, model in linear_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    rmse = mean_squared_error(y_train, y_pred, squared=False)
    print(f"{model_name}: {rmse}")

--------------------------------------RMSE--------------------------------------
Linear Regression with preprocessor no_depth_no_table: 1124.589384067514
Linear Regression with preprocessor adder_ord_enc: 1189.9444030775858
Linear Regression with preprocessor adder_1_hot_enc: 1102.988928015618
Linear Regression with preprocessor no_adder_ord_enc: 1208.7301046963994
Linear Regression with preprocessor no_adder_1_hot_enc: 1122.1424922001447


In [None]:
# Best one is adder_1_hot_enc (for Linear Regression)

In [9]:
# Try other models, this time with k-fold cross validation
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

models = {}
for prep_name, preprocessor in preprocessors.items():
    models[prep_name] = {
        "linear": Pipeline([
            ("preprocessor", preprocessor),
            ("regressor", LinearRegression())
        ]),
    
        "ridge": Pipeline([
            ("preprocessor", preprocessor),
            ("regressor", Ridge())
        ]),
    
        "elasticnet": Pipeline([
            ("preprocessor", preprocessor),
            ("regressor", ElasticNet())
        ]),
    
        "kneighbors": Pipeline([
            ("preprocessor", preprocessor),
            ("regressor", KNeighborsRegressor()) 
        ]),
    
        "tree": Pipeline([
            ("preprocessor", preprocessor),
            ("regressor", DecisionTreeRegressor()) 
        ]),
    
        "forest": Pipeline([
            ("preprocessor", preprocessor),
            ("regressor", RandomForestRegressor())
        ])
    }

In [10]:
for prep_name, model_dict in models.items():
    print(f"{prep_name}".center(80, "_"))
    for model_name, model in model_dict.items():
        print(f"{model_name}".center(80, "-"))
        model.fit(X_train, y_train)
        y_pred = model.predict(X_train)
        rmse = mean_squared_error(y_train, y_pred, squared=False)
        scores = cross_val_score(model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=2, n_jobs=-1)
        scores = -scores
        print(f"Training set, RMSE: {rmse:.2f}")
        print(f"Cross-val, mean RMSE: {scores.mean():.2f}")
        print(f"Cross-val, std RMSE: {scores.std():.2f}")

_______________________________no_depth_no_table________________________________
-------------------------------------linear-------------------------------------
Training set, RMSE: 1124.59
Cross-val, mean RMSE: 1200.75
Cross-val, std RMSE: 65.65
-------------------------------------ridge--------------------------------------
Training set, RMSE: 1124.61
Cross-val, mean RMSE: 1194.93
Cross-val, std RMSE: 59.83
-----------------------------------elasticnet-----------------------------------
Training set, RMSE: 1738.71
Cross-val, mean RMSE: 1741.37
Cross-val, std RMSE: 2.46
-----------------------------------kneighbors-----------------------------------
Training set, RMSE: 562.19
Cross-val, mean RMSE: 799.77
Cross-val, std RMSE: 7.76
--------------------------------------tree--------------------------------------
Training set, RMSE: 17.09
Cross-val, mean RMSE: 975.74
Cross-val, std RMSE: 4.31
-------------------------------------forest-------------------------------------
Training set, RM

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

### Conclusions
The most promising models are 