## Top Models
- Train many quick-and-dirty models
- Measure and compare performance using K-Fold cross-validation
- Shortlist the top four most promising models

In [1]:
import sys
# Add the parent directory to the module search path; presumably this is what
# lets the `src.*` imports used by this notebook resolve -- TODO confirm layout.
sys.path.append("..")

In [2]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from src.load import load_diamonds
from src.conf import CATEGORIES, CATEGORIES_DEPTH_TABLE
from src.splitters import split_train_test, split_X_y
from src.transformers import CatEncoder

In [3]:
# Load the dataset and keep only the first part returned by split_train_test
# (the second part is discarded here), then split it into X_train and y_train.
diamonds, _ = split_train_test(load_diamonds())
X_train, y_train = split_X_y(diamonds)

In [4]:
# Define preprocessor


def _encoded_column_pipe(column):
    """Build the two-step pipeline CatEncoder(column) -> OrdinalEncoder over CATEGORIES_DEPTH_TABLE."""
    return Pipeline(steps=[
        ("cat_enc", CatEncoder(column)),
        ("ordinal_enc", OrdinalEncoder(categories=CATEGORIES_DEPTH_TABLE)),
    ])


def _base_transformers():
    """Return fresh (name, transformer, columns) entries common to both preprocessors."""
    return [
        ("num", StandardScaler(), ["carat", "x", "y", "z"]),
        ("cat", OrdinalEncoder(categories=CATEGORIES), ["cut", "color", "clarity"]),
    ]


depth_pipe = _encoded_column_pipe("depth")
table_pipe = _encoded_column_pipe("table")

# Variant 1: scaled numeric columns, ordinal-encoded categoricals, plus the
# encoded "depth" and "table" columns.
preprocessor_1 = ColumnTransformer(
    transformers=_base_transformers() + [
        ("depth", depth_pipe, "depth"),
        ("table", table_pipe, "table"),
    ]
)

# Variant 2: same numeric and categorical handling, without "depth" and "table".
preprocessor_2 = ColumnTransformer(transformers=_base_transformers())

In [5]:
# Short test with LinearRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Same regressor behind each preprocessing variant, so the two results
# differ only in how the features were prepared.
lin_reg_1 = Pipeline(steps=[
    ("preprocessor", preprocessor_1),
    ("regressor", LinearRegression()),
])
lin_reg_2 = Pipeline(steps=[
    ("preprocessor", preprocessor_2),
    ("regressor", LinearRegression()),
])

# Label -> pipeline; the labels are what gets printed next to each score.
models = dict(preprocessed=lin_reg_1, normal=lin_reg_2)

In [6]:
# Fit every pipeline on the training data and report the RMSE of its
# predictions on that same data (an in-sample figure, no held-out rows).
print("RMSE".center(80, "-"))
for label, pipeline in models.items():
    pipeline.fit(X_train, y_train)
    train_pred = pipeline.predict(X_train)
    train_mse = mean_squared_error(y_train, train_pred)
    train_rmse = np.sqrt(train_mse)
    print(f"{label}: {train_rmse}")

--------------------------------------RMSE--------------------------------------
preprocessed: 1207.325241152875
normal: 1213.3749853057368


In [7]:
# Try other models, this time with k-fold cross validation
from sklearn.base import clone
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
# NOTE(review): RandomForestClassifier is unused below and is a classifier; the
# target here is a continuous price, so RandomForestRegressor would be the
# matching estimator if a forest is added to the comparison.
from sklearn.ensemble import RandomForestClassifier


def _with_preprocessor(regressor):
    """Wrap `regressor` in a pipeline that owns its own copy of preprocessor_1.

    Pipeline.fit fits its steps in place, so handing the same ColumnTransformer
    instance to several pipelines means fitting one of them silently changes
    the transformer inside all the others. clone() gives each pipeline an
    independent, unfitted copy with identical parameters.
    """
    return Pipeline([
        ("preprocessor", clone(preprocessor_1)),
        ("regressor", regressor),
    ])


# Candidate name -> pipeline. The names are used as headers when scores are printed.
models = {
    "linear": _with_preprocessor(LinearRegression()),
    "ridge": _with_preprocessor(Ridge()),
    "kneighbors": _with_preprocessor(KNeighborsRegressor()),
    # Fixed seed so the tree's cross-validation scores are reproducible between runs.
    "tree": _with_preprocessor(DecisionTreeRegressor(random_state=42)),
}

In [8]:
# 5-fold cross-validation of every candidate. The scorer returns negated RMSE
# values, so the sign is flipped back before summarising.
for name, pipeline in models.items():
    fold_rmse = -cross_val_score(
        pipeline,
        X_train,
        y_train,
        scoring="neg_root_mean_squared_error",
        cv=5,
    )
    print(name.center(80, "-"))
    print(f"mean RMSE: {fold_rmse.mean()}")
    print(f"std RMSE: {fold_rmse.std()}")

-------------------------------------linear-------------------------------------
mean RMSE: 1262.24319811333
std RMSE: 125.46608457984283
-------------------------------------ridge--------------------------------------
mean RMSE: 1258.8673928680375
std RMSE: 118.91616355090899
-----------------------------------kneighbors-----------------------------------
mean RMSE: 737.6823173216465
std RMSE: 35.56898551605093
--------------------------------------tree--------------------------------------
mean RMSE: 766.3503902319837
std RMSE: 17.40123299277233


In [9]:
from src.load import load_predict

In [10]:
# Features to predict on; this frame is passed to knn.predict() when the submission is built.
X_pred = load_predict()

In [11]:
# Display the prediction features (long frames are shown truncated).
X_pred

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.33,Very Good,I,IF,62.0,58.0,4.44,4.46,2.76
1,1.21,Very Good,D,SI2,62.4,58.0,6.77,6.83,4.24
2,1.06,Very Good,D,SI1,59.3,60.0,6.64,6.71,3.96
3,0.36,Ideal,E,VVS1,61.4,57.0,4.64,4.61,2.54
4,0.70,Ideal,E,VS1,62.3,54.0,5.67,5.72,3.55
...,...,...,...,...,...,...,...,...,...
13480,2.04,Good,D,SI1,61.9,60.0,8.15,8.11,5.03
13481,0.31,Ideal,G,IF,61.7,55.0,4.37,4.39,2.70
13482,0.52,Ideal,D,VVS2,62.0,56.0,5.14,5.18,3.20
13483,0.50,Premium,F,SI1,59.1,58.0,5.23,5.19,3.08


In [14]:
# Take the k-nearest-neighbours pipeline (lowest mean RMSE in the cross-validation
# run above) and fit it on the whole training set so it can be used for prediction.
knn = models["kneighbors"]
knn.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['carat', 'x', 'y', 'z']),
                                                 ('cat',
                                                  OrdinalEncoder(categories=[['Fair',
                                                                              'Good',
                                                                              'Very '
                                                                              'Good',
                                                                              'Premium',
                                                                              'Ideal'],
                                                                             ['J',
                                          

In [17]:
import pandas as pd

In [32]:
# Predict prices and key them by the ids of the rows they were predicted for.
# Reusing X_pred.index (instead of a fresh 0..n-1 range) keeps every price
# attached to the right id even if the input ids are not a contiguous,
# ordered range. rename_axis returns a new frame, so X_pred's own index is
# left untouched while the CSV index header is guaranteed to be "id".
y_pred = pd.DataFrame(
    {"price": knn.predict(X_pred)},
    index=X_pred.index,
).rename_axis("id")

In [34]:
# Write the predictions, index included, to the submission file.
submission_path = "../submissions/submission-01.csv"
y_pred.to_csv(submission_path)

In [33]:
# Display the predicted prices (long frames are shown truncated).
y_pred

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,882.4
1,5160.2
2,5577.2
3,1008.6
4,3307.2
...,...
13480,16852.0
13481,880.6
13482,2715.4
13483,1389.2
