In [10]:
import numpy as np
import pandas as pd
import altair as alt
# Switch Altair to the VegaFusion data transformer for every chart below.
# NOTE(review): presumably needed because the training frame (~90k rows)
# exceeds Altair's default row limit — confirm.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [11]:
# Training data: the first CSV column becomes the row index (index_col=0).
train_df = pd.read_csv('data/train.csv', index_col=0)
# Test data is read without index_col, so every CSV column (including `id`,
# which the submission cell selects later) stays a regular column.
test_df = pd.read_csv('data/test.csv')
train_df.head()

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [12]:
# Optionally trim the extreme right tail of the target before modelling.
TRUNCATE = False

if TRUNCATE:
    # Compute the cut-off first rather than embedding train_df['Rings'] in a
    # single-quoted f-string: reusing the enclosing quote inside an f-string
    # is a SyntaxError on Python < 3.12 (even while TRUNCATE is False).
    rings_cutoff = train_df['Rings'].quantile(0.997)
    # Keep only rows strictly below the 99.7th percentile of Rings.
    train_df = train_df[train_df['Rings'] < rings_cutoff]

In [13]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 90615 entries, 0 to 90614
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             90615 non-null  object 
 1   Length          90615 non-null  float64
 2   Diameter        90615 non-null  float64
 3   Height          90615 non-null  float64
 4   Whole weight    90615 non-null  float64
 5   Whole weight.1  90615 non-null  float64
 6   Whole weight.2  90615 non-null  float64
 7   Shell weight    90615 non-null  float64
 8   Rings           90615 non-null  int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 6.9+ MB


None

In [14]:
# Separate the features from the target; the test frame is only copied.
target_col = "Rings"
X_train = train_df.drop(columns=[target_col])
# The target is kept as a one-column DataFrame (not a Series).
y_train = pd.DataFrame(train_df[target_col])
X_test = test_df.copy()

# Sanity check: one shape per line, in the order train X, train y, test X.
for frame in (X_train, y_train, X_test):
    print(frame.shape)

(90615, 8)
(90615, 1)
(60411, 9)


In [15]:
# Distribution of the target. The title now matches the plotted column
# name ("Rings"); it previously read "Ring".
alt.Chart(y_train, title='Histogram of Rings').mark_bar().encode(
    alt.X('Rings:Q').bin(maxbins=40),
    y='count()'
)

In [16]:
# 99.5th percentile of the target, to see where the right tail starts.
y_train.quantile(0.995)

Rings    21.0
Name: 0.995, dtype: float64

In [17]:
# Pairwise correlations of the numeric columns, rounded to three decimals
# and rendered with a background colour gradient.
corr_matrix = train_df.corr(numeric_only=True).round(decimals=3)
corr_matrix.style.background_gradient()

Unnamed: 0,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
Length,1.0,0.99,0.916,0.931,0.91,0.913,0.911,0.624
Diameter,0.99,1.0,0.92,0.934,0.908,0.915,0.918,0.637
Height,0.916,0.92,1.0,0.902,0.862,0.886,0.904,0.666
Whole weight,0.931,0.934,0.902,1.0,0.971,0.974,0.964,0.617
Whole weight.1,0.91,0.908,0.862,0.971,1.0,0.949,0.912,0.515
Whole weight.2,0.913,0.915,0.886,0.974,0.949,1.0,0.937,0.589
Shell weight,0.911,0.918,0.904,0.964,0.912,0.937,1.0,0.695
Rings,0.624,0.637,0.666,0.617,0.515,0.589,0.695,1.0


In [18]:
# imports
import sys, os
import time

import numpy as np
import pandas as pd
import altair as alt
from IPython.display import HTML

# Add the "code" directory under the current working directory to the module
# search path.
sys.path.append(os.path.join(os.path.abspath("."), "code"))

from IPython.display import display

# Classifiers and regressors
from sklearn.dummy import DummyClassifier, DummyRegressor

# Preprocessing and pipeline
from sklearn.impute import SimpleImputer

# train test split and cross validation
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import (
    MinMaxScaler,
    RobustScaler,
    OneHotEncoder,

    
    OrdinalEncoder,
    StandardScaler,
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import *
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.svm import *
from lightgbm.sklearn import *
from sklearn.model_selection import *
from xgboost import XGBRegressor

In [19]:
# Function to quickly cross validate different models
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Cross-validate a model and summarise every metric as "mean (+/- std)".

    Parameters
    ----------
    model :
        scikit-learn compatible estimator or pipeline
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        Extra keyword arguments forwarded to ``cross_validate`` (for example
        ``scoring`` or ``return_train_score``). ``n_jobs`` defaults to -1
        and may be overridden by the caller.

    Returns
    ----------
        pandas Series of strings formatted as "mean (+/- std)", indexed by
        the keys of the ``cross_validate`` result
    """
    # Default to all cores without clashing with a caller-supplied value:
    # hard-coding n_jobs next to **kwargs raised
    # "got multiple values for keyword argument 'n_jobs'".
    kwargs.setdefault("n_jobs", -1)

    # Build the per-fold score table once and derive both statistics from it.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    mean_scores = scores.mean()
    std_scores = scores.std()

    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [20]:
# Column groups for preprocessing: `Sex` is the only categorical feature,
# every remaining feature column is treated as numeric.
categorical_feats = ['Sex']
# Misspelled legacy name kept as an alias so existing references keep working.
categorcial_feats = categorical_feats
# Derive the numeric list from the categorical one instead of repeating 'Sex'.
numerical_feats = [col for col in X_train.columns if col not in categorical_feats]

# One-hot encode the categorical column; scale then median-impute the numeric ones.
categorical_pipe = make_pipeline(OneHotEncoder(drop='if_binary', handle_unknown='ignore'))
numerical_pipe = make_pipeline(RobustScaler(), SimpleImputer(strategy='median'))

preprocessor_with_scaler = make_column_transformer(
    (categorical_pipe, categorical_feats),
    (numerical_pipe, numerical_feats),
)

print(categorical_feats)
print(numerical_feats)
preprocessor_with_scaler

['Sex']
['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']


In [21]:
# Show the training frame (the notebook display truncates it to the first and last rows).
train_df

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,F,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,11
1,F,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,11
2,I,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,6
3,M,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,10
4,I,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,9
...,...,...,...,...,...,...,...,...,...
90610,M,0.335,0.235,0.075,0.1585,0.0685,0.0370,0.0450,6
90611,M,0.555,0.425,0.150,0.8790,0.3865,0.1815,0.2400,9
90612,I,0.435,0.330,0.095,0.3215,0.1510,0.0785,0.0815,6
90613,I,0.345,0.270,0.075,0.2000,0.0980,0.0490,0.0700,6


In [22]:
# Dictionary for storing model scores: maps a model label to the Series of
# formatted cross-validation scores returned by mean_std_cross_val_scores.
results_dict = {}

In [23]:
# Baseline: shared preprocessing followed by an XGBoost regressor with
# fixed seed and otherwise default hyperparameters.
xgboost = make_pipeline(
    preprocessor_with_scaler,
    XGBRegressor(
        objective='reg:gamma',  # so that the predictions would be non-negative
        random_state=123,
        n_jobs=-1,
        verbosity=0,
    ),
)

# Store the baseline's cross-validated scores under its own label.
results_dict["xgboost"] = mean_std_cross_val_scores(
    xgboost,
    X_train,
    y_train,
    scoring='neg_root_mean_squared_log_error',
    return_train_score=True,
)
pd.DataFrame(results_dict).T

Unnamed: 0,fit_time,score_time,test_score,train_score
xgboost,0.771 (+/- 0.020),0.040 (+/- 0.000),-0.151 (+/- 0.002),-0.137 (+/- 0.001)


In [24]:
# Master switch for the hyperparameter search: when True the randomized
# searches below are run and their results written to disk; when False the
# previously saved CV results and model are loaded instead.
TUNING = True

In [25]:
# Search space for RandomizedSearchCV. Keys carry the "xgbregressor__" prefix
# so they are routed to the XGBRegressor step of the pipeline; every value is
# a discrete grid that candidates are sampled from.
# NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`, and
# both are searched here over different ranges — presumably only one of them
# takes effect; confirm and drop the other.
# NOTE(review): `lambda` / `alpha` look like the native aliases of
# `reg_lambda` / `reg_alpha` — confirm against the XGBoost parameter docs.
param_grid = {
    "xgbregressor__learning_rate": np.arange(0.01, 0.1, 0.001),
    "xgbregressor__max_depth": np.arange(3, 8, 1),
    "xgbregressor__max_leaves": np.arange(100, 1001, 20),
    "xgbregressor__n_estimators": np.arange(100, 601, 20),
    "xgbregressor__gamma": np.arange(0, 10, 0.5),
    "xgbregressor__lambda": np.arange(0, 100, 0.5),
    "xgbregressor__alpha": np.arange(0, 100, 0.5),
    "xgbregressor__eta": np.arange(0.3, 1, 0.05),
    "xgbregressor__min_child_weight": np.arange(0, 10, 0.5),
    "xgbregressor__max_delta_step": np.arange(0, 10, 1),
    "xgbregressor__subsample": np.arange(0.3, 0.7, 0.02),
}

In [26]:
# Smoke run: sample only a handful of candidates to gauge how long the
# full search will take.
if TUNING:
    random_search = RandomizedSearchCV(
        estimator=xgboost,
        param_distributions=param_grid,
        n_iter=5,
        scoring='neg_root_mean_squared_log_error',
        return_train_score=True,
        random_state=123,
        n_jobs=-1,
    )
    random_search.fit(X_train, y_train)

In [27]:
# Full randomized search over 2500 sampled candidates (expensive; only run
# when TUNING is enabled). Overwrites the search object from the smoke run.
if TUNING:
    random_search = RandomizedSearchCV(
        estimator=xgboost,
        param_distributions=param_grid,
        n_iter=2500,
        scoring='neg_root_mean_squared_log_error',
        return_train_score=True,
        random_state=123,
        n_jobs=-1,
    )
    random_search.fit(X_train, y_train)

In [28]:
if TUNING:
    # Keep the test score, one column per searched hyperparameter, and the
    # fit time. The parameter columns are derived from param_grid so this
    # list cannot drift out of sync with the search space.
    result_cols = (
        ["mean_test_score"]
        + [f"param_{name}" for name in param_grid]
        + ["mean_fit_time", "rank_test_score"]
    )

    # Rank becomes the (sorted) index, then transpose so each candidate is a
    # column and the best candidate comes first.
    cv_result_df = (
        pd.DataFrame(random_search.cv_results_)[result_cols]
        .set_index("rank_test_score")
        .sort_index()
        .T
    )

    # Cache the table so the notebook can be re-run with TUNING = False.
    cv_result_df.to_csv('model/xgboost_cv_result.csv')
else:
    cv_result_df = pd.read_csv('model/xgboost_cv_result.csv', index_col=0)

# Only the cell's last top-level expression is displayed; the bare
# `cv_result_df` that used to sit inside the `if` branch had no effect.
cv_result_df

rank_test_score,1,2,3,4,5,6,7,8,9,10,...,2491,2492,2493,2494,2495,2496,2497,2498,2499,2500
mean_test_score,-0.149462,-0.149556,-0.149689,-0.149951,-0.14996,-0.14997,-0.150179,-0.150356,-0.150383,-0.150389,...,-0.495117,-0.495772,-0.508348,-0.535022,-0.535418,-0.539524,-0.540743,-0.545073,-0.579062,-0.617962
param_xgbregressor__learning_rate,0.097,0.074,0.097,0.095,0.043,0.085,0.048,0.086,0.089,0.064,...,0.012,0.012,0.014,0.011,0.011,0.011,0.011,0.013,0.012,0.011
param_xgbregressor__max_depth,7.0,7.0,7.0,5.0,6.0,5.0,6.0,5.0,6.0,4.0,...,3.0,3.0,7.0,6.0,6.0,3.0,6.0,5.0,5.0,3.0
param_xgbregressor__max_leaves,380.0,320.0,340.0,140.0,840.0,380.0,820.0,300.0,200.0,660.0,...,500.0,100.0,520.0,660.0,540.0,480.0,200.0,240.0,860.0,580.0
param_xgbregressor__n_estimators,600.0,400.0,200.0,580.0,360.0,260.0,600.0,400.0,360.0,540.0,...,120.0,120.0,100.0,120.0,120.0,120.0,120.0,100.0,100.0,100.0
param_xgbregressor__gamma,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,5.0,4.5,2.0,1.5,0.5,3.0,3.0,5.0,7.5
param_xgbregressor__lambda,80.0,55.0,90.0,11.5,71.0,62.5,72.5,69.5,68.0,79.5,...,78.0,20.5,45.0,51.5,86.0,85.5,13.5,55.0,74.0,82.0
param_xgbregressor__alpha,8.5,6.5,1.5,16.5,3.5,2.0,17.0,15.5,19.0,11.0,...,35.5,86.5,36.0,17.0,11.0,84.0,97.5,82.5,66.0,46.0
param_xgbregressor__eta,0.55,0.65,0.5,0.7,0.7,0.95,0.9,0.6,0.4,0.7,...,0.55,0.95,0.75,0.35,0.85,0.35,0.65,0.45,0.5,0.7
param_xgbregressor__min_child_weight,7.0,1.0,3.0,2.5,5.5,4.5,3.0,3.5,4.0,9.0,...,8.0,1.0,0.0,1.0,0.0,1.0,2.5,3.0,2.0,7.0


In [29]:
# cv_result_df stores candidates as columns; transpose back so that each row
# is one sampled candidate.
cv_candidates = cv_result_df.T
# Plot every column except the first (mean_test_score, used as the x axis)
# and the last one.
hyperparam_cols = cv_candidates.columns.to_list()[1:-1]

# One scatter panel per hyperparameter: value (y) against mean test score (x).
fig_hyperparam = (
    alt.Chart(cv_candidates)
    .mark_point(clip=True)
    .encode(
        x=alt.X('mean_test_score'),
        y=alt.Y(alt.repeat("row"), type='quantitative'),
    )
    .properties(width=900)
    .repeat(row=hyperparam_cols)
)

fig_hyperparam

In [30]:
import pickle

if TUNING:
    # The search was run on the full `xgboost` pipeline, so best_estimator_
    # is already a complete preprocessing + regressor pipeline. It is used
    # as-is: wrapping it in another make_pipeline() only added a redundant
    # nesting level (and one more on every save/load round trip).
    xgboost_opt = random_search.best_estimator_
else:
    # NOTE(review): this path differs from the file written by the
    # fit-and-save cell ('model/xgboost_opt_gamma_truncated_0.99.pkl') —
    # confirm which artifact is meant to be loaded here.
    # pickle.load can execute arbitrary code: only load trusted model files.
    with open('model/xgboost_opt.pkl', 'rb') as file:
        xgboost_opt = pickle.load(file)

# Cross-validate the tuned pipeline with the same metric as the baseline.
results_dict["xgboost optimized"] = mean_std_cross_val_scores(
    xgboost_opt, X_train, y_train, return_train_score=True,
    scoring='neg_root_mean_squared_log_error'
)
pd.DataFrame(results_dict).T

Unnamed: 0,fit_time,score_time,test_score,train_score
xgboost,0.771 (+/- 0.020),0.040 (+/- 0.000),-0.151 (+/- 0.002),-0.137 (+/- 0.001)
xgboost optimized,4.747 (+/- 0.029),0.264 (+/- 0.002),-0.149 (+/- 0.002),-0.144 (+/- 0.000)


In [31]:
# Refit the tuned pipeline on the complete training set.
xgboost_opt.fit(X_train, y_train)

# Persist the fitted pipeline only when it came from a fresh search.
if TUNING:
    with open('model/xgboost_opt_gamma_truncated_0.99.pkl', 'wb') as model_file:
        pickle.dump(xgboost_opt, model_file)

xgboost_opt

In [32]:
# Sanity check: largest predicted ring count on the test set.
xgboost_opt.predict(X_test).max()

19.934395

In [33]:
# Peek at the in-sample predictions on the training features.
xgboost_opt.predict(X_train)

array([10.197204 , 10.837562 ,  4.0627966, ...,  6.951442 ,  6.8155866,
        7.5976353], dtype=float32)

In [34]:
# Predict first, then attach the result to a copy of the test frame so that
# X_test itself is left untouched.
test_predictions = xgboost_opt.predict(X_test)
pred = X_test.copy()
pred['Rings'] = test_predictions
pred[['Rings']].shape

(60411, 1)

In [35]:
# Write the submission file: only the `id` and predicted `Rings` columns, without the index.
# NOTE(review): the file name says "truncated_0.997" while TRUNCATE is False
# in the truncation cell — confirm the tag matches the data actually used.
pred[['id', 'Rings']].to_csv('prediction/xgboost_opt_gamma_truncated_0.997.csv', index=False)