## Goal of this notebook: reach a score (mean absolute error, lower is better) below 1.37 on the crab age competition
Many entries on the leaderboard are around 1.33.


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
import xgboost as xg

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

Read synthetic data to enlarge the training dataset

In [2]:
# Load the competition training set and the separately generated synthetic set.
df_train = pd.read_csv('/kaggle/input/playground-series-s3e16/train.csv')
df_synth = pd.read_csv('/kaggle/input/make-synthetic-crab-age-data/synthetic_data.csv')

# Stack both sources into one frame. ignore_index=True gives the result a fresh
# 0..n-1 index; without it both frames contribute their own row labels starting
# at 0, so label-based access on the combined frame would hit duplicate labels.
df_combined = pd.concat((df_train, df_synth), ignore_index=True)

# Hand-rolled one-hot flags for Sex: one boolean column per label (M, F, I).
# sex_ohe = OneHotEncoder().fit_transform(df_combined[['Sex']]) # why not to use this one?
for s in 'MFI':
    df_combined[f'is_{s}'] = df_combined.Sex == s

# Only the last expression of a cell is displayed, so preview once, at the end.
df_combined.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age,is_M,is_F,is_I
0,0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9.0,False,False,True
1,1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8.0,False,False,True
2,2,M,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9.0,True,False,False
3,3,F,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11.0,False,True,False
4,4,I,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8.0,False,False,True


In [3]:
# Preview the synthetic rows on their own; the is_M / is_F / is_I flags were
# added to df_combined only, so they do not appear here.
df_synth.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,F,1.55,1.2,0.4125,26.435909,8.320578,5.244657,9.029316,20.0
1,1,I,1.3625,1.1,0.375,21.729892,9.525432,4.819415,6.80388,9.0
2,2,I,0.65,0.5,0.175,2.721552,1.048931,0.524466,0.850485,5.0
3,3,M,1.3625,1.0125,0.35,18.809893,7.93786,4.124852,5.953395,9.0
4,4,I,1.1,0.775,0.2875,10.276694,4.946988,2.197086,2.83495,7.0


In [4]:
# Which category labels does the Sex column take in the competition data?
df_train["Sex"].unique()

array(['I', 'M', 'F'], dtype=object)

In [5]:

# Columns to one-hot encode, each paired with its category labels in a fixed
# order so the encoded columns always come out in that same order.
categories = [('Sex', ['F', 'I', 'M'])]
ohe_columns = [column for column, _ in categories]
ohe_categories = [labels for _, labels in categories]

enc = OneHotEncoder(categories=ohe_categories, sparse_output=False)

# The column transformer encodes only the listed columns and passes every other
# column through unchanged. It returns a plain array, not a DataFrame.
transformer = make_column_transformer(
    (enc, ohe_columns),
    remainder='passthrough',
)
transformed = transformer.fit_transform(df_train)

# Wrap the array back into a DataFrame, reusing the transformer's generated
# column names and the original row index.
df_train_dv = pd.DataFrame(
    transformed,
    index=df_train.index,
    columns=transformer.get_feature_names_out(),
)



In [6]:
# Target: the Age column, under the prefixed name the column transformer gave it.
y = df_train_dv["remainder__Age"]
y.head()

0     9.0
1     8.0
2     9.0
3    11.0
4     8.0
Name: remainder__Age, dtype: float64

In [7]:
# Features: every transformed column except the target and the row identifier.
X = df_train_dv.drop(columns=["remainder__Age", "remainder__id"])
X.head()
# X.shape

Unnamed: 0,onehotencoder__Sex_F,onehotencoder__Sex_I,onehotencoder__Sex_M,remainder__Length,remainder__Diameter,remainder__Height,remainder__Weight,remainder__Shucked Weight,remainder__Viscera Weight,remainder__Shell Weight
0,0.0,1.0,0.0,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928
1,0.0,1.0,0.0,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194
2,0.0,0.0,1.0,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133
3,1.0,0.0,0.0,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885
4,0.0,1.0,0.0,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395


In [8]:
# Same preview as the previous cell; X has not been modified in between.
X.head()


Unnamed: 0,onehotencoder__Sex_F,onehotencoder__Sex_I,onehotencoder__Sex_M,remainder__Length,remainder__Diameter,remainder__Height,remainder__Weight,remainder__Shucked Weight,remainder__Viscera Weight,remainder__Shell Weight
0,0.0,1.0,0.0,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928
1,0.0,1.0,0.0,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194
2,0.0,0.0,1.0,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133
3,1.0,0.0,0.0,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885
4,0.0,1.0,0.0,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395


In [9]:
from sklearn.model_selection import train_test_split

# Hold out part of the competition data for validation, using the default
# split proportions and a fixed seed so the split is the same on every run.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    random_state=0,
)


In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Baseline: a random forest with default settings and a fixed seed,
# scored on the hold-out split by mean absolute error.
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)
age_preds = forest_model.predict(val_X)
forest_mae = mean_absolute_error(val_y, age_preds)
print(forest_mae)

1.458411386593205


In [11]:


# Gradient-boosted trees through the scikit-learn wrapper, with only nine
# boosting rounds and a fixed seed.
xgb_r = xg.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=9,
    seed=123,
)
xgb_r.fit(train_X, train_y)

# Score on the hold-out split by mean absolute error.
pred_xgb = xgb_r.predict(val_X)
print(mean_absolute_error(val_y, pred_xgb))

1.369068211071406


In [12]:
# The native (learning) API works on DMatrix containers, so wrap the training
# and hold-out splits first.
train_dmatrix = xg.DMatrix(data=train_X, label=train_y)
test_dmatrix = xg.DMatrix(data=val_X, label=val_y)

# Booster configuration: a linear base learner instead of trees.
param = {
    "booster": "gblinear",
    "objective": "reg:squarederror",
    'alpha': 10,
    'learning_rate': 1.0,
}

# Note: this rebinds xgb_r from the scikit-learn regressor to a native Booster.
xgb_r = xg.train(params=param, dtrain=train_dmatrix, num_boost_round=100)
pred = xgb_r.predict(test_dmatrix)

print(mean_absolute_error(val_y, pred))

1.9097806590499635


In [13]:
# NOTE: this whole cell is disabled (commented out). It sketches an exhaustive
# hyperparameter search for an XGBoost regressor on the train_X / train_y split.
# The grid below has 2 * 2 * 4 = 16 combinations, each fitted with cv=5.
# NOTE(review): no `scoring` argument is passed to GridSearchCV, so the search
# presumably ranks candidates by the estimator's default score rather than by
# mean absolute error - confirm before re-enabling.
# #=========================================================================
# # XGBoost regression: 
# # Parameters: 
# # n_estimators  "Number of gradient boosted trees. Equivalent to number 
# #                of boosting rounds."
# # learning_rate "Boosting learning rate (also known as “eta”)"
# # max_depth     "Maximum depth of a tree. Increasing this value will make 
# #                the model more complex and more likely to overfit." 
# #=========================================================================
# regressor=xg.XGBRegressor(eval_metric='mae')

# #=========================================================================
# # exhaustively search for the optimal hyperparameters
# #=========================================================================
# from sklearn.model_selection import GridSearchCV
# # set up our search grid
# param_grid = {"max_depth":    [6, 10],
#               "n_estimators": [700, 1000],
#               "learning_rate": [0.015, 0.2, 0.4, 0.8]}

# # try out every combination of the above values
# search = GridSearchCV(regressor, param_grid, cv=5).fit(train_X, train_y)

# print("The best hyperparameters are ",search.best_params_)

In [14]:
# NOTE: this whole cell is disabled (commented out). It refits an XGBoost
# regressor with the best hyperparameters found by `search`, which is only
# defined by the (also disabled) grid-search cell, and scores it on val_X / val_y.
# The module alias is `xg` (see the import cell), not `xgb`.
# regressor=xg.XGBRegressor(learning_rate = search.best_params_["learning_rate"],
#                           n_estimators  = search.best_params_["n_estimators"],
#                           max_depth     = search.best_params_["max_depth"],
#                           eval_metric='mae')

# regressor.fit(train_X, train_y)

# #=========================================================================
# # To use early_stopping_rounds: 
# # "Validation metric needs to improve at least once in every 
# # early_stopping_rounds round(s) to continue training."
# #=========================================================================
# # first perform a test/train split 
# #from sklearn.model_selection import train_test_split

# #X_train,X_test,y_train,y_test = train_test_split(X_train,y_train, test_size = 0.2)
# #regressor.fit(X_train, y_train, early_stopping_rounds=6, eval_set=[(X_test, y_test)], verbose=False)

# #=========================================================================
# # use the model to predict the ages for the validation data
# #=========================================================================
# predictions_best_hyper_params = regressor.predict(val_X)
# print(mean_absolute_error(val_y, predictions_best_hyper_params))

In [15]:
# NOTE: this whole cell is disabled (commented out). It sketches a feature
# importance plot limited to the top 8 features.
# TODO: the plot_importance(...) call below has no first argument; the fitted
# model to plot must be filled in before this cell can be re-enabled.
# from xgboost import plot_importance
# import matplotlib.pyplot as plt
# plt.style.use('fivethirtyeight')
# plt.rcParams.update({'font.size': 16})

# fig, ax = plt.subplots(figsize=(12,6))
# plot_importance(
#     , max_num_features=8, ax=ax)
# plt.show();

Set up cross-validation of a decision tree on the combined dataset (training + synthetic rows)

In [16]:
# 10-fold stratified cross-validation of a depth-limited decision tree on the
# combined (competition + synthetic) rows, scored by mean absolute error.

# One fixed seed for the fold shuffle and for the tree, so the printed scores
# repeat on every run, in line with the seeded models elsewhere in this notebook.
CV_SEED = 0

df_inp = df_combined
feature_columns = ['Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight',
                   'Viscera Weight', 'Shell Weight', 'is_M', 'is_F', 'is_I']

# The selection mixes float measurements with boolean is_* flags; asking for a
# float dtype explicitly yields one numeric array (flags become 0.0 / 1.0)
# instead of a generic object-dtype array.
X_train = df_inp[feature_columns].to_numpy(dtype=float)
y_train = df_inp['Age'].to_numpy()

# Stratify on Age so that every fold sees a similar mix of ages.
kfold = StratifiedKFold(10, shuffle=True, random_state=CV_SEED).split(X_train, y_train)
my_tree = DecisionTreeClassifier(max_depth=8, random_state=CV_SEED)

scores = []
for train, test in kfold:
    # fit() retrains the tree from scratch on each fold's training rows.
    my_tree.fit(X_train[train], y_train[train])
    out = my_tree.predict(X_train[test])
    # A classifier returns age labels it was trained on, so rounding looks like
    # a no-op here; it is kept so a regressor could be swapped in.
    # Idea: add a small constant bias (the error seems to be biased) or tune a
    # division factor before rounding.
    out = np.round(out)
    # Mean absolute error on this fold's held-out rows.
    score = np.abs(out - y_train[test]).mean()
    scores.append(score)
print(scores)
print(np.mean(scores))


[1.4665600852651213, 1.4570286475682879, 1.441039307128581, 1.4668887408394404, 1.4491672218520986, 1.4305129913391073, 1.4670219853431046, 1.4421052631578948, 1.4450366422385077, 1.4591605596269155]
1.452452144435906
