In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
file_path = "train.csv"
orig_data_path = "abalone.csv"

df = pd.read_csv(file_path,index_col='id')
orig_data = pd.read_csv(orig_data_path)

In [3]:
df.columns

Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1',
       'Whole weight.2', 'Shell weight', 'Rings'],
      dtype='object')

In [4]:
columns = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']

df.columns = orig_data.columns = columns

df = pd.concat([df,orig_data])
df.reset_index(inplace=True, drop=True)

In [5]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94792 entries, 0 to 94791
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             94792 non-null  object 
 1   Length          94792 non-null  float64
 2   Diameter        94792 non-null  float64
 3   Height          94792 non-null  float64
 4   Whole weight    94792 non-null  float64
 5   Shucked weight  94792 non-null  float64
 6   Viscera weight  94792 non-null  float64
 7   Shell weight    94792 non-null  float64
 8   Rings           94792 non-null  int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 6.5+ MB


In [7]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Length,94792.0,0.517402,0.118308,0.075,0.445,0.545,0.6,0.815
Diameter,94792.0,0.401952,0.098088,0.055,0.345,0.425,0.47,0.65
Height,94792.0,0.135643,0.038193,0.0,0.11,0.14,0.16,1.13
Whole weight,94792.0,0.790785,0.459231,0.002,0.4195,0.7995,1.0725,2.8255
Shucked weight,94792.0,0.341597,0.205267,0.001,0.1775,0.33,0.4645,1.488
Viscera weight,94792.0,0.169914,0.101334,0.0005,0.0865,0.166,0.233,0.76
Shell weight,94792.0,0.226468,0.130639,0.0015,0.12,0.225,0.305,1.005
Rings,94792.0,9.707233,3.178704,1.0,8.0,9.0,11.0,29.0


In [8]:
no_of_zero_height_rows = len(np.where((df['Height']<=0))[0])
no_of_zero_height_rows

8

In [9]:
df = df[(df['Height']>0)].reset_index(drop=True)

In [10]:
num_features = df.drop('Rings',axis=1).select_dtypes(exclude=object).columns
num_features

Index(['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
       'Viscera weight', 'Shell weight'],
      dtype='object')

In [11]:
from scipy.stats import zscore

z_score = abs(zscore(df[num_features]))

In [12]:
len(np.where(z_score>3)[0]) / len(df) * 100

2.1912981093855506

In [13]:
df = df[(z_score < 3).all(axis=1)].reset_index(drop=True)

In [14]:
df.shape

(93394, 9)

In [15]:
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OrdinalEncoder, PowerTransformer, OneHotEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer,make_column_selector

boxcox_pipe = make_pipeline(
    PowerTransformer(method='box-cox'),
    StandardScaler()
)

sqrt_pipe = make_pipeline(
    FunctionTransformer(lambda x:np.sqrt(x),feature_names_out='one-to-one'),
    StandardScaler()
)

preprocessing = ColumnTransformer([
    ('ohe',OneHotEncoder(), ['Sex']),
    ('sqrt',sqrt_pipe,['Length','Diameter']),
    ('std_scaler',StandardScaler(),['Height']),
    ('boxcox',boxcox_pipe,make_column_selector(pattern='^.*weight.*$'))
],remainder='drop')

In [16]:
from sklearn.model_selection import StratifiedKFold

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X = df.drop('Rings',axis=1)
y = df['Rings']

In [17]:
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.model_selection import cross_val_score

In [18]:
%%time

lgb_reg = lgb.LGBMRegressor(verbose=-1)
lgb_pipe = make_pipeline(preprocessing,lgb_reg)

score = -cross_val_score(lgb_pipe,X,y.astype('float'),cv=skfold,scoring='neg_mean_squared_log_error').mean()

score**0.5

CPU times: user 7.87 s, sys: 225 ms, total: 8.09 s
Wall time: 5.32 s


0.1508399605900386

In [19]:
%%time

cat_reg = cat.CatBoostRegressor(verbose=False,allow_writing_files=False)
cat_pipe = make_pipeline(preprocessing,cat_reg)

score = -cross_val_score(cat_pipe,X,y.astype('float'),cv=5,scoring='neg_mean_squared_log_error').mean()

score**0.5

CPU times: user 2min 52s, sys: 7.12 s, total: 2min 59s
Wall time: 31.5 s


0.14986470610868322

In [20]:
%%time

xgb_reg = xgb.XGBRegressor(random_state=42)
xgb_pipe = make_pipeline(preprocessing,xgb_reg)
score = -cross_val_score(xgb_pipe,X,y.astype('float'),cv=5,scoring='neg_mean_squared_log_error').mean()

score**0.5

CPU times: user 13.6 s, sys: 117 ms, total: 13.7 s
Wall time: 6.18 s


0.15146970879971014

In [21]:
%%time


hgb_reg = HistGradientBoostingRegressor()
hgb_pipe = make_pipeline(preprocessing,hgb_reg)

score = -cross_val_score(hgb_pipe,X,y.astype('float'),cv=5,scoring='neg_mean_squared_log_error').mean()

score**0.5

CPU times: user 12.7 s, sys: 62.7 ms, total: 12.8 s
Wall time: 6.92 s


0.1512351230662329

In [22]:
import optuna
from optuna.samplers import TPESampler

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
# def objective(trial):
#     params = {
#         'n_estimators':trial.suggest_int('n_estimators',100,1000),
#         'eta':trial.suggest_float('eta',0.001,0.1,log=True),
#         'max_depth':trial.suggest_int('max_depth',1,32),
#         'gamma':trial.suggest_float('gamma',1e-9,1.0),
#         'subsample':trial.suggest_float('subsample',0.2,1.0),
#         'colsample_bytree':trial.suggest_float('colsample_bytree',0.1,1.0),
#         'max_leaves':trial.suggest_int('max_leaves',1,32),
#         'grow_policy':trial.suggest_categorical('grow_policy',["depthwise","lossguide"]),
#     }
    
#     xgb_reg = xgb.XGBRegressor(**params,n_jobs=-1)
#     xgb_pipe = make_pipeline(preprocessing,xgb_reg)
#     score = -cross_val_score(xgb_pipe,X,y.astype('float'),cv=skfold,scoring='neg_mean_squared_log_error').mean()
#     return score**0.5

# storage = 'sqlite:///xgb_reg.db'
# study_name = 'xgb_reg'

# study = optuna.create_study(storage=storage,
#                             study_name = study_name,
#                             direction='minimize',
#                             sampler=TPESampler(),
#                             load_if_exists=True)

# study.optimize(objective,n_trials=100)

# print(f'Best Params: {study.best_params}')

In [24]:
# def objective(trial):
#     params = {
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
#         'num_leaves': trial.suggest_int('num_leaves', 31, 127),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
#         'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
#         'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
#     }
    
#     lgb_reg = lgb.LGBMRegressor(**params,verbose=-1)
#     lgb_pipe = make_pipeline(preprocessing, lgb_reg)
    
#     score = -cross_val_score(lgb_pipe,X,y.astype('float'),cv=skfold,scoring='neg_mean_squared_log_error').mean()
#     return score**0.5

# storage = 'sqlite:///lgb_reg.db'
# study_name = 'lgb_reg'

# study = optuna.create_study(storage=storage,
#                             study_name = study_name,
#                             direction='minimize',
#                             sampler=TPESampler(),
#                             load_if_exists=True)

# study.optimize(objective,n_trials=200)

# print(f"Best Params: {study.best_params}")

In [25]:
# def objective(trial):
#     params = {
#         'learning_rate':trial.suggest_float('learning_rate',0.001,0.1),
#         'max_iter':trial.suggest_int('max_iter',400,1200),
#         'max_leaf_nodes':trial.suggest_int('max_leaf_nodes',3,31),
#         'min_samples_leaf':trial.suggest_int('min_samples_leaf',20,80),
#         'max_depth':trial.suggest_int('max_depth',2,31),
#     }
    
#     hgb_reg = HistGradientBoostingRegressor(**params,)
#     hgb_pipe = make_pipeline(preprocessing,hgb_reg)
#     score = -cross_val_score(hgb_pipe,X,y,cv=skfold,scoring='neg_mean_squared_log_error').mean()
    
#     return score**0.5

# storage = 'sqlite:///hgb_reg.db'
# study_name = 'hgb_reg'

# study = optuna.create_study(storage=storage,
#                             study_name = study_name,
#                             direction='minimize',
#                             sampler=TPESampler(),
#                             load_if_exists=True)

# study.optimize(objective,n_trials=100,n_jobs=-1)

# print(f"Best Params: {study.best_params}")

In [26]:
# Values of hyperparameters for different models found after doing hyperparameter tuning using optuna.

xgb_params = {'n_estimators': 930, 
              'eta': 0.03559492336703499, 
              'max_depth': 31, 
              'gamma': 0.02459147647960671, 
              'subsample': 0.8734365336431481, 
              'colsample_bytree': 0.7738113845191218, 
              'max_leaves': 32, 
              'grow_policy': 'depthwise'} # 0.1498224873039473

lgb_params = {'learning_rate': 0.0951532045161572, 
              'num_leaves': 125, 
              'subsample': 0.5453288010142872, 
              'colsample_bytree': 0.5060077766853301, 
              'min_child_samples': 20, 
              'reg_alpha': 6.869195185341228, 
              'reg_lambda': 9.834768879372454,
              'verbose':-1} # 0.14945279261889158

hgb_params = {'learning_rate': 0.05272923028939806, 
              'max_iter': 964, 
              'max_leaf_nodes': 30, 
              'min_samples_leaf': 75, 
              'max_depth': 17} # 0.15042496734142297

In [27]:
cv_estimators=[
    ('xgb_reg',xgb.XGBRegressor(**xgb_params,n_jobs=-1)),
    ('cat_reg',cat_reg),
    ('lgb_reg',lgb.LGBMRegressor(**lgb_params)),
    ('hgb_reg',HistGradientBoostingRegressor(**hgb_params)),
]

In [28]:
# def objective(trial):
    
#     cv_weights = {
#         'xgb':trial.suggest_float('xgb',0.0,5.0),
#         'cat':trial.suggest_float('cat',0.0,5.0),
#         'lgb':trial.suggest_float('lgb',0.0,5.0),
#         'hgb':trial.suggest_float('hgb',0.0,5.0)
#     }

#     voting_reg = VotingRegressor(estimators = cv_estimators,
#                                 weights = list(cv_weights.values()))
#     voting_pipe = make_pipeline(preprocessing,voting_reg)
#     score = -cross_val_score(voting_pipe,X,y.astype('float'),cv=skfold,scoring='neg_mean_squared_log_error').mean()

#     return score**0.5

# study_name = 'voting_reg'
# storage_name = 'sqlite:///voting_reg.db'

# study = optuna.create_study(storage=storage,
#                             study_name = study_name,
#                             direction='minimize',
#                             sampler=TPESampler(),
#                             load_if_exists=True)

# study.optimize(objective,n_trials=30)

# print(f"Best Params: {study.best_params}")

In [29]:
voting_weights = {'xgb': 1.3289247484328703, 
                  'cat': 2.615183672816546, 
                  'lgb': 4.842304034364629, 
                  'hgb': 0.19767973325946345} # 0.1490903401657186

In [30]:
from sklearn.ensemble import VotingRegressor

voting_reg = VotingRegressor(estimators = cv_estimators,
                                weights = list(voting_weights.values()))

In [31]:
model = Pipeline([
    ('preprocessing',preprocessing),
    ('voting_reg',voting_reg)
])

In [32]:
%%time

model.fit(X,y)

CPU times: user 1min, sys: 1.4 s, total: 1min 1s
Wall time: 10.4 s


In [33]:
test_data = pd.read_csv('test.csv',index_col='id')

In [34]:
test_data.columns = columns[:-1]

In [35]:
pred = model.predict(test_data)

In [36]:
pred

array([ 9.68369436,  9.79290538, 10.05286464, ..., 12.32352146,
       13.50785796,  8.51235149])

In [37]:
sub = pd.DataFrame()
sub['id'] = test_data.index
sub['Rings'] = pred

sub.to_csv('submission.csv',index=False)