In [1]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# List every file Kaggle has mounted under the input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

import optuna
import sklearn

from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor, HistGradientBoostingRegressor, ExtraTreesRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

/kaggle/input/playground-series-s4e4/sample_submission.csv
/kaggle/input/playground-series-s4e4/train.csv
/kaggle/input/playground-series-s4e4/test.csv


In [2]:
# Load the competition tables; the two reads are independent of each other.
train_df = pd.read_csv(filepath_or_buffer='/kaggle/input/playground-series-s4e4/train.csv')
test_df = pd.read_csv(filepath_or_buffer='/kaggle/input/playground-series-s4e4/test.csv')

In [3]:
def preproc(df):
    """Standardize the numeric columns of ``df`` and one-hot encode ``Sex``.

    The ``id`` column is dropped, every remaining column except ``Sex`` is
    passed through a ``StandardScaler`` fitted on ``df`` itself, and the result
    is handed to ``pd.get_dummies`` to expand ``Sex`` into indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``id`` column and a ``Sex`` column. Every other
        column is sent to the scaler.

    Returns
    -------
    pandas.DataFrame
        Scaled columns followed by the ``Sex`` indicator columns, carrying the
        same row index as ``df``.

    Notes
    -----
    A new scaler is fitted on every call, so two frames passed through this
    function separately are each scaled with their own statistics.
    """
    scaler = StandardScaler()

    df_no_id = df.drop('id', axis=1)
    df_no_sex = df_no_id.drop(columns=['Sex'])

    # Keep the input's index on the scaled frame. The assignment of 'Sex' below
    # aligns on index labels, so a fresh 0..n-1 index would attach the wrong
    # (or missing) values whenever ``df`` does not use the default index.
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df_no_sex),
        columns=df_no_sex.columns,
        index=df_no_sex.index,
    )
    df_scaled['Sex'] = df_no_id['Sex']

    return pd.get_dummies(df_scaled)

In [4]:
# Run the shared preprocessing on both tables.
train_df_preproc = preproc(train_df)
# Cast the test features to float32 so they reach the models in the same
# precision as the training matrix, which is cast to float32 before fitting.
test_df_preproc = preproc(test_df).astype(np.float32)

In [5]:
# Train on log1p(Rings); predictions are mapped back with expm1 further down.
rings = train_df['Rings']
target = np.log1p(rings)


In [6]:
# Feature matrix: the preprocessed training frame without the target, as float32.
X = train_df_preproc.drop('Rings', axis=1).astype(np.float32)


In [7]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, test_size=0.2, random_state=42
)

In [8]:
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` vs ``y_true``.

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Values on the original (non-log) scale, compared element by position.

    Returns
    -------
    numpy.float64
        The RMSLE of the two inputs.
    """
    # Convert to plain arrays first: subtracting two pandas Series aligns them
    # on index labels, which yields NaN wherever the labels do not match.
    true_log = np.log1p(np.asarray(y_true, dtype=float))
    pred_log = np.log1p(np.asarray(y_pred, dtype=float))
    return np.sqrt(np.mean(np.square(pred_log - true_log)))

In [9]:
"""
#Stacking

from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
import numpy as np

top_params = {
    'HistGradientBoostingRegressor': {'learning_rate' : 0.06418641358088736, 'max_depth' : 9,
                              'max_iter' : 854, 'max_leaf_nodes' : 39,
                              'min_samples_leaf': 7},
    'MLPRegressor': {'hidden_layer_sizes': (68, 18, 56), 'activation': 'relu', 'alpha': 5.0753793080466134e-05, 'learning_rate_init': 0.0032188937689512057},
    'RandomForestRegressor': {'max_depth': 19, 'max_features': 'log2', 'min_samples_split': 8, 'min_samples_leaf': 2, 'n_estimators' : 770, 'random_state' : 42},
    'ExtraTreesRegressor': {'n_estimators': 272, 'max_features': None, 'min_samples_split': 14, 'max_depth': 20}
}



# Define the base models with the tuned parameters
base_models = [
    ('HistGradientBoostingRegressor', HistGradientBoostingRegressor(**top_params['HistGradientBoostingRegressor'])),
    ('MLPRegressor', MLPRegressor(**top_params['MLPRegressor'])),
    ('RandomForestRegressor', RandomForestRegressor(**top_params['RandomForestRegressor'])),
    ('ExtraTreesRegressor', ExtraTreesRegressor(**top_params['ExtraTreesRegressor']))
]

# Create the stacking regressor ensemble with a linear regression meta-model
stacking_regressor = StackingRegressor(estimators=base_models,
                                        final_estimator=LinearRegression(),
                                        n_jobs=-1)  # n_jobs=-1 to use all available CPU cores

# Train the stacking regressor ensemble
stacking_regressor.fit(X_train, y_train)

# Make predictions on the validation set
stacking_predictions = stacking_regressor.predict(X_val)

# Compute RMSLE for the ensemble model
stacking_rmsle = rmsle(np.expm1(y_val), np.expm1(stacking_predictions))
print("Stacking RMSLE:", stacking_rmsle)

# Make predictions on the test set
stack_predictions_test = stacking_regressor.predict(test_df_preproc)

submission_df = pd.DataFrame({'id': test_df['id'], 'Rings': np.expm1(stack_predictions_test)})
"""

'\n#Stacking\n\nfrom sklearn.ensemble import StackingRegressor\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.metrics import mean_squared_log_error\nimport numpy as np\n\ntop_params = {\n    \'HistGradientBoostingRegressor\': {\'learning_rate\' : 0.06418641358088736, \'max_depth\' : 9,\n                              \'max_iter\' : 854, \'max_leaf_nodes\' : 39,\n                              \'min_samples_leaf\': 7},\n    \'MLPRegressor\': {\'hidden_layer_sizes\': (68, 18, 56), \'activation\': \'relu\', \'alpha\': 5.0753793080466134e-05, \'learning_rate_init\': 0.0032188937689512057},\n    \'RandomForestRegressor\': {\'max_depth\': 19, \'max_features\': \'log2\', \'min_samples_split\': 8, \'min_samples_leaf\': 2, \'n_estimators\' : 770, \'random_state\' : 42},\n    \'ExtraTreesRegressor\': {\'n_estimators\': 272, \'max_features\': None, \'min_samples_split\': 14, \'max_depth\': 20}\n}\n\n\n\n# Define the base models with the tuned parameters\nbase_models = [\n    (\'Hi

Stacking RMSLE: 0.15095807587620627

Stacking score (RMSLE): 0.14809 — best so far

In [10]:
# Disabled experiment: this cell is a single bare string literal, so none of the
# code inside it executes. It holds a VotingRegressor over the same four tuned
# base models, scored on the validation split and used to build a submission
# frame from the test predictions.
'''
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_log_error
import numpy as np

top_params = {
    'HistGradientBoostingRegressor': {'learning_rate' : 0.06418641358088736, 'max_depth' : 9,
                              'max_iter' : 854, 'max_leaf_nodes' : 39,
                              'min_samples_leaf': 7},
    'MLPRegressor': {'hidden_layer_sizes': (68, 18, 56), 'activation': 'relu', 'alpha': 5.0753793080466134e-05, 'learning_rate_init': 0.0032188937689512057},
    'RandomForestRegressor': {'max_depth': 19, 'max_features': 'log2', 'min_samples_split': 8, 'min_samples_leaf': 2, 'n_estimators' : 770, 'random_state' : 42},
    'ExtraTreesRegressor': {'n_estimators': 272, 'max_features': None, 'min_samples_split': 14, 'max_depth': 20}
}

# Define the base models
base_models = [
    ('HistGradientBoostingRegressor', HistGradientBoostingRegressor(**top_params['HistGradientBoostingRegressor'])),
    ('MLPRegressor', MLPRegressor(**top_params['MLPRegressor'])),
    ('RandomForestRegressor', RandomForestRegressor(**top_params['RandomForestRegressor'])),
    ('ExtraTreesRegressor', ExtraTreesRegressor(**top_params['ExtraTreesRegressor']))
]

# Create the voting regressor ensemble
voting_regressor = VotingRegressor(estimators=base_models, n_jobs=-1)  # n_jobs=-1 to use all available CPU cores

# Train the voting regressor ensemble
voting_regressor.fit(X_train, y_train)

# Make predictions on the validation set
voting_predictions = voting_regressor.predict(X_val)

# Compute RMSLE for the ensemble model
voting_rmsle = rmsle(np.expm1(y_val), np.expm1(voting_predictions))
print("Voting RMSLE:", voting_rmsle)

# Make predictions on the test set
voting_predictions_test = voting_regressor.predict(test_df_preproc)

submission_df = pd.DataFrame({'id': test_df['id'], 'Rings': np.expm1(voting_predictions_test)})
'''

'\nfrom sklearn.ensemble import VotingRegressor\nfrom sklearn.metrics import mean_squared_log_error\nimport numpy as np\n\ntop_params = {\n    \'HistGradientBoostingRegressor\': {\'learning_rate\' : 0.06418641358088736, \'max_depth\' : 9,\n                              \'max_iter\' : 854, \'max_leaf_nodes\' : 39,\n                              \'min_samples_leaf\': 7},\n    \'MLPRegressor\': {\'hidden_layer_sizes\': (68, 18, 56), \'activation\': \'relu\', \'alpha\': 5.0753793080466134e-05, \'learning_rate_init\': 0.0032188937689512057},\n    \'RandomForestRegressor\': {\'max_depth\': 19, \'max_features\': \'log2\', \'min_samples_split\': 8, \'min_samples_leaf\': 2, \'n_estimators\' : 770, \'random_state\' : 42},\n    \'ExtraTreesRegressor\': {\'n_estimators\': 272, \'max_features\': None, \'min_samples_split\': 14, \'max_depth\': 20}\n}\n\n# Define the base models\nbase_models = [\n    (\'HistGradientBoostingRegressor\', HistGradientBoostingRegressor(**top_params[\'HistGradientBoosting

Voting RMSLE: 0.15148521771560713

Score RMSLE: 0.1486

In [11]:

#weighting
# Disabled experiment: everything below is a single bare string literal, so none
# of it executes. It fits the four tuned base models separately, then uses an
# Optuna study to search per-model blend weights that minimise validation RMSLE.
# NOTE(review): the blend multiplies each model's predictions by its weight and
# then calls np.average without a `weights=` argument, i.e. a plain mean over
# the models, so the weights are never normalised by their sum.

"""
base_models = {
    'HistGradientBoostingRegressor': HistGradientBoostingRegressor,
    'MLPRegressor': MLPRegressor,
    'RandomForestRegressor': RandomForestRegressor,
    'ExtraTreesRegressor': ExtraTreesRegressor
}

# Top parameters for each model (replace these with your actual parameters)
top_params = {
    'HistGradientBoostingRegressor': {'learning_rate' : 0.06418641358088736, 'max_depth' : 9,
                              'max_iter' : 854, 'max_leaf_nodes' : 39,
                              'min_samples_leaf': 7},
    'MLPRegressor': {'hidden_layer_sizes': (68, 18, 56), 'activation': 'relu', 'alpha': 5.0753793080466134e-05, 'learning_rate_init': 0.0032188937689512057},
    'RandomForestRegressor': {'max_depth': 19, 'max_features': 'log2', 'min_samples_split': 8, 'min_samples_leaf': 2, 'n_estimators' : 770, 'random_state' : 42},
    'ExtraTreesRegressor': {'n_estimators': 272, 'max_features': None, 'min_samples_split': 14, 'max_depth': 20}
}


# Train each base model with the top hyperparameters
trained_models = {}
for model_name, model_class in base_models.items():
    model = model_class(**top_params.get(model_name, {}))
    model.fit(X_train, y_train)
    trained_models[model_name] = model

# Compute predictions for each base model on the validation set
base_model_predictions = {}
for model_name, model in trained_models.items():
    base_model_predictions[model_name] = np.expm1(model.predict(X_val))

def objective(trial):
    # Define the search space for weights
    weights = {}
    for model_name in base_models.keys():
        weights[model_name] = trial.suggest_float(model_name, 0.0, 2.0)

    # Compute ensemble prediction using weighted average
    ensemble_prediction = np.average([base_model_predictions[model_name] * weights[model_name] for model_name in base_models.keys()], axis=0)

    # Compute RMSLE for the ensemble model
    ensemble_rmsle = rmsle(np.expm1(y_val), ensemble_prediction)
    return ensemble_rmsle

# Optimize weights for the ensemble using Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best weights
best_weights = study.best_params
print("Best Weights:", best_weights)


# Compute ensemble prediction using the best weights
ensemble_prediction = np.average([base_model_predictions[model_name] * best_weights[model_name] for model_name in base_models.keys()], axis=0)

# Compute RMSLE for the ensemble model
ensemble_rmsle = rmsle(np.expm1(y_val), ensemble_prediction)
print("Ensemble RMSLE:", ensemble_rmsle)

"""

'\nbase_models = {\n    \'HistGradientBoostingRegressor\': HistGradientBoostingRegressor,\n    \'MLPRegressor\': MLPRegressor,\n    \'RandomForestRegressor\': RandomForestRegressor,\n    \'ExtraTreesRegressor\': ExtraTreesRegressor\n}\n\n# Top parameters for each model (replace these with your actual parameters)\ntop_params = {\n    \'HistGradientBoostingRegressor\': {\'learning_rate\' : 0.06418641358088736, \'max_depth\' : 9,\n                              \'max_iter\' : 854, \'max_leaf_nodes\' : 39,\n                              \'min_samples_leaf\': 7},\n    \'MLPRegressor\': {\'hidden_layer_sizes\': (68, 18, 56), \'activation\': \'relu\', \'alpha\': 5.0753793080466134e-05, \'learning_rate_init\': 0.0032188937689512057},\n    \'RandomForestRegressor\': {\'max_depth\': 19, \'max_features\': \'log2\', \'min_samples_split\': 8, \'min_samples_leaf\': 2, \'n_estimators\' : 770, \'random_state\' : 42},\n    \'ExtraTreesRegressor\': {\'n_estimators\': 272, \'max_features\': None, \'min_sa

Weighting

Ensemble RMSLE: 0.15157797556118202

Score is .14828

In [12]:
"""

base_model_predictions = {}
for model_name, model in trained_models.items():
    base_model_predictions[model_name] = np.expm1(model.predict(test_df_preproc))

# Compute ensemble prediction using the best weights
ensemble_prediction = np.average([base_model_predictions[model_name] * best_weights[model_name] for model_name in base_models.keys()], axis=0)

submission_df = pd.DataFrame({'id': test_df['id'], 'Rings': ensemble_prediction})

"""

"\n\nbase_model_predictions = {}\nfor model_name, model in trained_models.items():\n    base_model_predictions[model_name] = np.expm1(model.predict(test_df_preproc))\n\n# Compute ensemble prediction using the best weights\nensemble_prediction = np.average([base_model_predictions[model_name] * best_weights[model_name] for model_name in base_models.keys()], axis=0)\n\nsubmission_df = pd.DataFrame({'id': test_df['id'], 'Rings': ensemble_prediction})\n\n"

In [13]:
"""
model=ExtraTreesRegressor(max_depth=20, max_features=None, min_samples_split=14,
                    n_estimators=272, random_state=42)

model.fit(X_train,y_train)

y_pred=model.predict(X_val)

print(rmsle(np.expm1(y_val), np.expm1(y_pred)))

y_pred_test = model.predict(test_df_preproc)
y_pred_test = np.expm1(y_pred_test)


submission_df = pd.DataFrame({'id': test_df['id'], 'Rings': y_pred_test})
"""

"\nmodel=ExtraTreesRegressor(max_depth=20, max_features=None, min_samples_split=14,\n                    n_estimators=272, random_state=42)\n\nmodel.fit(X_train,y_train)\n\ny_pred=model.predict(X_val)\n\nprint(rmsle(np.expm1(y_val), np.expm1(y_pred)))\n\ny_pred_test = model.predict(test_df_preproc)\ny_pred_test = np.expm1(y_pred_test)\n\n\nsubmission_df = pd.DataFrame({'id': test_df['id'], 'Rings': y_pred_test})\n"

Extra Trees with tuning and scaling: 0.15479149738104986

In [14]:
"""

model=RandomForestRegressor(max_depth=19, max_features='log2', min_samples_leaf=2,
                      min_samples_split=8, n_estimators=770, random_state=42)

model.fit(X_train,y_train)

y_pred=model.predict(X_val)

print(rmsle(np.expm1(y_val), np.expm1(y_pred)))

y_pred_test = model.predict(test_df_preproc)
y_pred_test = np.expm1(y_pred_test)


submission_df = pd.DataFrame({'id': test_df['id'], 'Rings': y_pred_test})

"""

"\n\nmodel=RandomForestRegressor(max_depth=19, max_features='log2', min_samples_leaf=2,\n                      min_samples_split=8, n_estimators=770, random_state=42)\n\nmodel.fit(X_train,y_train)\n\ny_pred=model.predict(X_val)\n\nprint(rmsle(np.expm1(y_val), np.expm1(y_pred)))\n\ny_pred_test = model.predict(test_df_preproc)\ny_pred_test = np.expm1(y_pred_test)\n\n\nsubmission_df = pd.DataFrame({'id': test_df['id'], 'Rings': y_pred_test})\n\n"

RandomForest with tuning and scaling:
0.15479149738104986

In [15]:
"""
model=MLPRegressor(alpha=5.0753793080466134e-05, 
                   hidden_layer_sizes=(68, 18, 56),
                     learning_rate_init=0.0032188937689512057)

model.fit(X_train,y_train)

y_pred=model.predict(X_val)

print(rmsle(np.expm1(y_val), np.expm1(y_pred)))

y_pred_test = model.predict(test_df_preproc)
y_pred_test = np.expm1(y_pred_test)


submission_df = pd.DataFrame({'id': test_df['id'], 'Rings': y_pred_test})
"""

"\nmodel=MLPRegressor(alpha=5.0753793080466134e-05, \n                   hidden_layer_sizes=(68, 18, 56),\n                     learning_rate_init=0.0032188937689512057)\n\nmodel.fit(X_train,y_train)\n\ny_pred=model.predict(X_val)\n\nprint(rmsle(np.expm1(y_val), np.expm1(y_pred)))\n\ny_pred_test = model.predict(test_df_preproc)\ny_pred_test = np.expm1(y_pred_test)\n\n\nsubmission_df = pd.DataFrame({'id': test_df['id'], 'Rings': y_pred_test})\n"

MLP with tuning: 0.15479149738104986


In [16]:

# Single tuned HistGradientBoostingRegressor, trained on the log1p target.
# random_state is pinned: per the scikit-learn documentation it controls the
# binning subsample and the internal early-stopping split, so leaving it unset
# lets the validation score drift from one run to the next.
model = HistGradientBoostingRegressor(
    learning_rate=0.06418641358088736,
    max_depth=9,
    max_iter=854,
    max_leaf_nodes=39,
    min_samples_leaf=7,
    random_state=42,
)
model.fit(X_train, y_train)

# Validation error, measured after mapping both sides back from the log1p scale.
y_pred = model.predict(X_val)
print(rmsle(np.expm1(y_val), np.expm1(y_pred)))

# Test-set predictions, mapped back from the log1p scale.
y_pred_test = np.expm1(model.predict(test_df_preproc))

submission_df = pd.DataFrame({'id': test_df['id'], 'Rings': y_pred_test})



0.15178314886210703


Hist grad - RMSLE NO scaling 0.15230436920990859

Hist grad - RMSLE with scaling 0.15169981299264404

In [17]:
# Permutation importance of the fitted model on the validation split
# (permutation_importance is imported in the notebook's import cell).
perm_importance = permutation_importance(
    model, X_val, y_val, n_repeats=10, random_state=42
)

# Mean importance per feature over the repeats.
feature_importances = perm_importance.importances_mean

# Feature positions ordered from the largest mean importance to the smallest.
feature_indices_ranked = feature_importances.argsort()[::-1]



In [18]:
# Report each feature with its mean permutation importance, highest first.
feature_names = X_val.columns
print("Feature Ranking:")
for index in feature_indices_ranked:
    print(f"Feature {feature_names[index]}: Importance = {feature_importances[index]}")

Feature Ranking:
Feature Shell weight: Importance = 0.8000579508361503
Feature Whole weight.1: Importance = 0.5035295190992046
Feature Whole weight: Importance = 0.49520882284502593
Feature Height: Importance = 0.10329770521098096
Feature Whole weight.2: Importance = 0.09265166661427723
Feature Length: Importance = 0.026633028536960045
Feature Sex_I: Importance = 0.01747877559198673
Feature Diameter: Importance = 0.014005741189255804
Feature Sex_F: Importance = 0.00022863688305336806
Feature Sex_M: Importance = 4.495014492098548e-05


In [19]:
def preproc2(df, drop_cols=('Sex_M', 'Sex_F', 'Diameter')):
    """Return a copy of ``df`` without the columns listed in ``drop_cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``drop_cols``.
    drop_cols : str or iterable of str, optional
        Column name(s) to remove. Defaults to ``Sex_M``, ``Sex_F`` and
        ``Diameter``, the set this function originally removed.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If any requested column is missing from ``df``.
    """
    # A lone string is treated as one column name, not as a sequence of characters.
    columns = [drop_cols] if isinstance(drop_cols, str) else list(drop_cols)
    return df.drop(columns=columns)

In [20]:

# Apply the reduced feature set to both matrices, then repeat the seeded 80/20 split.
X_2 = preproc2(X)
test_df_preproc_2 = preproc2(test_df_preproc)

X_train2, X_val2, y_train2, y_val2 = train_test_split(
    X_2, target, test_size=0.2, random_state=42
)

In [21]:

# Stacking ensemble on the reduced feature set.
# StackingRegressor and LinearRegression come from the notebook's import cell.

# Tuned hyperparameters for each base model. Every estimator carries the same
# fixed random_state so that rerunning the cell gives the same result.
top_params = {
    'HistGradientBoostingRegressor': {'learning_rate': 0.06418641358088736, 'max_depth': 9,
                                      'max_iter': 854, 'max_leaf_nodes': 39,
                                      'min_samples_leaf': 7, 'random_state': 42},
    'MLPRegressor': {'hidden_layer_sizes': (68, 18, 56), 'activation': 'relu',
                     'alpha': 5.0753793080466134e-05,
                     'learning_rate_init': 0.0032188937689512057, 'random_state': 42},
    'RandomForestRegressor': {'max_depth': 19, 'max_features': 'log2', 'min_samples_split': 8,
                              'min_samples_leaf': 2, 'n_estimators': 770, 'random_state': 42},
    'ExtraTreesRegressor': {'n_estimators': 272, 'max_features': None, 'min_samples_split': 14,
                            'max_depth': 20, 'random_state': 42},
}

# Define the base models with the tuned parameters
base_models = [
    ('HistGradientBoostingRegressor', HistGradientBoostingRegressor(**top_params['HistGradientBoostingRegressor'])),
    ('MLPRegressor', MLPRegressor(**top_params['MLPRegressor'])),
    ('RandomForestRegressor', RandomForestRegressor(**top_params['RandomForestRegressor'])),
    ('ExtraTreesRegressor', ExtraTreesRegressor(**top_params['ExtraTreesRegressor'])),
]

# Create the stacking regressor ensemble with a linear regression meta-model
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=LinearRegression(),
    n_jobs=-1,  # n_jobs=-1 to use all available CPU cores
)

# Train the stacking regressor ensemble
stacking_regressor.fit(X_train2, y_train2)

# Make predictions on the validation set
stacking_predictions = stacking_regressor.predict(X_val2)

# Compute RMSLE for the ensemble model after mapping back from the log1p scale
stacking_rmsle = rmsle(np.expm1(y_val2), np.expm1(stacking_predictions))
print("Stacking RMSLE:", stacking_rmsle)

# Make predictions on the test set
stack_predictions_test = stacking_regressor.predict(test_df_preproc_2)

submission_df = pd.DataFrame({'id': test_df['id'], 'Rings': np.expm1(stack_predictions_test)})


Stacking RMSLE: 0.1514023209661915


In [22]:
# Preview only: the re-indexed frame is displayed, not assigned, so submission_df is unchanged.
submission_df.set_index(keys='id')

Unnamed: 0_level_0,Rings
id,Unnamed: 1_level_1
90615,9.876003
90616,9.752721
90617,10.048343
90618,10.436299
90619,7.442727
...,...
151021,6.257160
151022,9.090482
151023,11.440596
151024,12.569963


In [23]:
# Write the submission frame to CSV for the contest, leaving out the row index.
submission_df.to_csv(path_or_buf='submission.csv', index=False)