In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, FunctionTransformer
import xgboost as xgb

train_df = pd.read_csv('data/train.csv')

In [54]:
label_feature = 'Danceability'
numerical_features = ['Energy', 'Loudness', 'Speechiness', 'Acousticness',
                      'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Duration_ms', 'Stream',
                      'Views', 'Likes', 'Comments']

categorical_features = ['Album_type', 'Key', 'Licensed', 'official_video']

string_features = [
    'Track', 'Artist', 'Composer', 'Album', 'Title', 'Channel', 'Description'
]

# Define used features

In [55]:
# label
label_feature = 'Danceability'

# features
dont_transform_features = ['Energy', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence']
normal_transform_features = ['Tempo']
power_transform_features = ['Loudness', 'Duration_ms', 'Stream', 'Views', 'Likes', 'Comments']
categorical_features = ['Album_type', 'Key', 'Licensed', 'official_video']


features_columns = dont_transform_features + normal_transform_features + power_transform_features + categorical_features
label_column = label_feature
train_df = train_df[features_columns + [label_column]]

# preprocess features

In [56]:
# Transform features and label
normal_transformer = StandardScaler()
skewed_transformer = PowerTransformer()
min_max_transformer = MinMaxScaler()

In [57]:
# Transform features and label
normal_transformer = StandardScaler()
power_transformer = PowerTransformer()
label_transformer = MinMaxScaler()

# transform features & label
train_df[normal_transform_features] = normal_transformer.fit_transform(train_df[normal_transform_features])
train_df[power_transform_features] = power_transformer.fit_transform(train_df[power_transform_features])
train_df[label_column] = label_transformer.fit_transform(train_df[[label_column]])

# One-hot encode categorical features
train_df = pd.get_dummies(train_df, columns=categorical_features)

In [58]:
X, Y = train_df.drop(columns=[label_column]), train_df[label_column]

# Define the parameter grid for grid search
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

def custom_scorer(y_true, y_pred):
    y_pred = label_transformer.inverse_transform([y_pred])
    y_pred = np.clip(np.round(y_pred), 0, 9)
    y_true = label_transformer.inverse_transform([y_true])
    print(mean_absolute_error(y_true, y_pred))
    return -mean_absolute_error(y_true, y_pred)

# Create an instance of the HistGradientBoostingRegressor
model = xgb.XGBRegressor()

# Perform grid search with cross-validation
scoring = make_scorer(custom_scorer)
grid_search = GridSearchCV(model, param_grid, scoring=scoring, cv=5)
grid_search.fit(X, Y)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

# Print the best parameters and score
print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Create a new instance of HistGradientBoostingRegressor with the best parameters
model = HistGradientBoostingRegressor(**best_params)

# Fit the best HistGradientBoostingRegressor on the entire dataset
model = model.fit(X, Y)

1.6895748398369248
1.7355853232382061
1.7018054746651137
1.7507280139778685
1.8025626092020968
1.6860803727431566
1.7402446126965638
1.7067559697146186
1.7623762376237624
1.8066394874781595
1.6624927198602213
1.6878276062900408
1.6569598136284216
1.717239370995923
1.7609202096680256
1.662201514269074
1.6942341292952825
1.672976121141526
1.730343622597554
1.7550960978450787
1.6482236458940012
1.678800232964473
1.6575422248107163
1.7041351193942924
1.7376237623762376
1.6645311589982528
1.6887012230634828
1.6566686080372743
1.7076295864880606
1.745486313337216
1.6243447874199184
1.6645311589982528
1.6327897495631916
1.692195690157251
1.720442632498544
1.6435643564356435
1.6706464764123472
1.63075131042516
1.6901572510192195
1.7417006406523006
1.648806057076296
1.6694816540477577
1.6368666278392545
1.6750145602795574
1.7457775189283635
1.6421083284799067
1.684041933605125


In [None]:
test_df = pd.read_csv('data/test.csv')
ids = test_df['id']
test_df = test_df[features_columns]
test_df = pd.get_dummies(test_df, columns=categorical_features)

# Transform the test data
test_df[normal_transform_features] = normal_transformer.transform(test_df[normal_transform_features])
test_df[power_transform_features] = power_transformer.transform(test_df[power_transform_features])

# Make predictions on the test set
test_predictions = model.predict(test_df)
test_predictions = label_transformer.inverse_transform([test_predictions])
test_predictions = np.clip(np.round(test_predictions), 0, 9)
test_predictions = test_predictions.astype(int).reshape(-1)

# Prepare the submission dataframe
submission_df = pd.DataFrame({'id': ids, 'Danceability': test_predictions})

# Save the submission to a CSV file
submission_df.to_csv('submission.csv', index=False)