### Load data and add any columns by preprocessing the dataframe or merging with others

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.utils import shuffle

In [2]:
data = pd.read_csv('data/combined_df.csv')
data = data[
    (data['Year'] >= 2020) &
    (data['Headliner'].str.contains('"', na=False)) &
    (~data['Support'].isna()) &
    (data['Genre'] != 'Family Entertainment')
  ]
data['Event Date'] = pd.to_datetime(data['Event Date'])
data['day_of_week'] = data['Event Date'].dt.day_of_week
data.head(3)

  data = pd.read_csv('data/combined_df.csv')


Unnamed: 0,Event Date,Headliner,sp artist_name,sp artist_genre,sp followers,sp popularity,yt name,yt Channel ID,yt Title,yt Description,...,Genre,Avg. Tickets Sold,Avg. Gross USD,Avg. Event Capacity,Avg. Capacity Sold,Ticket Price Min USD,Ticket Price Max USD,Ticket Price Avg. USD,Month,day_of_week
100,2024-08-31,"""Summer of '99 and Beyond Festival""",,,,,,,,,...,Multi-Genre,23332.0,1441577.0,23332.0,100%,39.5,159.5,61.79,8,5
117,2024-08-31,"""Reggae Fest Massive""",,,,,,,,,...,Reggae,12791.0,1418343.5,12791.0,100%,43.5,343.5,110.89,8,5
133,2024-08-30,"""DC Jazz Festival""",,,,,,,,,...,Multi-Genre,2394.0,252568.0,2500.0,95%,55.2,299.0,105.5,8,4


### Extract features

In [3]:
# check for missing values
print(f'Missing sp data: {sum(data["sp followers"].isna())} rows')
print(f'Missing population data: {sum(data["Total population"].isna())} rows')
print(f'Missing yt data: {sum(data["yt Subscriber Count"].isna())} rows')
print(f'Missing monthly listeners data: {sum(data["monthly_listeners"].isna())} rows')

Missing sp data: 2258 rows
Missing population data: 2258 rows
Missing yt data: 2265 rows
Missing monthly listeners data: 213 rows


In [4]:
top_20_markets = data['Market'].value_counts().iloc[:20].index
one_hot_encoded = pd.get_dummies(data['Market'])
one_hot_encoded = one_hot_encoded[top_20_markets]
data = data.join(one_hot_encoded)

In [5]:
features = ['Avg. Event Capacity',
            'Ticket Price Min USD', 
            'Ticket Price Max USD',
            'Year',
            'monthly_listeners',
            'Month',
            'day_of_week'] + list(top_20_markets)

In [6]:
X = data[features]
y = data['Avg. Gross USD']
X = X.dropna()
y = y[X.index]
# no need to split train/test (since we don’t have that much rows) —> can just take the average of the metrics from cross_val_score
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 8)

### Linear Regression

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
linear_model = LinearRegression()
rkf = RepeatedKFold(n_splits=5, n_repeats=10)

cv_rmse = np.sqrt(-cross_val_score(linear_model, X, y, cv=rkf, scoring='neg_mean_squared_error'))
cv_mae = -cross_val_score(linear_model, X, y, cv=rkf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(linear_model, X, y, cv=rkf, scoring='r2')

print("Linear Regression Cross-Validation Performance:")
print("Cross-Validation RMSE: Mean =", cv_rmse.mean())
print("Cross-Validation MAE: Mean =", cv_mae.mean())
print("Cross-Validation R^2: Mean =", cv_r2.mean())

Linear Regression Cross-Validation Performance:
Cross-Validation RMSE: Mean = 795801.0984544507
Cross-Validation MAE: Mean = 254797.89878060442
Cross-Validation R^2: Mean = -1.1400147186132303


### Random Forest

In [9]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# default random forest model
random_forest_model = RandomForestRegressor(n_estimators=100)
rkf = RepeatedKFold(n_splits=5, n_repeats=10)

cv_rmse = np.sqrt(-cross_val_score(random_forest_model, X, y, cv=rkf, scoring='neg_mean_squared_error'))
cv_mae = -cross_val_score(random_forest_model, X, y, cv=rkf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(random_forest_model, X, y, cv=rkf, scoring='r2')

print("Random Forest Regressor Cross-Validation Performance:")
print("Cross-Validation RMSE: Mean =", cv_rmse.mean())
print("Cross-Validation MAE: Mean =", cv_mae.mean())
print("Cross-Validation R^2: Mean =", cv_r2.mean())

In [10]:
best_random_forest_model = RandomForestRegressor(n_estimators=100)
rkf = RepeatedKFold(n_splits=5, n_repeats=10)

In [11]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2', 1.0]
}

grid_search = GridSearchCV(
    estimator=best_random_forest_model,
    param_grid=param_grid,
    cv=5,  
    scoring='r2',
    n_jobs=-1
)
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validated R^2:", grid_search.best_score_)

Best parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best cross-validated R^2: 0.7838963373643677


In [12]:
best_random_forest_model = grid_search.best_estimator_

cv_rmse = np.sqrt(-cross_val_score(best_random_forest_model, X, y, cv=rkf, scoring='neg_mean_squared_error'))
cv_mae = -cross_val_score(best_random_forest_model, X, y, cv=rkf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(best_random_forest_model, X, y, cv=rkf, scoring='r2')

print("Tuned Random Forest Regressor Cross-Validation Performance:")
print("Cross-Validation RMSE: Mean =", cv_rmse.mean())
print("Cross-Validation MAE: Mean =", cv_mae.mean())
print("Cross-Validation R^2: Mean =", cv_r2.mean())

Tuned Random Forest Regressor Cross-Validation Performance:
Cross-Validation RMSE: Mean = 387911.75691995444
Cross-Validation MAE: Mean = 131839.5170897241
Cross-Validation R^2: Mean = 0.7980930315260997


In [13]:
print(best_random_forest_model.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 20, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 300, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


Random Forest Best parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

### XGBoost Regressor

In [14]:
import xgboost as xgb
from xgboost import XGBRegressor

In [None]:
# default xgboost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')
rkf = RepeatedKFold(n_splits=5, n_repeats=30)

cv_rmse = np.sqrt(-cross_val_score(xgb_model, X, y, cv=rkf, scoring='neg_mean_squared_error'))
cv_mae = -cross_val_score(xgb_model, X, y, cv=rkf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(xgb_model, X, y, cv=rkf, scoring='r2')

print("XGBoost Regressor Cross-Validation Performance:")
print("Cross-Validation RMSE: Mean =", cv_rmse.mean())
print("Cross-Validation MAE: Mean =", cv_mae.mean())
print("Cross-Validation R^2: Mean =", cv_r2.mean())

In [15]:
best_xgb_model = xgb.XGBRegressor(objective='reg:squarederror')
rkf = RepeatedKFold(n_splits=5, n_repeats=30)

In [16]:
param_grid = {
    'n_estimators': [50, 100, 200],  
    'learning_rate': [0.01, 0.1, 0.2],  
    'max_depth': [3, 5, 7],  
    'min_child_weight': [1, 3, 5],  
    'subsample': [0.8, 0.9, 1.0],  
    'colsample_bytree': [0.8, 0.9, 1.0],  
    'gamma': [0, 0.1, 0.3],  
    'reg_alpha': [0, 0.1, 0.5],  
    'reg_lambda': [1, 1.5, 2],
}

random_search = RandomizedSearchCV(estimator=best_xgb_model, param_distributions=param_grid, 
                                   n_iter=20, scoring='r2', 
                                   cv=rkf, n_jobs=-1, random_state=42, verbose=1)

random_search.fit(X, y)

print("Best Parameters:", random_search.best_params_)
print("Best Cross-validated R^2:", random_search.best_score_)

Fitting 150 folds for each of 20 candidates, totalling 3000 fits
Best Parameters: {'subsample': 0.9, 'reg_lambda': 1.5, 'reg_alpha': 0.1, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
Best Cross-validated R^2: 0.822994471473593


In [19]:
best_xgb_model = random_search.best_estimator_

cv_rmse = np.sqrt(-cross_val_score(best_xgb_model, X, y, cv=rkf, scoring='neg_mean_squared_error'))
cv_mae = -cross_val_score(best_xgb_model, X, y, cv=rkf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(best_xgb_model, X, y, cv=rkf, scoring='r2')

print("Tuned XGBoost Regressor Cross-Validation Performance:")
print("Cross-Validation RMSE: Mean =", cv_rmse.mean())
print("Cross-Validation MAE: Mean =", cv_mae.mean())
print("Cross-Validation R^2: Mean =", cv_r2.mean())

Tuned XGBoost Regressor Cross-Validation Performance:
Cross-Validation RMSE: Mean = 351398.5454409818
Cross-Validation MAE: Mean = 129679.32760693309
Cross-Validation R^2: Mean = 0.812226392694275


In [20]:
print(best_xgb_model.get_params())

{'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.8, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': 0, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.1, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 3, 'max_leaves': None, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 100, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': 0.1, 'reg_lambda': 1.5, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.9, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}


XGBoost Parameters: {'subsample': 0.9, 'reg_lambda': 1.5, 'reg_alpha': 0.1, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}

### For saving a model

In [None]:
import pickle

filename = 'model/example.pkl'

# Open the file to save the model
with open(filename, 'wb') as file:
    pickle.dump(xgb_model, file)