In [1]:
import pandas as pd
import json
import time
import numpy as np
from IPython.display import display
import ipywidgets as widgets
import holidays

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostRegressor, Pool

import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Load data.
# low_memory=False makes pandas infer every column's dtype from the whole file
# rather than chunk by chunk. NOTE(review): the saved output of this cell echoes
# the read_csv line, which looks like the tail of a mixed-dtype warning — confirm.
df = pd.read_csv('combined_df.csv', low_memory=False)

# Row filter for the modelling dataset (NaNs are kept at this stage so they
# remain available for training).
keep_row = (
    (df['Year'] >= 2020) &
    # Literal match on a double quote; rows with a missing Headliner are kept.
    (~(df['Headliner'].str.contains('"', na=False, regex=False))) &
    (df['Genre'] != 'Family Entertainment') &
    (df['Ticket Price Min USD'] > 0) &
    (df['Ticket Price Min USD'] < df['Ticket Price Max USD'])
)
filtered_data = df[keep_row]

# Rows with no missing value in ANY column; the test set is drawn only from these.
filtered_data_no_na = filtered_data.dropna()

# Hold out 30% of the NaN-free rows as the testing set.
_, test_data = train_test_split(filtered_data_no_na, test_size=0.3, random_state=42)

# Everything in the filtered data that was not held out becomes the training set.
# Rows are matched by index label, so this relies on the index being unique.
train_data = filtered_data.loc[~filtered_data.index.isin(test_data.index)]

# Guard against leakage: the two sets must be disjoint and together cover the
# filtered data exactly (this would fail if index labels were duplicated).
assert train_data.index.intersection(test_data.index).empty, "train/test rows overlap"
assert len(train_data) + len(test_data) == len(filtered_data), "split does not partition filtered_data"

# Check the results
print(f"Total filtered data size: {len(filtered_data)}")
print(f"Training set size (including NaNs): {len(train_data)}")
print(f"Testing set size (no NaNs): {len(test_data)}")

  df = pd.read_csv('combined_df.csv')


Total filtered data size: 79691
Training set size (including NaNs): 78682
Testing set size (no NaNs): 1009


In [5]:
# Predictor columns handed to the model; the target is the 'Avg. Gross USD' column.
feature_columns = [
    'Avg. Event Capacity',
    'Ticket Price Min USD',
    'Ticket Price Max USD',
    'headliner_monthly_listeners',
]
X_train = train_data.loc[:, feature_columns]
y_train = train_data.loc[:, 'Avg. Gross USD']

In [6]:
# Hyper-parameters for the CatBoost regressor, gathered in one place so they
# are easy to inspect and tweak.
catboost_params = {
    "iterations": 1000,
    "learning_rate": 0.1,
    "depth": 6,
    "loss_function": "RMSE",
    "verbose": 100,  # log training progress every 100 iterations
}
cat_model = CatBoostRegressor(**catboost_params)

In [7]:
# Define scoring metrics
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Computed explicitly as sqrt(MSE) instead of calling
    ``mean_squared_error(..., squared=False)``: the ``squared`` keyword was
    deprecated in scikit-learn 1.4 and removed in 1.6, so the explicit form
    works on both old and new releases.
    """
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


# greater_is_better=False makes the scorer return the NEGATED RMSE (scikit-learn
# always maximises scores), hence the sign flips when reporting below.
rmse_scorer = make_scorer(rmse, greater_is_better=False)  # RMSE
r2_scorer = make_scorer(r2_score)  # R²


# CROSS VALIDATION
# Set up cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Evaluate both metrics in a single pass: each fold is fitted once and scored
# with both scorers, instead of refitting the model separately per metric.
cv_results = cross_validate(
    cat_model,
    X_train,
    y_train,
    cv=kf,
    scoring={"rmse": rmse_scorer, "r2": r2_scorer},
)
cv_rmse_scores = cv_results["test_rmse"]
cv_r2_scores = cv_results["test_r2"]

# RMSE: negate each score back to a positive error, round, and convert to integer
formatted_rmse_scores = [int(round(-score)) for score in cv_rmse_scores]
mean_cv_rmse = int(round(-np.mean(cv_rmse_scores)))
print("Cross-validation RMSE scores:", formatted_rmse_scores)
print("Mean CV RMSE:", mean_cv_rmse)
# R²: cast to plain floats so the printed list stays readable under NumPy 2
formatted_r2_scores = [round(float(score), 3) for score in cv_r2_scores]
print("Cross-validation R² scores:", formatted_r2_scores)
print("Mean CV R²:", round(float(np.mean(cv_r2_scores)), 3))

# TEST ON TESTING SET
# Refit on the full training set, then predict the held-out testing data
X_test = test_data[feature_columns]
y_test = test_data['Avg. Gross USD']
cat_model.fit(X_train, y_train)
y_pred = cat_model.predict(X_test)
# Calculate R²
test_r2 = r2_score(y_test, y_pred)
print(f"Test R²: {test_r2:.3f}")
# Calculate RMSE
test_rmse = rmse(y_test, y_pred)
print(f"Test RMSE: {test_rmse:.2f}")

0:	learn: 670639.8448690	total: 160ms	remaining: 2m 39s
100:	learn: 190316.6770149	total: 523ms	remaining: 4.65s
200:	learn: 168956.7598232	total: 870ms	remaining: 3.46s
300:	learn: 157531.1152684	total: 1.24s	remaining: 2.87s
400:	learn: 148382.9731046	total: 1.62s	remaining: 2.42s
500:	learn: 141778.3319084	total: 1.99s	remaining: 1.98s
600:	learn: 136334.4866269	total: 2.38s	remaining: 1.58s
700:	learn: 132190.0415291	total: 2.79s	remaining: 1.19s
800:	learn: 128610.4554628	total: 3.18s	remaining: 790ms
900:	learn: 125356.9729310	total: 3.57s	remaining: 393ms
999:	learn: 122745.9786549	total: 3.95s	remaining: 0us
0:	learn: 675547.6400404	total: 4.41ms	remaining: 4.41s
100:	learn: 190503.8949515	total: 390ms	remaining: 3.47s
200:	learn: 170002.7043376	total: 784ms	remaining: 3.12s
300:	learn: 157097.0629819	total: 1.19s	remaining: 2.77s
400:	learn: 147562.8167576	total: 1.57s	remaining: 2.34s
500:	learn: 141023.4794401	total: 1.94s	remaining: 1.93s
600:	learn: 136071.2366901	total: 2