In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold

from sklearn.utils import shuffle

import lightgbm as lgb
from lightgbm import LGBMRegressor

In [23]:
df = pd.read_csv("data/combined_df.csv")
df = df[
    (df['Year'] >= 2020) &
    (~(df['Headliner'].str.contains('"', na=False))) & 
    (df['Genre'] != 'Family Entertainment') &
    (df['Ticket Price Min USD'] > 0) &
    (df['Ticket Price Min USD'] < df['Ticket Price Max USD'])
]
data = df.dropna()

data.head(3)

  df = pd.read_csv("data/combined_df.csv")


Unnamed: 0,Event Date,Headliner,sp artist_name,sp artist_genre,sp followers,sp popularity,yt name,yt Channel ID,yt Title,yt Description,...,Genre,Avg. Tickets Sold,Avg. Gross USD,Avg. Event Capacity,Avg. Capacity Sold,Ticket Price Min USD,Ticket Price Max USD,Ticket Price Avg. USD,Month,day_of_week
0,2024-09-18,Creed,Creed,"['alternative metal', 'nu metal', 'post-grunge...",3527070.0,74.0,Creed,UCP-tFf_VMQzhyeKMONL1KvQ,Creed,Subscribe to Creed's Official Youtube Channel ...,...,Pop / Rock,20295.0,1228939.0,20295.0,100%,39.5,225.0,60.55,9,2
1,2024-09-14,Creed,Creed,"['alternative metal', 'nu metal', 'post-grunge...",3527070.0,74.0,Creed,UCP-tFf_VMQzhyeKMONL1KvQ,Creed,Subscribe to Creed's Official Youtube Channel ...,...,Pop / Rock,16308.0,1374174.0,16308.0,100%,39.5,225.0,84.26,9,5
3,2024-09-13,Creed,Creed,"['alternative metal', 'nu metal', 'post-grunge...",3527070.0,74.0,Creed,UCP-tFf_VMQzhyeKMONL1KvQ,Creed,Subscribe to Creed's Official Youtube Channel ...,...,Pop / Rock,14995.0,1402969.0,14995.0,100%,39.5,225.0,93.56,9,4


In [24]:
top_20_markets = data['Market'].value_counts().iloc[:20].index
one_hot_encoded = pd.get_dummies(data['Market'])
one_hot_encoded = one_hot_encoded[top_20_markets]
data = data.join(one_hot_encoded)

In [25]:
data.head(2)

Unnamed: 0,Event Date,Headliner,sp artist_name,sp artist_genre,sp followers,sp popularity,yt name,yt Channel ID,yt Title,yt Description,...,San Francisco-Oakland-San Jose,Denver,Charlotte,Pittsburgh,Philadelphia,Grand Rapids-Kalamazoo-Battle Creek,Phoenix (Prescott),Madison,Detroit,Austin
0,2024-09-18,Creed,Creed,"['alternative metal', 'nu metal', 'post-grunge...",3527070.0,74.0,Creed,UCP-tFf_VMQzhyeKMONL1KvQ,Creed,Subscribe to Creed's Official Youtube Channel ...,...,False,False,False,False,False,False,False,False,False,False
1,2024-09-14,Creed,Creed,"['alternative metal', 'nu metal', 'post-grunge...",3527070.0,74.0,Creed,UCP-tFf_VMQzhyeKMONL1KvQ,Creed,Subscribe to Creed's Official Youtube Channel ...,...,False,False,False,False,False,False,False,False,False,False


In [26]:
# cleaned up data to avoid issues in training
label_encoder = LabelEncoder()
data['Encoded Headliner'] = label_encoder.fit_transform(data['Headliner'])

## Model 1

In [6]:
features = ['Encoded Headliner',
            'yt Subscriber Count',
            'monthly_listeners',
            'Avg. Event Capacity',
            'Ticket Price Min USD', 
            'Ticket Price Max USD',
            'Month',
            'day_of_week',
            'Year'] + list(top_20_markets)

In [7]:
X = data[features]
y = data['Avg. Gross USD']
X = X.dropna()
y = y[X.index]

# cleaned up data to avoid issues in training
X.columns = X.columns.str.replace(r"[^a-zA-Z0-9_]", "_", regex=True)

In [8]:
lgbm_model = LGBMRegressor(objective='regression', metric='rmse', verbose=-1)

rkf = RepeatedKFold(n_splits=5, n_repeats=30, random_state=42)

cv_rmse = np.sqrt(-cross_val_score(lgbm_model, X, y, cv=rkf, scoring='neg_mean_squared_error'))
cv_mae = -cross_val_score(lgbm_model, X, y, cv=rkf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(lgbm_model, X, y, cv=rkf, scoring='r2')

print("LightGBM Regressor Cross-Validation Performance:")
print("Cross-Validation RMSE: Mean =", cv_rmse.mean())
print("Cross-Validation MAE: Mean =", cv_mae.mean())
print("Cross-Validation R^2: Mean =", cv_r2.mean())

LightGBM Regressor Cross-Validation Performance:
Cross-Validation RMSE: Mean = 204299.33596982103
Cross-Validation MAE: Mean = 70644.39771715697
Cross-Validation R^2: Mean = 0.8927395625390822


## Model 2 (Log-transformed the target variable to reduce skewness: y = np.log1p(y))

In [27]:
features = ['Encoded Headliner',
            'yt Subscriber Count',
            'monthly_listeners',
            'Avg. Event Capacity',
            'Ticket Price Min USD', 
            'Ticket Price Max USD',
            'Month',
            'day_of_week',
            'Year'] + list(top_20_markets)

In [28]:
X = data[features]
y = data['Avg. Gross USD']
X = X.dropna()
y = y[X.index]
y = np.log1p(y)

# cleaned up data to avoid issues in training
X.columns = X.columns.str.replace(r"[^a-zA-Z0-9_]", "_", regex=True)

In [29]:
# Define custom scoring functions
def rmse_log_transformed(y_true, y_pred):
    y_true_original = np.exp(y_true)
    y_pred_original = np.exp(y_pred)
    return np.sqrt(mean_squared_error(y_true_original, y_pred_original))

def mae_log_transformed(y_true, y_pred):
    y_true_original = np.exp(y_true)
    y_pred_original = np.exp(y_pred)
    return mean_absolute_error(y_true_original, y_pred_original)

def r2_log_transformed(y_true, y_pred):
    y_true_original = np.exp(y_true)
    y_pred_original = np.exp(y_pred)
    return r2_score(y_true_original, y_pred_original)

# Convert them into scorers
rmse_scorer = make_scorer(rmse_log_transformed, greater_is_better=False)  # Negate for minimization
mae_scorer = make_scorer(mae_log_transformed, greater_is_better=False)    # Negate for minimization
r2_scorer = make_scorer(r2_log_transformed)

In [31]:
lgbm_model = LGBMRegressor(objective='regression', metric='rmse', verbose=-1)

rkf = RepeatedKFold(n_splits=5, n_repeats=30, random_state=42)

cv_rmse = np.sqrt(-cross_val_score(lgbm_model, X, y, cv=rkf, scoring=rmse_scorer))
cv_mae = -cross_val_score(lgbm_model, X, y, cv=rkf, scoring=mae_scorer)
cv_r2 = cross_val_score(lgbm_model, X, y, cv=rkf, scoring=r2_scorer)

# Print results
print("LightGBM Regressor Cross-Validation Performance (Rescaled):")
print("Cross-Validation RMSE: Mean =", cv_rmse.mean())
print("Cross-Validation MAE: Mean =", cv_mae.mean())
print("Cross-Validation R^2: Mean =", cv_r2.mean())

LightGBM Regressor Cross-Validation Performance (Rescaled):
Cross-Validation RMSE: Mean = 426.52972451977837
Cross-Validation MAE: Mean = 65816.30402868996
Cross-Validation R^2: Mean = 0.9132205834109993
