### Load data and add any columns by preprocessing the dataframe or merging with others

In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

from sklearn.utils import shuffle

In [66]:
data = pd.read_csv('data/combined_df.csv')
data = data[
    (data['Year'] >= 2020) &
    (data['Headliner'].str.contains('"', na=False)) &
    (~data['Support'].isna())
  ]
data['Event Date'] = pd.to_datetime(data['Event Date'])
data['day_of_week'] = data['Event Date'].dt.day_of_week
data.head(3)

  data = pd.read_csv('data/combined_df.csv')


Unnamed: 0,Event Date,Headliner,sp artist_name,sp artist_genre,sp followers,sp popularity,yt name,yt Channel ID,yt Title,yt Description,...,Genre,Avg. Tickets Sold,Avg. Gross USD,Avg. Event Capacity,Avg. Capacity Sold,Ticket Price Min USD,Ticket Price Max USD,Ticket Price Avg. USD,Month,day_of_week
100,2024-08-31,"""Summer of '99 and Beyond Festival""",,,,,,,,,...,Multi-Genre,23332.0,1441577.0,23332.0,100%,39.5,159.5,61.79,8,5
117,2024-08-31,"""Reggae Fest Massive""",,,,,,,,,...,Reggae,12791.0,1418343.5,12791.0,100%,43.5,343.5,110.89,8,5
133,2024-08-30,"""DC Jazz Festival""",,,,,,,,,...,Multi-Genre,2394.0,252568.0,2500.0,95%,55.2,299.0,105.5,8,4


### Extract features

In [82]:
# check for missing values
print(f'Missing sp data: {sum(data["sp followers"].isna())} rows')
print(f'Missing population data: {sum(data["Total population"].isna())} rows')
print(f'Missing yt data: {sum(data["yt Subscriber Count"].isna())} rows')
print(f'Missing monthly listeners data: {sum(data["monthly_listeners"].isna())} rows')

Missing sp data: 2516 rows
Missing population data: 2516 rows
Missing yt data: 2523 rows
Missing monthly listeners data: 365 rows


In [68]:
top_20_markets = data['Market'].value_counts().iloc[:20].index
one_hot_encoded = pd.get_dummies(data['Market'])
one_hot_encoded = one_hot_encoded[top_20_markets]
data = data.join(one_hot_encoded)

In [69]:
features = ['Avg. Event Capacity',
            'Ticket Price Min USD', 
            'Ticket Price Max USD',
            'Year',
            'monthly_listeners',
            'Month',
            'day_of_week'] + list(top_20_markets)

In [70]:
X = data[features]
y = data['Avg. Gross USD']
X = X.dropna()
y = y[X.index]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 8)

### Linear Regression

In [71]:
from sklearn.linear_model import LinearRegression

In [72]:
linear_model = LinearRegression()
rkf = RepeatedKFold(n_splits=5, n_repeats=10)

cv_rmse = np.sqrt(-cross_val_score(linear_model, X_train, y_train, cv=rkf, scoring='neg_mean_squared_error'))
cv_mae = -cross_val_score(linear_model, X_train, y_train, cv=rkf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(linear_model, X_train, y_train, cv=rkf, scoring='r2')

print("Linear Regression Cross-Validation Performance:")
print("Cross-Validation RMSE: Mean =", cv_rmse.mean())
print("Cross-Validation MAE: Mean =", cv_mae.mean())
print("Cross-Validation R^2: Mean =", cv_r2.mean())

Linear Regression Cross-Validation Performance:
Cross-Validation RMSE: Mean = 986732.7589375266
Cross-Validation MAE: Mean = 280552.1757676258
Cross-Validation R^2: Mean = -0.8892955277493467


In [73]:
# test set
y_pred = linear_regression_model.predict(X_test)

rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Linear Regression Performance on Test Set:")
print("RMSE:", rmse)
print("MAE:", mae)
print("R^2:", r2)

Linear Regression Performance on Test Set:
RMSE: 810848.9341045264
MAE: 276614.95212524367
R^2: 0.6516370132661887


### Random Forest

In [74]:
from sklearn.ensemble import RandomForestRegressor

In [75]:
random_forest_model = RandomForestRegressor(n_estimators=100)
rkf = RepeatedKFold(n_splits=5, n_repeats=10)

cv_rmse = np.sqrt(-cross_val_score(random_forest_model, X_train, y_train, cv=rkf, scoring='neg_mean_squared_error'))
cv_mae = -cross_val_score(random_forest_model, X_train, y_train, cv=rkf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(random_forest_model, X_train, y_train, cv=rkf, scoring='r2')

print("Random Forest Regressor Cross-Validation Performance:")
print("Cross-Validation RMSE: Mean =", cv_rmse.mean())
print("Cross-Validation MAE: Mean =", cv_mae.mean())
print("Cross-Validation R^2: Mean =", cv_r2.mean())

Random Forest Regressor Cross-Validation Performance:
Cross-Validation RMSE: Mean = 526755.2413959436
Cross-Validation MAE: Mean = 141719.17209888218
Cross-Validation R^2: Mean = 0.6537308464525842


In [76]:
# test set
random_forest_model.fit(X_train, y_train)
y_pred = random_forest_model.predict(X_test)

rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Random Forest Regressor Performance on Test Set:")
print("RMSE:", rmse)
print("MAE:", mae)
print("R^2:", r2)

Random Forest Regressor Performance on Test Set:
RMSE: 770969.9098008629
MAE: 153341.1690536748
R^2: 0.6850606247271125


### XGBoost Regressor

In [79]:
import xgboost as xgb
from xgboost import XGBRegressor

In [80]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')
rkf = RepeatedKFold(n_splits=5, n_repeats=30)

cv_rmse = np.sqrt(-cross_val_score(xgb_model, X_train, y_train, cv=rkf, scoring='neg_mean_squared_error'))
cv_mae = -cross_val_score(xgb_model, X_train, y_train, cv=rkf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(xgb_model, X_train, y_train, cv=rkf, scoring='r2')

print("XGBoost Regressor Cross-Validation Performance:")
print("Cross-Validation RMSE: Mean =", cv_rmse.mean())
print("Cross-Validation MAE: Mean =", cv_mae.mean())
print("Cross-Validation R^2: Mean =", cv_r2.mean())

XGBoost Regressor Cross-Validation Performance:
Cross-Validation RMSE: Mean = 557217.8708558803
Cross-Validation MAE: Mean = 148579.3967075906
Cross-Validation R^2: Mean = 0.6261177818150814


In [81]:
# test set
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("XGBoost Regressor Performance on Test Set:")
print('Test RMSE:', rmse)
print('Test MAE:', mae)
print('Test R^2:', r2)

XGBoost Regressor Performance on Test Set:
Test RMSE: 898295.5376232901
Test MAE: 174407.35757934808
Test R^2: 0.5724463782410321


### For saving a model

In [6]:
import pickle

filename = 'model/example.pkl'

# Open the file to save the model
with open(filename, 'wb') as file:
    pickle.dump(xgb_model, file)