In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
import numpy as np
from scipy.stats import uniform, randint
from skopt import BayesSearchCV
from skopt.space import Real, Integer

file_path = '/Users/zmy/Documents/A Round Ent/cleaned_data_allpop100k.csv'
data = pd.read_csv(file_path)

In [15]:
features = [
    '# Shows',
    'Avg. Tickets Sold',
    'Avg. Event Capacity',
    'Avg. Capacity Sold',
    'Ticket Price Min',
    'Ticket Price Max',
    'Ticket Price Avg. USD',
    'sp followers',
    'sp popularity',
    'yt View Count',
    'yt Subscriber Count',
    'yt Video Count',
    'Month',
    'is_holiday',
    'day_of_week_1',
    'day_of_week_2',
    'day_of_week_3',
    'day_of_week_4',
    'day_of_week_5',
    'day_of_week_6'
]
target = 'Avg. Gross USD'


In [16]:
data_selected = data[['Event Date'] + features + [target]].dropna()
data_selected['Event Date'] = pd.to_datetime(data_selected['Event Date'])

In [17]:
numeric_features = [
    '# Shows',
    'Avg. Tickets Sold',
    'Avg. Event Capacity',
    'Avg. Capacity Sold',
    'Ticket Price Min',
    'Ticket Price Max',
    'Ticket Price Avg. USD',
    'sp followers',
    'sp popularity',
    'yt View Count',
    'yt Subscriber Count',
    'yt Video Count',
]

for column in numeric_features:
    data_selected[column] = data_selected[column].astype(str).str.replace(',', '').str.replace('%', '').astype(float)


In [18]:
data_selected['is_holiday'] = data_selected['is_holiday'].astype(int)

In [19]:
# Create interaction terms more efficiently
interaction_terms = pd.DataFrame(index=data_selected.index)
for feature1 in numeric_features:
    for feature2 in numeric_features:
        if feature1 != feature2:
            interaction_terms[f'{feature1} * {feature2}'] = data_selected[feature1] * data_selected[feature2]

data_selected_interactions = pd.concat([data_selected, interaction_terms], axis=1)

  interaction_terms[f'{feature1} * {feature2}'] = data_selected[feature1] * data_selected[feature2]
  interaction_terms[f'{feature1} * {feature2}'] = data_selected[feature1] * data_selected[feature2]
  interaction_terms[f'{feature1} * {feature2}'] = data_selected[feature1] * data_selected[feature2]
  interaction_terms[f'{feature1} * {feature2}'] = data_selected[feature1] * data_selected[feature2]
  interaction_terms[f'{feature1} * {feature2}'] = data_selected[feature1] * data_selected[feature2]
  interaction_terms[f'{feature1} * {feature2}'] = data_selected[feature1] * data_selected[feature2]
  interaction_terms[f'{feature1} * {feature2}'] = data_selected[feature1] * data_selected[feature2]
  interaction_terms[f'{feature1} * {feature2}'] = data_selected[feature1] * data_selected[feature2]
  interaction_terms[f'{feature1} * {feature2}'] = data_selected[feature1] * data_selected[feature2]
  interaction_terms[f'{feature1} * {feature2}'] = data_selected[feature1] * data_selected[feature2]


In [20]:
# Calculate the difference between consecutive rows
data_selected_diff = data_selected_interactions.diff().dropna()

# Ensure all feature names are strings
data_selected_diff.columns = data_selected_diff.columns.astype(str)

In [21]:
# Define the feature matrix X and the target vector y
X = data_selected_diff[features]
y = data_selected_diff[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_sample, _, y_train_sample, _ = train_test_split(X_train, y_train, test_size=0.8, random_state=42)

In [22]:
# Create a pipeline with PolynomialFeatures, StandardScaler, and GradientBoostingRegressor
pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('scaler', StandardScaler()),
    ('gb', GradientBoostingRegressor(random_state=42))
])

In [23]:
# Define hyperparameters to try for Gradient Boosting
param_dist = {
    'gb__learning_rate': uniform(0.01, 0.3),
    'gb__max_depth': randint(3, 10),
    'gb__n_estimators': randint(100, 500),
    'gb__subsample': uniform(0.6, 0.4),
    'gb__min_samples_split': randint(2, 10),
    'gb__min_samples_leaf': randint(1, 4)
}

In [24]:
# Perform randomized search to find the best parameters
random_search = RandomizedSearchCV(pipeline, param_dist, n_iter=20, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)
random_search.fit(X_train_sample, y_train_sample)

print(f"Best parameters: {random_search.best_params_}")
print(f"Best cross-validation score: {-random_search.best_score_}")

Best parameters: {'gb__learning_rate': 0.05680559213273095, 'gb__max_depth': 5, 'gb__min_samples_leaf': 3, 'gb__min_samples_split': 4, 'gb__n_estimators': 187, 'gb__subsample': 0.7334834444556088}
Best cross-validation score: 1612128017.6303527


In [25]:
# Use the best model for predictions
best_gradient_boosting = random_search.best_estimator_

In [26]:
# Evaluate the best model
y_train_pred = best_gradient_boosting.predict(X_train)
y_test_pred = best_gradient_boosting.predict(X_test)
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
print(f"Training MSE: {train_mse}")
print(f"Testing MSE: {test_mse}")
print(f"Training RMSE: {train_rmse}")
print(f"Testing RMSE: {test_rmse}")
print(f"Training R²: {train_r2}")
print(f"Testing R²: {test_r2}")


Training MSE: 1203110550.3520772
Testing MSE: 1642487723.1109018
Training RMSE: 34685.884021487436
Testing RMSE: 40527.616795352056
Training R²: 0.9475540142386237
Testing R²: 0.9316882773728393


In [27]:
# Generate polynomial features for the full dataset
data_poly = best_gradient_boosting.named_steps['poly'].transform(data_selected_diff[features])
data_poly_scaled = best_gradient_boosting.named_steps['scaler'].transform(data_poly)

In [28]:
# Predict using the best Gradient Boosting model
predicted_gross_revenue = best_gradient_boosting.named_steps['gb'].predict(data_poly_scaled)

print("Last 5 Predicted Avg. Gross USD values:")
print(predicted_gross_revenue[-5:])

Last 5 Predicted Avg. Gross USD values:
[ 49266.35818514  54307.57418685 -56474.53408315 258575.94320344
  76265.2298096 ]


In [29]:
import joblib

# Fit the pipeline on the training data
pipeline.fit(X_train_sample, y_train_sample)

# Save the model, polynomial features, and scaler
joblib.dump(pipeline, 'best_gradient_boosting_pipeline.pkl')
print("Pipeline, including model, polynomial features, and scaler, saved.")


Pipeline, including model, polynomial features, and scaler, saved.


## Model without Ticket Sold, without Capacity Sold

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import uniform, randint

file_path = '/Users/zmy/Documents/A Round Ent/cleaned_data_allpop100k.csv'
data = pd.read_csv(file_path)

# Define the features and target, excluding 'Avg. Capacity Sold' and 'Avg. Tickets Sold'
features = [
    '# Shows',
    'Avg. Event Capacity',
    'Ticket Price Min',
    'Ticket Price Max',
    'Ticket Price Avg. USD',
    'sp followers',
    'sp popularity',
    'yt View Count',
    'yt Subscriber Count',
    'yt Video Count',
    'Month',
    'is_holiday',
    'day_of_week_1',
    'day_of_week_2',
    'day_of_week_3',
    'day_of_week_4',
    'day_of_week_5',
    'day_of_week_6'
]
target = 'Avg. Gross USD'

# Select and preprocess the data
data_selected = data[['Event Date'] + features + [target]].dropna()
data_selected['Event Date'] = pd.to_datetime(data_selected['Event Date'])

# Convert numeric features to float
numeric_features = [
    '# Shows',
    'Avg. Event Capacity',
    'Ticket Price Min',
    'Ticket Price Max',
    'Ticket Price Avg. USD',
    'sp followers',
    'sp popularity',
    'yt View Count',
    'yt Subscriber Count',
    'yt Video Count',
]

for column in numeric_features:
    data_selected[column] = data_selected[column].astype(str).str.replace(',', '').str.replace('%', '').astype(float)

data_selected['is_holiday'] = data_selected['is_holiday'].astype(int)

# Create interaction terms more efficiently
interaction_terms = pd.DataFrame(index=data_selected.index)
for feature1 in numeric_features:
    for feature2 in numeric_features:
        if feature1 != feature2:
            interaction_terms[f'{feature1} * {feature2}'] = data_selected[feature1] * data_selected[feature2]

data_selected_interactions = pd.concat([data_selected, interaction_terms], axis=1)

# Calculate the difference between consecutive rows
data_selected_diff = data_selected_interactions.diff().dropna()

# Ensure all feature names are strings
data_selected_diff.columns = data_selected_diff.columns.astype(str)

# Define the feature matrix X and the target vector y
X = data_selected_diff[features]
y = data_selected_diff[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_sample, _, y_train_sample, _ = train_test_split(X_train, y_train, test_size=0.8, random_state=42)

# Create a pipeline with PolynomialFeatures, StandardScaler, and GradientBoostingRegressor
pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('scaler', StandardScaler()),
    ('gb', GradientBoostingRegressor(random_state=42))
])

# Define hyperparameters to try for Gradient Boosting
param_dist = {
    'gb__learning_rate': uniform(0.01, 0.3),
    'gb__max_depth': randint(3, 10),
    'gb__n_estimators': randint(100, 500),
    'gb__subsample': uniform(0.6, 0.4),
    'gb__min_samples_split': randint(2, 10),
    'gb__min_samples_leaf': randint(1, 4)
}

# Perform randomized search to find the best parameters
random_search = RandomizedSearchCV(pipeline, param_dist, n_iter=20, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)
random_search.fit(X_train_sample, y_train_sample)

print(f"Best parameters: {random_search.best_params_}")
print(f"Best cross-validation score: {-random_search.best_score_}")

# Use the best model for predictions
best_gradient_boosting = random_search.best_estimator_

# Evaluate the best model
y_train_pred = best_gradient_boosting.predict(X_train)
y_test_pred = best_gradient_boosting.predict(X_test)
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
print(f"Training MSE: {train_mse}")
print(f"Testing MSE: {test_mse}")
print(f"Training RMSE: {train_rmse}")
print(f"Testing RMSE: {test_rmse}")
print(f"Training R²: {train_r2}")
print(f"Testing R²: {test_r2}")

# Generate polynomial features for the full dataset
data_poly = best_gradient_boosting.named_steps['poly'].transform(data_selected_diff[features])
data_poly_scaled = best_gradient_boosting.named_steps['scaler'].transform(data_poly)

# Predict using the best Gradient Boosting model
predicted_gross_revenue = best_gradient_boosting.named_steps['gb'].predict(data_poly_scaled)

print("Last 5 Predicted Avg. Gross USD values:")
print(predicted_gross_revenue[-5:])


Best parameters: {'gb__learning_rate': 0.0646708263364187, 'gb__max_depth': 6, 'gb__min_samples_leaf': 2, 'gb__min_samples_split': 7, 'gb__n_estimators': 489, 'gb__subsample': 0.6831766651472755}
Best cross-validation score: 3502790826.356483
Training MSE: 3111302561.9198465
Testing MSE: 3976623167.681465
Training RMSE: 55779.05128199875
Testing RMSE: 63060.47230778933
Training R²: 0.864372122899238
Testing R²: 0.8346106488340216
Last 5 Predicted Avg. Gross USD values:
[ 31785.07966772 116700.86896071 -54148.25914986 219715.34865751
  65314.76003746]


IU Concert Avg. Revenue & Gross Revenue

In [31]:
import numpy as np

# IU's concert information
iu_concerts = {
    '# Shows': [4],
    'Avg. Event Capacity': [18890],
    'Ticket Price Min': [59],
    'Ticket Price Max': [219.5],
    'Ticket Price Avg. USD': [139.25],
    'sp followers': [5675418],
    'sp popularity': [0],  # Assuming this needs to be provided
    'yt View Count': [2992142616],
    'yt Subscriber Count': [9720000],
    'yt Video Count': [295],
    'Month': [7],  # Assuming these concerts are in July
    'is_holiday': [0],  # Assuming it's not a holiday
    'day_of_week_1': [1],  # Assuming the concert is on a Monday (one-hot encoded)
    'day_of_week_2': [1],  # Assuming the concert is on a Tuesday (one-hot encoded)
    'day_of_week_3': [0],  # Assuming the concert is not on a Wednesday
    'day_of_week_4': [1],  # Assuming the concert is on a Thursday (one-hot encoded)
    'day_of_week_5': [1],  # Assuming the concert is on a Friday (one-hot encoded)
    'day_of_week_6': [0],  # Assuming the concert is not on a Saturday
}

# Create a DataFrame
iu_df = pd.DataFrame(iu_concerts)

# Apply polynomial features and scaling
iu_poly = best_gradient_boosting.named_steps['poly'].transform(iu_df)
iu_poly_scaled = best_gradient_boosting.named_steps['scaler'].transform(iu_poly)

# Predict the sales revenue
iu_predicted_revenue = best_gradient_boosting.named_steps['gb'].predict(iu_poly_scaled)

print("Predicted Avg. Gross USD for IU's concerts:")
print(iu_predicted_revenue[0])

# Predicted Avg. Gross USD for IU's concerts
avg_gross_usd = iu_predicted_revenue[0]

# Total revenue calculation
number_of_shows = iu_concerts['# Shows'][0]
total_revenue = avg_gross_usd * number_of_shows

print(f"Predicted Avg. Gross USD per concert: {avg_gross_usd}")
print(f"Total Predicted Revenue for {number_of_shows} concerts: {total_revenue}")

Predicted Avg. Gross USD for IU's concerts:
457574.1079572082
Predicted Avg. Gross USD per concert: 457574.1079572082
Total Predicted Revenue for 4 concerts: 1830296.4318288327


Model Performance

In [32]:
import numpy as np

# Evaluate the best model
y_train_pred = best_gradient_boosting.predict(X_train)
y_test_pred = best_gradient_boosting.predict(X_test)

# Calculate MSE and RMSE for training and testing sets
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

# Calculate R² scores
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"Training MSE: {train_mse}")
print(f"Testing MSE: {test_mse}")
print(f"Training RMSE: {train_rmse}")
print(f"Testing RMSE: {test_rmse}")
print(f"Training R²: {train_r2}")
print(f"Testing R²: {test_r2}")


Training MSE: 3111302561.9198465
Testing MSE: 3976623167.681465
Training RMSE: 55779.05128199875
Testing RMSE: 63060.47230778933
Training R²: 0.864372122899238
Testing R²: 0.8346106488340216


## Model with Tickets Sold & Capacity
Assuming popularity is 80
Avg ticket price is 200

In [33]:
import pandas as pd
from datetime import datetime
import joblib

# Load the entire pipeline
loaded_pipeline = joblib.load('best_gradient_boosting_pipeline.pkl')

# Concert details
concerts = [
    {"date": "2024-07-22", "day_of_week": "Monday", "capacity": 20356, "city": "Washington, D.C."},
    {"date": "2024-07-25", "day_of_week": "Thursday", "capacity": 18500, "city": "Rosemont, IL"},
    {"date": "2024-07-30", "day_of_week": "Tuesday", "capacity": 19200, "city": "Oakland, CA"},
    {"date": "2024-08-02", "day_of_week": "Friday", "capacity": 17505, "city": "Inglewood, CA"}
]

# Features
avg_event_capacity = 18890
ticket_price_min = 59
ticket_price_max = 219.5
ticket_price_avg_usd = 200
yt_view_count = 2992142616
yt_subscriber_count = 9720000
yt_video_count = 295
sp_followers = 5675418
sp_popularity = 80  # Assuming popularity score
num_shows = 4

# Prepare data for prediction
data_for_prediction = pd.DataFrame(concerts)
data_for_prediction['Avg. Event Capacity'] = avg_event_capacity
data_for_prediction['Avg. Capacity Sold'] = data_for_prediction['capacity']  # Assuming full capacity sold
data_for_prediction['Avg. Tickets Sold'] = data_for_prediction['Avg. Capacity Sold']
data_for_prediction['Ticket Price Min'] = ticket_price_min
data_for_prediction['Ticket Price Max'] = ticket_price_max
data_for_prediction['Ticket Price Avg. USD'] = ticket_price_avg_usd
data_for_prediction['yt View Count'] = yt_view_count
data_for_prediction['yt Subscriber Count'] = yt_subscriber_count
data_for_prediction['yt Video Count'] = yt_video_count
data_for_prediction['sp followers'] = sp_followers
data_for_prediction['sp popularity'] = sp_popularity
data_for_prediction['# Shows'] = num_shows
data_for_prediction['Month'] = pd.to_datetime(data_for_prediction['date']).dt.month
data_for_prediction['is_holiday'] = 0  # Assuming no holidays for simplicity

# One-hot encode days of the week
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
for day in days_of_week:
    data_for_prediction[f'day_of_week_{days_of_week.index(day) + 1}'] = (data_for_prediction['day_of_week'] == day).astype(int)

# Drop unnecessary columns
data_for_prediction = data_for_prediction.drop(columns=['date', 'day_of_week', 'city'])

# Ensure feature order matches the model
features = [
    '# Shows',
    'Avg. Tickets Sold',
    'Avg. Event Capacity',
    'Avg. Capacity Sold',
    'Ticket Price Min',
    'Ticket Price Max',
    'Ticket Price Avg. USD',
    'sp followers',
    'sp popularity',
    'yt View Count',
    'yt Subscriber Count',
    'yt Video Count',
    'Month',
    'is_holiday',
    'day_of_week_1',
    'day_of_week_2',
    'day_of_week_3',
    'day_of_week_4',
    'day_of_week_5',
    'day_of_week_6'
]
data_for_prediction = data_for_prediction[features]

# Use the loaded pipeline to make predictions
predicted_gross_revenue = loaded_pipeline.predict(data_for_prediction)

# Display results
result_df = pd.DataFrame(concerts)
result_df['Predicted Avg. Gross USD'] = predicted_gross_revenue
print(result_df[['date', 'city', 'Predicted Avg. Gross USD']])

# Calculate the sum of the predicted gross revenue for the four concerts
total_predicted_revenue = predicted_gross_revenue.sum()
print(f"Total Predicted Gross Revenue for four concerts: ${total_predicted_revenue:,.2f}")


         date              city  Predicted Avg. Gross USD
0  2024-07-22  Washington, D.C.             525084.846262
1  2024-07-25      Rosemont, IL             525084.846262
2  2024-07-30       Oakland, CA             524004.713580
3  2024-08-02     Inglewood, CA             525084.846262
Total Predicted Gross Revenue for four concerts: $2,099,259.25


In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Predictions for the training and testing sets
y_train_pred = loaded_pipeline.predict(X_train)
y_test_pred = loaded_pipeline.predict(X_test)

# Calculate MSE and RMSE for training and testing sets
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

# Calculate R² scores
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"Training MSE: {train_mse}")
print(f"Testing MSE: {test_mse}")
print(f"Training RMSE: {train_rmse}")
print(f"Testing RMSE: {test_rmse}")
print(f"Training R²: {train_r2}")
print(f"Testing R²: {test_r2}")


Training MSE: 1452242194.0099704
Testing MSE: 1767547211.2215025
Training RMSE: 38108.29560620588
Testing RMSE: 42042.20749700832
Training R²: 0.9366938695643322
Testing R²: 0.9264870031450324
