In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split

In [10]:
file_path = 'data/PlusSupplementaryFeatures_041924.csv'
data = pd.read_csv(file_path)

In [11]:
features = [
    '# Shows',
    'Avg. Event Capacity',
    'Ticket Price Min',
    'Ticket Price Max',
    # 'Ticket Price Avg. USD',
    'yt View Count',
    'yt Subscriber Count',
    'yt Video Count',
    'sp followers',
    'Asian_Percentage_City'
]
target = 'Avg. Gross USD'

In [12]:
data_selected = data[['Event Date', 'day_of_week'] + features + [target]].dropna()
data_selected['Event Date'] = pd.to_datetime(data_selected['Event Date'])
data_selected = data_selected.sort_values(by='Event Date')
columns_with_commas = ['Avg. Event Capacity', 'Ticket Price Max']

for column in columns_with_commas:
    data_selected[column] = data_selected[column].str.replace(',', '').astype(float)

data_selected = pd.get_dummies(data_selected, columns=['day_of_week'], drop_first=True)

encoded_columns = list(pd.get_dummies(data['day_of_week'], drop_first=True).columns)
for col in encoded_columns:
    if col not in data_selected.columns:
        data_selected[col] = 0

interaction_terms = pd.DataFrame(index=data_selected.index)
for feature1 in features:
    for feature2 in features:
        if feature1 != feature2:
            interaction_terms[f'{feature1} * {feature2}'] = data_selected[feature1] * data_selected[feature2]

data_selected = pd.concat([data_selected, interaction_terms], axis=1)

data_diff = data_selected.diff().dropna()

print("Shape of data_diff after diff and dropna:", data_diff.shape)

encoded_features = list(data_diff.columns)
encoded_features.remove('Event Date')
encoded_features.remove(target)

data_diff.columns = data_diff.columns.astype(str)
encoded_features = [str(feature) for feature in encoded_features]

X = data_diff[encoded_features]
y = data_diff[target]

poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

print("Shape of X_poly:", X_poly.shape)

scaler = StandardScaler()
X_poly_scaled = scaler.fit_transform(X_poly)

X_train, X_test, y_train, y_test = train_test_split(X_poly_scaled, y, test_size=0.2, random_state=42)

Shape of data_diff after diff and dropna: (212, 95)
Shape of X_poly: (212, 4464)


In [13]:
gbr = GradientBoostingRegressor(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=3)
gbr.fit(X_train, y_train)

In [26]:
def predict(capacity, price_min, price_max, view_count, subscriber_count, video_count, sp_followers, asian_percentage_city, day_of_week):
    new_concert = pd.DataFrame({
        '# Shows': [1]*len(capacity),
        'Avg. Event Capacity': capacity,
        'Ticket Price Min': price_min,
        'Ticket Price Max': price_max,
        'yt View Count': view_count,
        'yt Subscriber Count': subscriber_count,
        'yt Video Count': video_count,
        'sp followers': sp_followers,
        'Asian_Percentage_City': asian_percentage_city,
        'day_of_week_1': np.array(day_of_week)==1,
        'day_of_week_2': np.array(day_of_week)==2,
        'day_of_week_3': np.array(day_of_week)==3,
        'day_of_week_4': np.array(day_of_week)==4,
        'day_of_week_5': np.array(day_of_week)==5,
        'day_of_week_6': np.array(day_of_week)==6
    })

    for col in encoded_columns:
        if col not in new_concert.columns:
            new_concert[col] = 0

    interaction_terms = pd.DataFrame(index=new_concert.index)
    for feature1 in features:
        for feature2 in features:
            if feature1 != feature2:
                interaction_terms[f'{feature1} * {feature2}'] = new_concert[feature1] * new_concert[feature2]

    new_concert = pd.concat([new_concert, interaction_terms], axis=1)

    new_concert.columns = new_concert.columns.astype(str)

    new_concert_poly = poly.transform(new_concert)

    new_concert_poly_scaled = scaler.transform(new_concert_poly)

    predicted_gross_revenue_gbr = gbr.predict(new_concert_poly_scaled)

    return predicted_gross_revenue_gbr.tolist()

In [52]:
predict([5600], [45], [455], [261039702], [1480000], [26], [1052438], [14], [1])

[456084.2774576204]

## Tableau

In [29]:
import tabpy_client

In [30]:
connection = tabpy_client.Client ( 'http://localhost:9004/')
connection.deploy('gbr', predict,'Return the prediction of gbr', override = True)

In [16]:
# connection.remove('gbr')