In [2]:
import lightgbm as lgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv('/Users/zhangyuchuan/Downloads/allgenre_combined_df.csv')
filtered_data = df[
    (df['Year'] >= 2020) &
    (~(df['Headliner'].str.contains('"', na=False))) &
    (df['Genre'] != 'Family Entertainment') &
    (df['Ticket Price Min USD'] > 0) &
    (df['Ticket Price Min USD'] < df['Ticket Price Max USD'])
]
filtered_data_no_na = filtered_data.dropna()
_, test_data = train_test_split(filtered_data_no_na, test_size=0.3, random_state=42)
train_data = filtered_data.loc[~filtered_data.index.isin(test_data.index)]

# Define features and target
feature_columns = ['Avg. Event Capacity', 'Ticket Price Min USD', 'Ticket Price Max USD', 'headliner_monthly_listeners','sp followers', 'sp popularity','yt Subscriber Count', 'yt View Count','Total population','monthly_listeners']
X_train = train_data[feature_columns]
y_train = train_data['Avg. Gross USD']
X_test = test_data[feature_columns]
y_test = test_data['Avg. Gross USD']

# Initialize LightGBM Regressor
lgb_model = lgb.LGBMRegressor(
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)

# Cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation for RMSE
cv_rmse_scores_lgb = cross_val_score(
    lgb_model, X_train, y_train, cv=kf, scoring='neg_root_mean_squared_error'
)
mean_cv_rmse_lgb = -np.mean(cv_rmse_scores_lgb)
print("LightGBM Cross-validation Mean RMSE:", mean_cv_rmse_lgb)

# Perform cross-validation for R²
cv_r2_scores_lgb = cross_val_score(lgb_model, X_train, y_train, cv=kf, scoring='r2')
mean_cv_r2_lgb = np.mean(cv_r2_scores_lgb)
print("LightGBM Cross-validation Mean R²:", mean_cv_r2_lgb)

# Fit the model on training data
lgb_model.fit(X_train, y_train)

# Predict on testing data
y_pred_lgb = lgb_model.predict(X_test)

# Calculate performance metrics
rmse_lgb = mean_squared_error(y_test, y_pred_lgb, squared=False)
r2_lgb = r2_score(y_test, y_pred_lgb)
print(f"LightGBM Test RMSE: {rmse_lgb:.2f}")
print(f"LightGBM Test R²: {r2_lgb:.3f}")

  df = pd.read_csv('/Users/zhangyuchuan/Downloads/allgenre_combined_df.csv')


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001127 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2358
[LightGBM] [Info] Number of data points in the train set: 62945, number of used features: 10
[LightGBM] [Info] Start training from score 230815.511172
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001150 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2356
[LightGBM] [Info] Number of data points in the train set: 62945, number of used features: 10
[LightGBM] [Info] Start training from score 231140.350582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is

