In [13]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
from urllib.parse import quote_plus
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('default')
sns.set_palette("husl")

# Load database configuration
with open('db.json', 'r') as f:
    db_config = json.load(f)

print("Database configuration loaded:")
print(f"Host: {db_config['host']}")
print(f"User: {db_config['user']}")
print(f"Database: {db_config['database']}")
print("Password: [HIDDEN]")

# Build SQLAlchemy connection string
user = db_config['user']
password = db_config['password']
host = db_config['host']
database = db_config['database']

# URL-encode the password to handle special characters
encoded_password = quote_plus(password)
connection_string = f"mysql+mysqlconnector://{user}:{encoded_password}@{host}/{database}"

# Create SQLAlchemy engine
engine = create_engine(connection_string)

# Test the connection
try:
    with engine.connect() as test_conn:
        print("✓ Database connection successful!")
except Exception as e:
    print(f"❌ Database connection failed: {e}")
    raise

Database configuration loaded:
Host: 192.168.68.74
User: root
Database: concert
Password: [HIDDEN]
✓ Database connection successful!


In [14]:
# Query to get all Toronto shows data
toronto_query = """
SELECT 
    id,
    artist,
    venue,
    event_name,
    event_date,
    city,
    section,
    row_name,
    seat,
    priceWithFees,
    price,
    faceValue,
    ticketClassName,
    event_type,
    performer_type,
    performer,
    updated_date
FROM CONCERT_SEATS 
WHERE city = 'Toronto'
ORDER BY event_date, artist, priceWithFees
"""

print("Querying Toronto shows data...")
toronto_shows = pd.read_sql(toronto_query, engine)

print(f"✓ Loaded {len(toronto_shows):,} Toronto concert records")
print(f"Date range: {toronto_shows['event_date'].min()} to {toronto_shows['event_date'].max()}")
print(f"Unique artists: {toronto_shows['artist'].nunique()}")
print(f"Unique venues: {toronto_shows['venue'].nunique()}")
print(f"Unique events: {toronto_shows['event_name'].nunique()}")

# Display basic info about the dataset
print("\n=== DATASET OVERVIEW ===")
print(toronto_shows.info())

print("\n=== SAMPLE DATA ===")
print(toronto_shows.head())

Querying Toronto shows data...
✓ Loaded 37,013 Toronto concert records
Date range: 2025-02-20 to 2026-03-20
Unique artists: 314
Unique venues: 76
Unique events: 530

=== DATASET OVERVIEW ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37013 entries, 0 to 37012
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   id               37013 non-null  float64       
 1   artist           37013 non-null  object        
 2   venue            37013 non-null  object        
 3   event_name       37013 non-null  object        
 4   event_date       37013 non-null  object        
 5   city             37013 non-null  object        
 6   section          37013 non-null  object        
 7   row_name         32094 non-null  object        
 8   seat             24679 non-null  object        
 9   priceWithFees    26492 non-null  float64       
 10  price            37013 non-null  float64       
 11  faceVa

In [15]:
# Import additional libraries for machine learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

print("\n=== FACE VALUE IMPUTATION WITH RANDOM FOREST ===")
# Filter for Rogers Stadium only
# Filter for Coldplay shows at Rogers Stadium only
toronto_shows = toronto_shows[toronto_shows['venue'].str.contains('Rogers Stadium', case=False, na=False)].copy()

# Remove outliers in faceValue using IQR
q1 = toronto_shows['faceValue'].quantile(0.25)
q3 = toronto_shows['faceValue'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
toronto_shows = toronto_shows[(toronto_shows['faceValue'].isna()) | ((toronto_shows['faceValue'] >= lower_bound) & (toronto_shows['faceValue'] <= upper_bound))]
# Analyze missing face values
missing_facevalue = toronto_shows['faceValue'].isna() | (toronto_shows['faceValue'] == 0)
print(f"Rows with missing/zero faceValue: {missing_facevalue.sum():,} ({missing_facevalue.mean()*100:.1f}%)")
print(f"Rows with valid faceValue: {(~missing_facevalue).sum():,}")
print(f"Total rows in toronto_shows: {len(toronto_shows):,}")
if missing_facevalue.sum() > 0:
    # Prepare data for Random Forest
    # Create a copy to work with
    df_ml = toronto_shows.copy()
    
    # Encode categorical variables
    le_section = LabelEncoder()
    le_venue = LabelEncoder()
    le_artist = LabelEncoder()
    
    # Handle missing values in categorical columns
    df_ml['section'] = df_ml['section'].fillna('Unknown')
    df_ml['row_name'] = df_ml['row_name'].fillna('Unknown')
    df_ml['venue'] = df_ml['venue'].fillna('Unknown')
    df_ml['artist'] = df_ml['artist'].fillna('Unknown')
    
    # Encode categorical variables
    df_ml['section_encoded'] = le_section.fit_transform(df_ml['section'])
    df_ml['venue_encoded'] = le_venue.fit_transform(df_ml['venue'])
    df_ml['artist_encoded'] = le_artist.fit_transform(df_ml['artist'])
    
    # Extract row number from row_name (if it contains numbers)
    df_ml['row_number'] = df_ml['row_name'].str.extract('(\d+)').astype(float)
    df_ml['row_number'] = df_ml['row_number'].fillna(0)
    
    # Prepare features for the model
    feature_columns = ['section_encoded', 'venue_encoded', 'artist_encoded', 'row_number']
    
    # Add price information if available (can help predict face value)
    if 'priceWithFees' in df_ml.columns:
        # Convert priceWithFees to numeric first, handling string values
        df_ml['priceWithFees'] = pd.to_numeric(df_ml['priceWithFees'], errors='coerce')
        df_ml['priceWithFees_clean'] = df_ml['priceWithFees'].fillna(df_ml['priceWithFees'].median())
        feature_columns.append('priceWithFees_clean')
    
    # Split data into training (valid face values) and prediction (missing face values)
    train_data = df_ml[~missing_facevalue].copy()
    predict_data = df_ml[missing_facevalue].copy()
    
    if len(train_data) > 0:
        # Prepare training data
        X_train = train_data[feature_columns]
        y_train = train_data['faceValue']
        
        # Split for validation
        X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )
        
        # Train Random Forest model
        print("Training Random Forest model...")
        rf_model = RandomForestRegressor(
            n_estimators=100,
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42,
            n_jobs=-1
        )
        
        rf_model.fit(X_train_split, y_train_split)
        
        # Validate model performance
        y_val_pred = rf_model.predict(X_val_split)
        mae = mean_absolute_error(y_val_split, y_val_pred)
        r2 = r2_score(y_val_split, y_val_pred)
        
        print(f"Model validation results:")
        print(f"  Mean Absolute Error: ${mae:.2f}")
        print(f"  R² Score: {r2:.3f}")
        
        # Feature importance
        feature_importance = pd.DataFrame({
            'feature': feature_columns,
            'importance': rf_model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print(f"\nFeature importance:")
        for _, row in feature_importance.iterrows():
            print(f"  {row['feature']}: {row['importance']:.3f}")
        
        # Predict missing face values
        if len(predict_data) > 0:
            print(f"\nPredicting {len(predict_data):,} missing face values...")
            X_predict = predict_data[feature_columns]
            predicted_facevalues = rf_model.predict(X_predict)
            
            # Ensure predictions are positive
            predicted_facevalues = np.maximum(predicted_facevalues, 1.0)
            
            # Update the original dataframe
            toronto_shows.loc[missing_facevalue, 'faceValue'] = predicted_facevalues
            
            print(f"✓ Imputed face values for {len(predict_data):,} rows")
            print(f"  Predicted faceValue range: ${predicted_facevalues.min():.2f} - ${predicted_facevalues.max():.2f}")
            print(f"  Mean predicted faceValue: ${predicted_facevalues.mean():.2f}")
            
            # Add a flag to track imputed values
            toronto_shows['faceValue_imputed'] = missing_facevalue
            
        else:
            print("No missing face values to predict.")
    else:
        print("❌ No valid face values available for training the model.")
else:
    print("✓ No missing face values found - no imputation needed.")

# Final summary
final_missing = toronto_shows['faceValue'].isna() | (toronto_shows['faceValue'] == 0)
print(f"\nFinal status:")
print(f"Remaining missing/zero faceValues: {final_missing.sum():,}")
print(f"Valid faceValues: {(~final_missing).sum():,}")


=== FACE VALUE IMPUTATION WITH RANDOM FOREST ===
Rows with missing/zero faceValue: 2,058 (49.3%)
Rows with valid faceValue: 2,113
Total rows in toronto_shows: 4,171
Training Random Forest model...
Model validation results:Model validation results:
  Mean Absolute Error: $26.90
  R² Score: 0.599

Feature importance:
  section_encoded: 0.514
  row_number: 0.198
  priceWithFees_clean: 0.169
  artist_encoded: 0.104
  venue_encoded: 0.015

Predicting 2,058 missing face values...
✓ Imputed face values for 2,058 rows
  Predicted faceValue range: $58.60 - $369.36
  Mean predicted faceValue: $177.32

Final status:
Remaining missing/zero faceValues: 0
Valid faceValues: 4,171

  Mean Absolute Error: $26.90
  R² Score: 0.599

Feature importance:
  section_encoded: 0.514
  row_number: 0.198
  priceWithFees_clean: 0.169
  artist_encoded: 0.104
  venue_encoded: 0.015

Predicting 2,058 missing face values...
✓ Imputed face values for 2,058 rows
  Predicted faceValue range: $58.60 - $369.36
  Mean pre

In [16]:
# Filter data for Coldplay events
coldplay_shows = toronto_shows[
    (toronto_shows['artist'].str.lower() == 'coldplay') &
    (toronto_shows['venue'].str.contains('Rogers Stadium', case=False, na=False))
]
# Remove rows where priceWithFees is above Q3 + 1.5*IQR
q1 = coldplay_shows['priceWithFees'].quantile(0.25)
q3 = coldplay_shows['priceWithFees'].quantile(0.75)
iqr = q3 - q1
upper_bound = q3 + 1.5 * iqr
coldplay_shows = coldplay_shows[coldplay_shows['priceWithFees'] <= upper_bound]
print(f"Number of Coldplay records: {len(coldplay_shows)}")
print(f"Date range: {coldplay_shows['event_date'].min()} to {coldplay_shows['event_date'].max()}")
print(f"Venues: {coldplay_shows['venue'].unique()}")

# Basic statistics on ticket prices
print("\n=== Coldplay Ticket Price Statistics ===")
print(coldplay_shows[['priceWithFees', 'price', 'faceValue']].describe())

# Calculate averages after removing outliers, grouped by section
section_stats = coldplay_shows.groupby('section').agg(
    count=('faceValue', 'count'),
    avg_facevalue=('faceValue', 'mean'),
    avg_price=('price', 'mean'),
    avg_price_facevalue_ratio=('price', lambda x: (x / coldplay_shows.loc[x.index, 'faceValue']).mean())
).reset_index()


# Remove sections with count less than 10
section_stats = section_stats[section_stats['count'] >= 10].reset_index(drop=True)
print("\n=== Averages after removing outliers (grouped by section) ===")
print(section_stats)


Number of Coldplay records: 735
Date range: 2025-07-07 to 2025-07-12
Venues: ['Rogers Stadium Toronto (Concert Venue)'
 'Rogers Stadium (Concert Venue, Canada)']

=== Coldplay Ticket Price Statistics ===
       priceWithFees        price   faceValue
count     735.000000   735.000000  735.000000
mean      817.023129   684.945578  159.570082
std       176.983460   141.220921   83.489313
min       442.000000   442.000000   48.370000
25%       703.500000   585.000000   93.130000
50%       798.000000   650.000000  135.477060
75%       900.000000   750.000000  209.936805
max      1315.000000  1136.000000  425.000000

=== Averages after removing outliers (grouped by section) ===
   section  count  avg_facevalue   avg_price  avg_price_facevalue_ratio
0      101     65     192.563562  614.415385                   4.412598
1      102     36     225.747980  693.055556                   3.284033
2      103     10     219.131785  866.800000                   4.909593
3      105     15     192.72323

In [20]:
# Filter data for Blackpink events at Rogers Stadium
blackpink_shows = toronto_shows[
    (toronto_shows['artist'].str.lower().str.contains('black pink')) &
    (toronto_shows['venue'].str.contains('Rogers Stadium', case=False, na=False))
].copy()

# Remove rows where priceWithFees is above Q3 + 1.5*IQR (outlier removal)
if not blackpink_shows.empty:
    q1 = blackpink_shows['priceWithFees'].quantile(0.25)
    q3 = blackpink_shows['priceWithFees'].quantile(0.75)
    iqr = q3 - q1
    upper_bound = q3 + 1.5 * iqr
    blackpink_shows = blackpink_shows[blackpink_shows['priceWithFees'] <= upper_bound]

    print(f"Number of Blackpink records: {len(blackpink_shows)}")
    print(f"Date range: {blackpink_shows['event_date'].min()} to {blackpink_shows['event_date'].max()}")
    print(f"Venues: {blackpink_shows['venue'].unique()}")

    # Basic statistics on ticket prices
    print("\n=== Blackpink Ticket Price Statistics ===")
    print(blackpink_shows[['priceWithFees', 'price', 'faceValue']].describe())

    # Calculate averages after removing outliers, grouped by section
    blackpink_section_stats = blackpink_shows.groupby('section').agg(
        count=('faceValue', 'count'),
        avg_facevalue=('faceValue', 'mean'),
        avg_price=('price', 'mean'),
        avg_price_facevalue_ratio=('price', lambda x: (x / blackpink_shows.loc[x.index, 'faceValue']).mean())
    ).reset_index()
    # Order by avg_price_facevalue_ratio descending
    blackpink_section_stats = blackpink_section_stats.sort_values('avg_price_facevalue_ratio', ascending=False)
    # Remove sections with count less than 10
    blackpink_section_stats = blackpink_section_stats[blackpink_section_stats['count'] >= 10].reset_index(drop=True)
    print("\n=== Averages after removing outliers (grouped by section) ===")
    print(blackpink_section_stats)
else:
    print("No Blackpink records found at Rogers Stadium.")

Number of Blackpink records: 1047
Date range: 2025-07-22 to 2025-07-23
Venues: ['Rogers Stadium Toronto (Concert Venue)'
 'Rogers Stadium (Concert Venue, Canada)']

=== Blackpink Ticket Price Statistics ===
       priceWithFees        price    faceValue
count    1047.000000  1047.000000  1047.000000
mean      215.310411   229.805158   177.338592
std       174.452764   141.239035    69.062888
min         0.000000    84.000000    60.000000
25%         0.000000   144.000000   117.884809
50%       219.000000   190.000000   170.953074
75%       336.000000   295.000000   222.143655
max       866.000000  2358.000000   399.000000

=== Averages after removing outliers (grouped by section) ===
    section  count  avg_facevalue   avg_price  avg_price_facevalue_ratio
0   119 WCR     14     154.575815  252.214286                   1.643726
1       112     27     109.276147  174.296296                   1.625143
2       123     27     225.879695  352.518519                   1.573299
3       111    

In [21]:
# Group by artist and section for all artists at Rogers Stadium
artists_at_rogers = toronto_shows[
    toronto_shows['venue'].str.contains('Rogers Stadium', case=False, na=False)
].copy()

# Remove outliers in priceWithFees for each artist separately
def remove_outliers_by_artist(df):
    cleaned = []
    for artist, group in df.groupby('artist'):
        q1 = group['priceWithFees'].quantile(0.25)
        q3 = group['priceWithFees'].quantile(0.75)
        iqr = q3 - q1
        upper_bound = q3 + 1.5 * iqr
        cleaned_group = group[group['priceWithFees'] <= upper_bound]
        cleaned.append(cleaned_group)
    return pd.concat(cleaned, axis=0)

artists_at_rogers_clean = remove_outliers_by_artist(artists_at_rogers)

# Calculate statistics grouped by artist and section
artist_section_stats = artists_at_rogers_clean.groupby(['artist', 'section']).agg(
    count=('faceValue', 'count'),
    avg_facevalue=('faceValue', 'mean'),
    avg_price=('price', 'mean'),
    avg_price_facevalue_ratio=('price', lambda x: (x / artists_at_rogers_clean.loc[x.index, 'faceValue']).mean())
).reset_index()

# Remove groups with count less than 10 for robustness
artist_section_stats = artist_section_stats[artist_section_stats['count'] >= 10].reset_index(drop=True)

print("=== Averages after removing outliers (grouped by artist and section) ===")
print(artist_section_stats)

=== Averages after removing outliers (grouped by artist and section) ===
        artist section  count  avg_facevalue   avg_price  \
0   Black Pink     101     10     227.940743  191.800000   
1   Black Pink     105     13     222.934817  324.615385   
2   Black Pink     106     12     168.534016  130.500000   
3   Black Pink     107     11     148.801816  122.636364   
4   Black Pink     116     12     129.299535  106.500000   
..         ...     ...    ...            ...         ...   
83  stray kids     115     11      81.049724  123.818182   
84  stray kids     116     32     102.253817  121.406250   
85  stray kids     117     24      98.023472  119.208333   
86  stray kids     118     24     102.794859  109.208333   
87  stray kids     119     11     133.008381  162.454545   

    avg_price_facevalue_ratio  
0                    0.861866  
1                    1.417805  
2                    0.798851  
3                    0.830897  
4                    0.875496  
..            

In [23]:
# Calculate MEDIAN statistics grouped by section for all Toronto shows (not just Coldplay)
# Calculate AVERAGE statistics grouped by artist for all Toronto shows
artist_stats_avg_all = toronto_shows.groupby('artist').agg(
    count=('faceValue', 'count'),
    avg_facevalue=('faceValue', 'mean'),
    avg_price=('price', 'mean'),
    avg_priceWithFees=('priceWithFees', 'mean'),
    avg_price_facevalue_ratio=('price', lambda x: (x / toronto_shows.loc[x.index, 'faceValue']).mean())
).reset_index()

# Remove artists with count less than 10
artist_stats_avg_all = artist_stats_avg_all[artist_stats_avg_all['count'] >= 10].reset_index(drop=True)

# Sort by count descending
artist_stats_avg_all = artist_stats_avg_all.sort_values('avg_price_facevalue_ratio', ascending=False)

print("\n=== MEDIAN statistics for all Toronto shows (grouped by section) ===")
print(artist_stats_avg_all)


=== MEDIAN statistics for all Toronto shows (grouped by section) ===
               artist  count  avg_facevalue   avg_price  avg_priceWithFees  \
6            ColdPlay   1117     168.618489  724.850492         894.946429   
4   Chris avant garde     43     178.125279  571.953488         667.432432   
7            Coldplay    144     194.180288  396.743056                NaN   
9               Oasis    716     180.204397  360.780726         477.562061   
11         stray kids    411     126.601807  187.406326         235.621019   
10         black pink   1011     174.620342  274.230465         355.287841   
1   Chris Avant Garde    194     159.210313  173.051546           0.000000   
8          Gigi leung     22     152.525942  148.863636         147.500000   
5       Chris lorenzo     29     125.803541  121.275862                NaN   
2       Chris Lorenzo     11     116.026417  110.909091                NaN   
0          Black Pink    404     168.431877  152.405941           0.0000