In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from tqdm import tqdm
import warnings

warnings.filterwarnings("ignore")

In [2]:
# Data Loading
BASE_DIR = '/kaggle/input/Cinema_Audience_Forecasting_challenge/'

booknow_visits = pd.read_csv(BASE_DIR + 'booknow_visits/booknow_visits.csv', parse_dates=["show_date"])
booknow_booking = pd.read_csv(BASE_DIR + 'booknow_booking/booknow_booking.csv', parse_dates=["show_datetime", "booking_datetime"])
cinePOS_booking = pd.read_csv(BASE_DIR + 'cinePOS_booking/cinePOS_booking.csv', parse_dates=["show_datetime", "booking_datetime"])
booknow_theaters = pd.read_csv(BASE_DIR + 'booknow_theaters/booknow_theaters.csv')
cinePOS_theaters = pd.read_csv(BASE_DIR + 'cinePOS_theaters/cinePOS_theaters.csv')
date_info = pd.read_csv(BASE_DIR + 'date_info/date_info.csv', parse_dates=["show_date"])
sample_submission = pd.read_csv(BASE_DIR + 'sample_submission/sample_submission.csv') 
movie_theater_id_relation = pd.read_csv(BASE_DIR + 'movie_theater_id_relation/movie_theater_id_relation.csv')

In [3]:
# Data Cleaning
def clean_data(booknow_visits, booknow_booking, cinepos_booking, booknow_theaters, cinepos_theaters):
    booknow_theaters_cleaned = booknow_theaters.copy()
    cinepos_theaters_cleaned = cinepos_theaters.copy()
    booknow_visits_cleaned = booknow_visits.copy()
    cinepos_booking_cleaned = cinepos_booking.copy()
    booknow_booking_cleaned = booknow_booking.copy()
    
    # Dropping Nan rows 
    booknow_theaters_cleaned.dropna(subset=['book_theater_id'], inplace=True) # 829 rows to 314 rows. Dropping 515 rows which has missing ids
    
    # Dropping redundant cols
    booknow_theaters_cleaned.drop(columns=['latitude', 'longitude'], inplace=True) # Latitude and longitude are approximate values and location is already captured in theater area col. 
    cinepos_theaters_cleaned.drop(columns=['latitude', "longitude"], inplace=True)  

    # Remove duplicates
    booknow_visits_cleaned = booknow_visits.drop_duplicates() # 10 duplicates
    booknow_booking_cleaned = booknow_booking.drop_duplicates() # 2042 duplicates
    cinepos_booking_cleaned = cinepos_booking.drop_duplicates() # 12541 duplicates

    return (
        booknow_visits_cleaned,
        booknow_booking_cleaned,
        cinepos_booking_cleaned,
        booknow_theaters_cleaned,
        cinepos_theaters_cleaned,
    )

booknow_visits_cleaned, booknow_booking_cleaned, cinePOS_booking_cleaned, booknow_theaters_cleaned, cinePOS_theaters_cleaned = clean_data(booknow_visits, booknow_booking, cinePOS_booking, booknow_theaters, cinePOS_theaters) 

In [4]:
# Preparing datasets
booknow_booking_cleaned['show_date'] = pd.to_datetime(booknow_booking_cleaned['show_datetime'])
bookings = booknow_booking_cleaned.groupby(['book_theater_id', 'show_date'])['tickets_booked'].sum().reset_index()
visits = booknow_visits_cleaned.groupby(['book_theater_id', 'show_date'])['audience_count'].max().reset_index()

In [5]:
# Capping outliers
for tid in tqdm(visits['book_theater_id'].unique(), desc="Capping"):
    mask = visits['book_theater_id'] == tid
    data = visits.loc[mask, 'audience_count']
    if len(data) > 10:
        lower = data.quantile(0.01)
        upper = data.quantile(0.99)
        visits.loc[mask, 'audience_count'] = data.clip(lower, upper)

visits = visits.sort_values(['book_theater_id', 'show_date']).reset_index(drop=True)

Capping: 100%|██████████| 826/826 [00:11<00:00, 69.68it/s]


In [6]:
# Preparing features
visits['dow'] = visits['show_date'].dt.dayofweek
visits['month'] = visits['show_date'].dt.month
visits['quarter'] = visits['show_date'].dt.quarter
visits['is_weekend'] = visits['dow'].isin([5, 6]).astype(int)
visits['is_month_start'] = visits['show_date'].dt.day <= 7
visits['is_month_end'] = visits['show_date'].dt.day >= 24

In [7]:
# Merge bookings data
visits = visits.merge(bookings, on=['book_theater_id', 'show_date'], how='left')
visits['tickets_booked'] = visits['tickets_booked'].fillna(0)

In [8]:
# Train validation split
cutoff_date = visits['show_date'].max() - pd.DateOffset(months=2)
train_df = visits[visits['show_date'] <= cutoff_date].copy()
val_df = visits[visits['show_date'] > cutoff_date].copy()

print(f"Train: {train_df['show_date'].min()} to {train_df['show_date'].max()} ({len(train_df):,} rows)")
print(f"Val:   {val_df['show_date'].min()} to {val_df['show_date'].max()} ({len(val_df):,} rows)")

Train: 2023-01-01 00:00:00 to 2023-12-28 00:00:00 (172,796 rows)
Val:   2023-12-29 00:00:00 to 2024-02-28 00:00:00 (41,075 rows)


In [9]:
# Preparing statistical features on train data
th_mean = train_df.groupby('book_theater_id')['audience_count'].mean().to_dict()
th_median = train_df.groupby('book_theater_id')['audience_count'].median().to_dict()
th_dow_mean = train_df.groupby(['book_theater_id', 'dow'])['audience_count'].mean().to_dict()
th_dow_median = train_df.groupby(['book_theater_id', 'dow'])['audience_count'].median().to_dict()
th_month_mean = train_df.groupby(['book_theater_id', 'month'])['audience_count'].mean().to_dict()
th_month_median = train_df.groupby(['book_theater_id', 'month'])['audience_count'].median().to_dict()
th_quarter_mean = train_df.groupby(['book_theater_id', 'quarter'])['audience_count'].mean().to_dict()
th_weekend_mean = train_df.groupby(['book_theater_id', 'is_weekend'])['audience_count'].mean().to_dict()
th_month_dow_mean = train_df.groupby(['book_theater_id', 'month', 'dow'])['audience_count'].mean().to_dict()
th_month_weekend_mean = train_df.groupby(['book_theater_id', 'month', 'is_weekend'])['audience_count'].mean().to_dict()
th_quarter_dow_mean = train_df.groupby(['book_theater_id', 'quarter', 'dow'])['audience_count'].mean().to_dict()
th_month_start_mean = train_df.groupby(['book_theater_id', 'is_month_start'])['audience_count'].mean().to_dict()

glob_mean = train_df['audience_count'].mean()
glob_dow_mean = train_df.groupby('dow')['audience_count'].mean().to_dict()
glob_month_mean = train_df.groupby('month')['audience_count'].mean().to_dict()
glob_month_dow_mean = train_df.groupby(['month', 'dow'])['audience_count'].mean().to_dict()
glob_quarter_mean = train_df.groupby('quarter')['audience_count'].mean().to_dict()
glob_quarter_dow_mean = train_df.groupby(['quarter', 'dow'])['audience_count'].mean().to_dict()

In [10]:
same_dow_history = {}

for tid in train_df['book_theater_id'].unique():
    th_data = train_df[train_df['book_theater_id'] == tid].copy()
    same_dow_history[tid] = {}
    
    for dow in range(7):
        dow_data = th_data[th_data['dow'] == dow].sort_values('show_date')
        
        if len(dow_data) >= 4:
            vals = dow_data.tail(4)['audience_count'].values
            weights = np.array([0.4, 0.3, 0.2, 0.1])
            weighted_avg = np.sum(vals * weights)
            
            same_dow_history[tid][dow] = {
                'lag_7': vals[-1],
                'lag_14': vals[-2] if len(vals) >= 2 else vals[-1],
                'lag_21': vals[-3] if len(vals) >= 3 else vals[-1],
                'lag_28': vals[-4] if len(vals) >= 4 else vals[-1],
                'mean_4w': np.mean(vals),
                'median_4w': np.median(vals),
                'weighted_4w': weighted_avg,
                'std_4w': np.std(vals) if len(vals) > 1 else 0
            }
        elif len(dow_data) > 0:
            baseline = dow_data['audience_count'].mean()
            same_dow_history[tid][dow] = {
                'lag_7': baseline, 
                'lag_14': baseline, 
                'lag_21': baseline, 
                'lag_28': baseline,
                'mean_4w': baseline, 
                'median_4w': baseline, 
                'weighted_4w': baseline, 
                'std_4w': 0
            }

recent_7d = {}
recent_14d = {}
recent_21d = {}
recent_28d = {}

for tid in train_df['book_theater_id'].unique():
    th_data = train_df[train_df['book_theater_id'] == tid].sort_values('show_date')
    baseline = th_mean.get(tid, glob_mean)
    recent_7d[tid] = th_data.tail(7)['audience_count'].mean() if len(th_data) >= 7 else baseline
    recent_14d[tid] = th_data.tail(14)['audience_count'].mean() if len(th_data) >= 14 else baseline
    recent_21d[tid] = th_data.tail(21)['audience_count'].mean() if len(th_data) >= 21 else baseline
    recent_28d[tid] = th_data.tail(28)['audience_count'].mean() if len(th_data) >= 28 else baseline

theater_data_count = train_df.groupby('book_theater_id').size().to_dict()

In [11]:
def predict_conservative(tid, dow, month, quarter, is_weekend, is_month_start):
    th_m = th_mean.get(tid, glob_mean)
    th_dow_m = th_dow_mean.get((tid, dow), th_m)
    th_month_m = th_month_mean.get((tid, month), th_m)
    th_month_dow_m = th_month_dow_mean.get((tid, month, dow), th_dow_m)
    
    same_dow = same_dow_history.get(tid, {}).get(dow, {})
    mean_4w = same_dow.get('mean_4w', th_dow_m)
    
    return (
        0.35 * th_dow_m +
        0.25 * th_month_dow_m +
        0.20 * mean_4w +
        0.15 * th_month_m +
        0.05 * th_m
    )

def predict_balanced(tid, dow, month, quarter, is_weekend, is_month_start):
    th_m = th_mean.get(tid, glob_mean)
    th_dow_m = th_dow_mean.get((tid, dow), th_m)
    th_month_dow_m = th_month_dow_mean.get((tid, month, dow), th_dow_m)
    th_month_weekend_m = th_month_weekend_mean.get((tid, month, is_weekend), th_month_dow_m)
    th_quarter_dow_m = th_quarter_dow_mean.get((tid, quarter, dow), th_dow_m)
    
    same_dow = same_dow_history.get(tid, {}).get(dow, {})
    weighted_4w = same_dow.get('weighted_4w', th_dow_m)
    lag7 = same_dow.get('lag_7', th_dow_m)
    
    r14 = recent_14d.get(tid, th_m)
    r21 = recent_21d.get(tid, th_m)
    
    return (
        0.18 * th_dow_m +
        0.20 * weighted_4w +
        0.15 * th_month_dow_m +
        0.14 * lag7 +
        0.08 * th_month_weekend_m +
        0.10 * r14 +               
        0.05 * th_quarter_dow_m +   
        0.05 * r21 +
        0.05 * th_m
    )

def predict_aggressive(tid, dow, month, quarter, is_weekend, is_month_start):
    th_m = th_mean.get(tid, glob_mean)
    th_dow_m = th_dow_mean.get((tid, dow), th_m)
    
    same_dow = same_dow_history.get(tid, {}).get(dow, {})
    lag7 = same_dow.get('lag_7', th_dow_m)
    lag14 = same_dow.get('lag_14', th_dow_m)
    weighted_4w = same_dow.get('weighted_4w', th_dow_m)
    
    r7 = recent_7d.get(tid, th_m)
    r14 = recent_14d.get(tid, th_m)
    
    th_month_dow_m = th_month_dow_mean.get((tid, month, dow), th_dow_m)
    
    return (
        0.30 * lag7 +
        0.20 * weighted_4w +
        0.15 * r14 +
        0.12 * th_dow_m +
        0.10 * th_month_dow_m +
        0.08 * lag14 +
        0.05 * r7
    )

def predict_ensemble(tid, dow, month, quarter, is_weekend, is_month_start):
    p_cons = predict_conservative(tid, dow, month, quarter, is_weekend, is_month_start)
    p_bal = predict_balanced(tid, dow, month, quarter, is_weekend, is_month_start)
    p_agg = predict_aggressive(tid, dow, month, quarter, is_weekend, is_month_start)
    
    data_count = theater_data_count.get(tid, 0)
    
    if data_count < 30:
        weights = [0.6, 0.3, 0.1]
    elif data_count < 100:
        weights = [0.3, 0.5, 0.2]
    else:
        weights = [0.2, 0.4, 0.4]
    
    ensemble = weights[0] * p_cons + weights[1] * p_bal + weights[2] * p_agg
    
    return max(ensemble, 0)

In [12]:
# Predict on validation data
val_preds = []
val_acts = []

for _, row in tqdm(val_df.iterrows(), total=len(val_df), desc="Validation"):
    val_acts.append(row['audience_count'])
    pred = predict_ensemble(
        row['book_theater_id'],
        row['dow'],
        row['month'],
        row['quarter'],
        row['is_weekend'],
        row['is_month_start']
    )
    val_preds.append(pred)

val_r2 = r2_score(val_acts, val_preds)

print(f"Validation R² = {val_r2:.4f}")

Validation: 100%|██████████| 41075/41075 [00:01<00:00, 22564.72it/s]

Validation R² = 0.4888





In [13]:
# Preparing training features on full data
th_mean = visits.groupby('book_theater_id')['audience_count'].mean().to_dict()
th_median = visits.groupby('book_theater_id')['audience_count'].median().to_dict()
th_dow_mean = visits.groupby(['book_theater_id', 'dow'])['audience_count'].mean().to_dict()
th_dow_median = visits.groupby(['book_theater_id', 'dow'])['audience_count'].median().to_dict()
th_month_mean = visits.groupby(['book_theater_id', 'month'])['audience_count'].mean().to_dict()
th_month_median = visits.groupby(['book_theater_id', 'month'])['audience_count'].median().to_dict()
th_quarter_mean = visits.groupby(['book_theater_id', 'quarter'])['audience_count'].mean().to_dict()
th_weekend_mean = visits.groupby(['book_theater_id', 'is_weekend'])['audience_count'].mean().to_dict()
th_month_dow_mean = visits.groupby(['book_theater_id', 'month', 'dow'])['audience_count'].mean().to_dict()
th_month_weekend_mean = visits.groupby(['book_theater_id', 'month', 'is_weekend'])['audience_count'].mean().to_dict()
th_quarter_dow_mean = visits.groupby(['book_theater_id', 'quarter', 'dow'])['audience_count'].mean().to_dict()
th_month_start_mean = visits.groupby(['book_theater_id', 'is_month_start'])['audience_count'].mean().to_dict()

glob_mean = visits['audience_count'].mean()
glob_dow_mean = visits.groupby('dow')['audience_count'].mean().to_dict()
glob_month_mean = visits.groupby('month')['audience_count'].mean().to_dict()
glob_month_dow_mean = visits.groupby(['month', 'dow'])['audience_count'].mean().to_dict()
glob_quarter_mean = visits.groupby('quarter')['audience_count'].mean().to_dict()
glob_quarter_dow_mean = visits.groupby(['quarter', 'dow'])['audience_count'].mean().to_dict()

In [14]:
same_dow_history = {}
for tid in visits['book_theater_id'].unique():
    th_data = visits[visits['book_theater_id'] == tid].copy()
    same_dow_history[tid] = {}
    
    for dow in range(7):
        dow_data = th_data[th_data['dow'] == dow].sort_values('show_date')
        
        if len(dow_data) >= 4:
            vals = dow_data.tail(4)['audience_count'].values
            weights_arr = np.array([0.4, 0.3, 0.2, 0.1])
            weighted_avg = np.sum(vals * weights_arr)
            
            same_dow_history[tid][dow] = {
                'lag_7': vals[-1], 
                'lag_14': vals[-2] if len(vals) >= 2 else vals[-1],
                'lag_21': vals[-3] if len(vals) >= 3 else vals[-1], 
                'lag_28': vals[-4] if len(vals) >= 4 else vals[-1],
                'mean_4w': np.mean(vals), 
                'median_4w': np.median(vals), 
                'weighted_4w': weighted_avg, 
                'std_4w': np.std(vals) if len(vals) > 1 else 0
            }
        elif len(dow_data) > 0:
            baseline = dow_data['audience_count'].mean()
            same_dow_history[tid][dow] = {
                'lag_7': baseline, 
                'lag_14': baseline, 
                'lag_21': baseline, 
                'lag_28': baseline,
                'mean_4w': baseline, 
                'median_4w': baseline, 
                'weighted_4w': baseline, 
                'std_4w': 0
            }

recent_7d = {}
recent_14d = {}
recent_21d = {}
recent_28d = {}
for tid in visits['book_theater_id'].unique():
    th_data = visits[visits['book_theater_id'] == tid].sort_values('show_date')
    baseline = th_mean.get(tid, glob_mean)
    recent_7d[tid] = th_data.tail(7)['audience_count'].mean() if len(th_data) >= 7 else baseline
    recent_14d[tid] = th_data.tail(14)['audience_count'].mean() if len(th_data) >= 14 else baseline
    recent_21d[tid] = th_data.tail(21)['audience_count'].mean() if len(th_data) >= 21 else baseline
    recent_28d[tid] = th_data.tail(28)['audience_count'].mean() if len(th_data) >= 28 else baseline

theater_data_count = visits.groupby('book_theater_id').size().to_dict()

In [15]:
# Formatting test dataset
test_df = sample_submission.copy()
test_df[['book_theater_id', 'show_date']] = test_df['ID'].str.rsplit('_', n=1, expand=True)
test_df['show_date'] = pd.to_datetime(test_df['show_date'])

# Preparing test features
test_df['dow'] = test_df['show_date'].dt.dayofweek
test_df['month'] = test_df['show_date'].dt.month
test_df['quarter'] = test_df['show_date'].dt.quarter
test_df['is_weekend'] = test_df['dow'].isin([5, 6]).astype(int)
test_df['is_month_start'] = test_df['show_date'].dt.day <= 7

In [16]:
# Test prediction
test_df['pred'] = test_df.apply(
    lambda row: predict_ensemble(
        row['book_theater_id'],
        row['dow'],
        row['month'],
        row['quarter'],
        row['is_weekend'],
        row['is_month_start']
    ),
    axis=1
)

# Smoothening last 2 days
smoothed = []
for tid in tqdm(test_df['book_theater_id'].unique(), desc="Smoothing"):
    tdf = test_df[test_df['book_theater_id'] == tid].sort_values('show_date').copy()
    tdf['pred'] = tdf['pred'].rolling(2, min_periods=1, center=False).mean()
    smoothed.append(tdf)

# Submission
test_df = pd.concat(smoothed, ignore_index=True)
test_df['pred'] = test_df['pred'].clip(lower=0)

submission = test_df[['ID', 'pred']].copy()
submission.columns = ['ID', 'audience_count']
submission.to_csv('submission.csv', index=False)

print('Submission saved!')

Smoothing: 100%|██████████| 827/827 [00:02<00:00, 279.42it/s]


Submission saved!
