In [None]:
# AI chat in this link: https://gemini.google.com/share/eb121f0244a9

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss

# 1. Load Data
df = pd.read_csv('Final_best_picture_data.csv')

# --- Feature Engineering ---

# Clean Runtime: "141 min" -> 141 (int)
df['Runtime_clean'] = df['Runtime'].astype(str).str.replace(' min', '', regex=False)
df['Runtime_clean'] = pd.to_numeric(df['Runtime_clean'], errors='coerce')

# Convert Booleans to Integers (0/1)
bool_cols = ['Nominated_Both_Director_and_Picture', 'Golden_Globe_Picture_Winner', 'pga_winner']
for col in bool_cols:
    df[col] = df[col].astype(int)

# Process Genre: One-Hot Encoding for Multi-label strings
# e.g., "Biography, Drama" -> is_Biography=1, is_Drama=1
# Step 1: Create a list of all genres
all_genres = set()
for genres in df['Genre'].dropna():
    parts = [g.strip() for g in genres.split(',')]
    all_genres.update(parts)

# Step 2: Create columns for each genre
for g in all_genres:
    # Clean genre name for column header (remove spaces/special chars if needed)
    col_name = f"genre_{g.replace(' ', '_').replace('-', '_')}"
    df[col_name] = df['Genre'].apply(lambda x: 1 if isinstance(x, str) and g in [p.strip() for p in x.split(',')] else 0)

# Define Feature Columns
feature_cols = ['Runtime_clean', 'Meta_score'] + bool_cols + [c for c in df.columns if c.startswith('genre_')]

# Fill missing values if any (simple mean imputation for numeric, though inspection showed clean data)
df['Meta_score'] = df['Meta_score'].fillna(df['Meta_score'].mean())
df['Runtime_clean'] = df['Runtime_clean'].fillna(df['Runtime_clean'].mean())

# --- Train / Test Split ---
# Requirement: Train until year_ceremony == 2020 (included)
train_df = df[df['year_ceremony'] <= 2020].copy()
test_df = df[df['year_ceremony'] >= 2021].copy()

# Prepare X and y
X_train = train_df[feature_cols]
y_train = train_df['winner'].astype(int)

X_test = test_df[feature_cols]
y_test = test_df['winner'].astype(int)

# --- MinMaxScaler ---
# Scale numerical features. (Tree-based models don't strictly require this, but user requested it)
scaler = MinMaxScaler()
# We fit on training data and transform both
scale_cols = ['Runtime_clean', 'Meta_score']
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

# --- Ensemble Learning Model ---
# Using Random Forest as the Ensemble method
rf_model = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=5) # max_depth is chosen 5 to ensure the model focuses on the most powerful winning signals, like the PGA award
rf_model.fit(X_train, y_train)

# --- Probability Ranking & Evaluation ---

# Predict Probabilities (Class 1 = Winner)
# predict_proba returns [prob_class_0, prob_class_1]
probs_train = rf_model.predict_proba(X_train)[:, 1]
probs_test = rf_model.predict_proba(X_test)[:, 1]

# Assign probabilities back to the DataFrame for viewing
test_df['predicted_prob'] = probs_test

# Evaluation Loop: Rank films by probability for each ceremony year
print("--- Prediction Results (2021-Present) ---")
print("Ranking films by predicted probability of winning Best Picture:\n")

unique_years = sorted(test_df['year_ceremony'].unique())

for year in unique_years:
    print(f"=== Ceremony Year {year} ===")

    # Get the slice for this year
    year_data = test_df[test_df['year_ceremony'] == year].copy()

    # Sort by predicted probability (Descending)
    year_data = year_data.sort_values(by='predicted_prob', ascending=False)

    # Create a rank column
    year_data['Rank'] = range(1, len(year_data) + 1)

    # Display the top results
    display_cols = ['Rank', 'film', 'predicted_prob', 'winner']
    print(year_data[display_cols].to_string(index=False))

    # Check if the winner was correctly predicted (Rank 1)
    actual_winner = year_data[year_data['winner'] == True]
    if not actual_winner.empty:
        winner_rank = actual_winner.iloc[0]['Rank']
        winner_name = actual_winner.iloc[0]['film']
        print(f"\n-> Actual Winner: {winner_name} (Ranked #{winner_rank} by model)")
    else:
        print("\n-> Actual winner data not found or no winner in list.")

    print("-" * 40 + "\n")

# Optional: Feature Importance
print("--- Feature Importance ---")
importances = pd.Series(rf_model.feature_importances_, index=feature_cols).sort_values(ascending=False)
print(importances.head(10))

--- Prediction Results (2021-Present) ---
Ranking films by predicted probability of winning Best Picture:

=== Ceremony Year 2021 ===
 Rank                        film  predicted_prob  winner
    1                   Nomadland        0.522697    True
    2       Promising Young Woman        0.157883   False
    3                      Minari        0.097470   False
    4                        Mank        0.081800   False
    5 Judas and the Black Messiah        0.059278   False
    6  The Trial of the Chicago 7        0.039415   False
    7                  The Father        0.029289   False
    8              Sound of Metal        0.023248   False

-> Actual Winner: Nomadland (Ranked #1 by model)
----------------------------------------

=== Ceremony Year 2022 ===
 Rank                 film  predicted_prob  winner
    1 The Power of the Dog        0.354077   False
    2      West Side Story        0.224625   False
    3                 CODA        0.205117    True
    4         Drive M