In [3]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [6]:
# Load the raw movie metadata; the path is relative to the notebook's working directory.
csv_path = 'data/movie_metadata.csv'
df = pd.read_csv(csv_path)
print(f"Original dataset shape: {df.shape}")

Original dataset shape: (5043, 28)


In [24]:
# Rows without a gross figure cannot be labelled, so remove them before building the target.
df = df.dropna(axis=0, how='any', subset=['gross'])

In [None]:
# creating target variable

# A movie is labelled a blockbuster (1) when its gross is at least 100,000,000;
# every other movie is labelled 0.
df['blockbuster'] = df['gross'].ge(100_000_000).astype(int)
print(f"Blockbuster distribution:\n{df['blockbuster'].value_counts()}")

Blockbuster distribution:
blockbuster
0    3554
1     605
Name: count, dtype: int64


In [26]:
# The target was derived from 'gross', so the column is removed to keep it out of the features.
df = df.drop('gross', axis=1)

In [None]:
# creating theme_text feature

# Missing keywords/genres become empty strings so the concatenation below never yields NaN.
for text_col in ('plot_keywords', 'genres'):
    df[text_col] = df[text_col].fillna('')

# Join both columns with a space, then turn the '|' separators into spaces so the
# result is a single whitespace-delimited string per movie.
combined_text = df['plot_keywords'] + ' ' + df['genres']
df['theme_text'] = combined_text.str.replace('|', ' ', regex=False)

In [None]:
# defining features

# Column groups, one list per preprocessing route (numeric / categorical / text).
# NOTE(review): 'imdb_score' and 'num_critic_for_reviews' are presumably only known
# after release - confirm they are acceptable inputs for the intended prediction time.
numeric_features = [
    'budget',
    'num_critic_for_reviews',
    'duration',
    'title_year',
    'imdb_score',
]
categorical_features = ['country']
text_features = ['theme_text']

In [29]:
# Keep only the modelling columns (all feature groups plus the target) in an
# independent copy, so later edits do not touch `df`.
feature_columns = [*numeric_features, *categorical_features, *text_features, 'blockbuster']
df_model = df.loc[:, feature_columns].copy()

In [30]:
# Sanity check: (rows, columns) of the frame that feeds the train/test split.
print(f"Final dataset shape: {df_model.shape}")

Final dataset shape: (4159, 8)


In [31]:
# Separate the target from the predictors.
y = df_model['blockbuster']
X = df_model.drop(columns='blockbuster')

# 80/20 hold-out split; stratifying on y keeps the blockbuster ratio the same in
# both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

Training set: (3327, 7), Test set: (832, 7)


In [None]:
# building preprocessing pipelines

# Numeric columns: fill gaps with the median, then standardise.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode;
# categories not seen during fit are ignored at transform time instead of raising.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Text column: TF-IDF over unigrams and bigrams that occur in at least 5 documents.
text_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(min_df=5, ngram_range=(1, 2))),
])

# The text column is given as a plain string (not a list) so the vectorizer
# receives a 1-D column of documents rather than a 2-D frame.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
    ('text', text_pipeline, 'theme_text'),
])

In [33]:
# Candidate classifiers, keyed by the display name used in the reports.
# Every estimator gets the same seed so runs are repeatable.
models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000,
        random_state=42,
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        random_state=42,
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100,
        random_state=42,
    ),
}

In [None]:
# training and evaluating

# One record per model: its test-set metrics plus the fitted pipeline.
results = []

for model_name, model in models.items():
    print(f"\nTraining {model_name}")

    # Each pipeline gets its own unfitted copy of the preprocessor. Without the
    # clone, every pipeline stored in `results` would hold the very same
    # ColumnTransformer object, refit on each iteration, and would only stay
    # consistent because all fits happen to use the same training data.
    full_pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', model)
    ])

    full_pipeline.fit(X_train, y_train)

    # Hard labels for the threshold-based metrics; positive-class probability
    # (column 1) for ROC-AUC.
    y_pred = full_pipeline.predict(X_test)
    y_pred_proba = full_pipeline.predict_proba(X_test)[:, 1]

    # metrics
    acc = accuracy_score(y_test, y_pred)
    # zero_division=0: a model that predicts no positives scores 0 precision
    # explicitly instead of emitting an undefined-metric warning.
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)

    results.append({
        'Model': model_name,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-Score': f1,
        'ROC-AUC': auc,
        'Pipeline': full_pipeline
    })

    print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}, ROC-AUC: {auc:.4f}")


Training Logistic Regression


Accuracy: 0.8882, Precision: 0.7188, Recall: 0.3802, F1: 0.4973, ROC-AUC: 0.8911

Training Random Forest
Accuracy: 0.8966, Precision: 0.8431, Recall: 0.3554, F1: 0.5000, ROC-AUC: 0.8739

Training Gradient Boosting
Accuracy: 0.8990, Precision: 0.7467, Recall: 0.4628, F1: 0.5714, ROC-AUC: 0.9189


In [None]:
# comparing models

print("MODEL COMPARISON (Sorted by ROC-AUC)")

# Best ROC-AUC first; later cells read the top row as the winning model.
# NOTE(review): the ranking uses metrics computed on the test split, so the
# winner's reported scores are optimistic - consider selecting via cross-validation.
results_df = pd.DataFrame(results).sort_values('ROC-AUC', ascending=False)

# The fitted 'Pipeline' objects stay in results_df but are left out of the printed table.
report_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
print(results_df[report_columns].to_string(index=False))

MODEL COMPARISON (Sorted by ROC-AUC)
              Model  Accuracy  Precision   Recall  F1-Score  ROC-AUC
  Gradient Boosting  0.899038   0.746667 0.462810  0.571429 0.918919
Logistic Regression  0.888221   0.718750 0.380165  0.497297 0.891051
      Random Forest  0.896635   0.843137 0.355372  0.500000 0.873918


In [None]:
# best model evaluation

# results_df is sorted by ROC-AUC in descending order, so the first row is the winner.
best_model_row = results_df.iloc[0]
best_model_name = best_model_row['Model']
best_pipeline = best_model_row['Pipeline']

print(f"BEST MODEL: {best_model_name} (ROC-AUC: {best_model_row['ROC-AUC']:.4f})")

y_pred_best = best_pipeline.predict(X_test)

# Passing labels=[0, 1] pins the class order, so the two target names and the
# 2x2 matrix indexing below stay valid even if one class is never predicted.
print("\nClassification report:")
print(classification_report(y_test, y_pred_best, labels=[0, 1],
                            target_names=['Not blockbuster', 'Blockbuster']))

print("\nConfusion matrix:")
cm = confusion_matrix(y_test, y_pred_best, labels=[0, 1])

# Fixed widths for the row-label column and each value column keep the header
# and both rows aligned (the previous hand-padded labels differed in length).
row_label_width = 20
col_width = 12
print(f"{'':<{row_label_width}}Predicted")
print(f"{'':<{row_label_width}}{'Not BB':<{col_width}}{'Blockbuster':<{col_width}}")
print(f"{'Actual not BB':<{row_label_width}}{cm[0, 0]:<{col_width}}{cm[0, 1]:<{col_width}}")
print(f"{'Actual blockbuster':<{row_label_width}}{cm[1, 0]:<{col_width}}{cm[1, 1]:<{col_width}}")

BEST MODEL: Gradient Boosting (ROC-AUC: 0.9189)

Classification report:
                 precision    recall  f1-score   support

Not blockbuster       0.91      0.97      0.94       711
    Blockbuster       0.75      0.46      0.57       121

       accuracy                           0.90       832
      macro avg       0.83      0.72      0.76       832
   weighted avg       0.89      0.90      0.89       832


Confusion matrix:
                  Predicted
                  Not BB    Blockbuster
Actual not BB     692        19        
Actual blockbuster 65         56        
