In [1]:
import pandas as pd
import numpy as np
import statsmodels as sts
import matplotlib.pyplot as plt
from textblob import TextBlob
import nltk
from concurrent.futures import ThreadPoolExecutor, as_completed
import seaborn as sns
import os

In [2]:
movie_metadata_path = './Data/movie.metadata.tsv'
plot_summaries_path = './Data/plot_summaries.txt'

In [3]:
movie_metadata = pd.read_csv(
    movie_metadata_path, sep='\t', header=None, 
    names=[
        'movie_wikipedia_id', 'movie_freebase_id', 'movie_name', 
        'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres'
    ]
)

plot_summaries = pd.read_csv(
    plot_summaries_path, sep='\t', header=None, names=['movie_wikipedia_id', 'plot_summary'])

In [4]:
cmu_merged = pd.merge(movie_metadata, plot_summaries, on='movie_wikipedia_id')

In [5]:
len(cmu_merged['movie_name'].unique())

39914

# Reviews

The most overlap with CMU movies provides IMDB dataset from their website https://developer.imdb.com/non-commercial-datasets/

Let's download and unpack corresponding files (they are ~300 MB)

In [6]:
if not os.path.isdir('imdb'):
    os.mkdir('imdb')

In [7]:
%%capture
!wget -P imdb https://datasets.imdbws.com/title.basics.tsv.gz
!wget -P imdb https://datasets.imdbws.com/title.ratings.tsv.gz

!gunzip imdb/title.basics.tsv.gz
!gunzip imdb/title.ratings.tsv.gz

In [8]:
imdb_title_basics = pd.read_csv('imdb/title.basics.tsv', sep='\t')
imdb_title_ratings = pd.read_csv('imdb/title.ratings.tsv', sep='\t')

  imdb_title_basics = pd.read_csv('imdb/title.basics.tsv', sep='\t')


In [9]:
imdb_title_basics = imdb_title_basics[['tconst', 'primaryTitle']]

In [10]:
imdb_merged = pd.merge(imdb_title_ratings, imdb_title_basics, on='tconst')
imdb_merged = imdb_merged.drop(columns=['tconst']).drop_duplicates(subset=['primaryTitle'])

In [11]:
cmu_with_reviews = pd.merge(imdb_merged, cmu_merged, left_on='primaryTitle', right_on='movie_name', how='right')
cmu_with_reviews = cmu_with_reviews.drop(columns=['primaryTitle'])

In [12]:
notna_rating_frac = cmu_with_reviews['averageRating'].notna().mean()
print(f'We have average IMDB rating for {notna_rating_frac * 100:.0f}% of films')

We have average IMDB rating for 82% of films


# Budgets & Revenue

In [13]:
bdgts = pd.read_csv("movies_metadata.csv")
bdgts = bdgts[['budget', 'revenue', 'original_title']]

bdgts["budget"] = pd.to_numeric(bdgts['budget'], errors='coerce')
bdgts.dropna(subset=['budget'], inplace=True)

  bdgts = pd.read_csv("movies_metadata.csv")


In [14]:
cmu_with_budgets = pd.merge(
    cmu_with_reviews, bdgts, left_on='movie_name', right_on='original_title', how='left') 

cmu_with_budgets['revenue'] = np.where(
    cmu_with_budgets['revenue_x'].notna(), 
    cmu_with_budgets['revenue_x'], 
    cmu_with_budgets['revenue_y']
)

cmu_with_budgets.drop(columns=['revenue_x', 'revenue_y'], inplace=True)

In [15]:
notna_revenue_frac = cmu_with_budgets['revenue'].notna().mean()
print(f'We have average IMDB rating for {notna_revenue_frac * 100:.0f}% of films')

We have average IMDB rating for 49% of films


In [16]:
notna_budget_frac = cmu_with_budgets['budget'].notna().mean()
print(f'We have average IMDB rating for {notna_budget_frac * 100:.0f}% of films')

We have average IMDB rating for 46% of films
