# Initial Analysis - CMU Movie Corpus Dataset

In [31]:
import pandas as pd
import numpy as np
import statsmodels as sts
import matplotlib.pyplot as plt
from textblob import TextBlob
import nltk
from concurrent.futures import ThreadPoolExecutor, as_completed
import seaborn as sns

In [32]:
plot_summaries_path = './Data/plot_summaries.txt'
movie_metadata_path = './Data/movie.metadata.tsv'
character_metadata_path = './Data/character.metadata.tsv'
tv_tropes_path = './Data/tvtropes.clusters.txt'
name_clusters_path = './Data/name.clusters.txt'

In [63]:
movie_metadata = pd.read_csv(
    movie_metadata_path, sep='\t', header=None, 
    names=[
        'movie_wikipedia_id', 'movie_freebase_id', 'movie_name', 
        'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres'
    ]
)

plot_summaries = pd.read_csv(
    plot_summaries_path, sep='\t', header=None, names=['movie_wikipedia_id', 'plot_summary'])

In [65]:
cmu_merged = pd.merge(movie_metadata, plot_summaries, on='movie_wikipedia_id')

https://www.kaggle.com/datasets/eliasdabbas/boxofficemojo-alltime-domestic-data

In [66]:
box_off = pd.read_csv('Data/boxoffice.csv')

## Merged revenue info

In [67]:
box_off

Unnamed: 0,rank,title,studio,lifetime_gross,year
0,1,Star Wars: The Force Awakens,BV,936662225,2015
1,2,Avatar,Fox,760507625,2009
2,3,Black Panther,BV,700059566,2018
3,4,Avengers: Infinity War,BV,678681680,2018
4,5,Titanic,Par.,659363944,1997
...,...,...,...,...,...
15738,15739,Dog Eat Dog,IFC,80,2009
15739,15740,Paranoid Girls,,78,2015
15740,15741,Confession of a Child of the Century,Cohen,74,2015
15741,15742,Storage 24,Magn.,72,2013


In [68]:
cmu_merged.shape

(42204, 10)

In [69]:
mrgd = pd.merge(cmu_merged, box_off, right_on='title', left_on='movie_name', how='left')

In [70]:
mrgd['final_rev'] = np.where(mrgd['revenue'].notna(), mrgd['revenue'], mrgd['lifetime_gross'])

In [71]:
mrgd['final_rev'].notna().mean()

0.267571658310472

In [72]:
movie_metadata['revenue'].notna().mean()

0.1027758407653442

## Add budgets info

In [73]:
bdgts = pd.read_csv("Data/movies_metadata.csv")

  bdgts = pd.read_csv("Data/movies_metadata.csv")


In [74]:
bdgts = bdgts[['budget', 'revenue', 'original_title']]

In [75]:
bdgts.shape

(45466, 3)

In [76]:
bdgts["budget"] = pd.to_numeric(bdgts['budget'], errors='coerce')

In [77]:
bdgts.dropna(subset=['budget'], inplace=True)

In [78]:
bdgts.shape

(45463, 3)

In [79]:
bdgts["budget"].value_counts()

0.0            36573
5000000.0        286
10000000.0       259
20000000.0       243
2000000.0        242
               ...  
270000000.0        1
923.0              1
72500000.0         1
2160000.0          1
1254040.0          1
Name: budget, Length: 1223, dtype: int64

In [80]:
cmu_merged.head()

Unnamed: 0,movie_wikipedia_id,movie_freebase_id,movie_name,release_date,revenue,runtime,languages,countries,genres,plot_summary
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","Set in the second half of the 22nd century, th..."
1,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",A series of murders of rich young women throug...
2,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}","Eva, an upper class housewife, becomes frustra..."
3,18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa""}","{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant...","Every hundred years, the evil Morgana returns..."
4,6631279,/m/0gffwj,Little city,1997-04-04,,93.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3n0w"": ...","Adam, a San Francisco-based artist who works a..."


In [81]:
mrgd = pd.merge(cmu_merged, bdgts, left_on = 'movie_name', right_on = 'original_title', how='left') 

In [82]:
cmu_merged['movie_name'].nunique()

39914

In [88]:
cmu_merged[cmu_merged['revenue'] > 0]['movie_name'].nunique()

7423

In [89]:
mrgd[mrgd['budget'] > 0]['movie_name'].nunique()

5216

In [90]:
mrgd['final_rev'] = np.where(mrgd['revenue_x'].notna(), mrgd['revenue_x'], mrgd['revenue_y'])

In [91]:
mrgd[mrgd['final_rev'] > 0]['movie_name'].nunique()

8428