# Initial Analysis - CMU Movie Corpus Dataset

In [31]:
import pandas as pd
import numpy as np
import statsmodels as sts
import matplotlib.pyplot as plt
from textblob import TextBlob
import nltk
from concurrent.futures import ThreadPoolExecutor, as_completed
import seaborn as sns

In [32]:
plot_summaries_path = './Data/plot_summaries.txt'
movie_metadata_path = './Data/movie.metadata.tsv'
character_metadata_path = './Data/character.metadata.tsv'
tv_tropes_path = './Data/tvtropes.clusters.txt'
name_clusters_path = './Data/name.clusters.txt'

In [34]:
plot_summaries = pd.read_csv(plot_summaries_path, sep='\t', header=None, names=['movie_wikipedia_id', 'plot_summary'])
movie_metadata = pd.read_csv(movie_metadata_path, sep='\t', header=None, names=['movie_wikipedia_id', 'movie_freebase_id', 'movie_name', 'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres'])
character_metadata = pd.read_csv(character_metadata_path, sep='\t', header=None, names=['movie_wikipedia_id', 'movie_freebase_id', 'movie_release_date', 'character_name', 'actor_dob', 'actor_gender', 'actor_height', 'actor_ethnicity','actor_name','actor_age', 'character_freebase_map', 'freebase_character_id', 'freebase_actor_id'])
tv_tropes = pd.read_csv(tv_tropes_path, sep='\t', header=None, names=['trope', 'data'])
name_clusters = pd.read_csv(name_clusters_path, sep='\t', header=None, names=['character_name', 'id'])

https://www.kaggle.com/datasets/eliasdabbas/boxofficemojo-alltime-domestic-data

In [36]:
box_off = pd.read_csv('Data/boxoffice.csv')

## Merged revenue info

In [66]:
box_off

Unnamed: 0,rank,title,studio,lifetime_gross,year
0,1,Star Wars: The Force Awakens,BV,936662225,2015
1,2,Avatar,Fox,760507625,2009
2,3,Black Panther,BV,700059566,2018
3,4,Avengers: Infinity War,BV,678681680,2018
4,5,Titanic,Par.,659363944,1997
...,...,...,...,...,...
15738,15739,Dog Eat Dog,IFC,80,2009
15739,15740,Paranoid Girls,,78,2015
15740,15741,Confession of a Child of the Century,Cohen,74,2015
15741,15742,Storage 24,Magn.,72,2013


In [70]:
movie_metadata.shape

(81741, 9)

In [68]:
mrgd = pd.merge(movie_metadata, box_off, right_on='title', left_on='movie_name', how='left')

In [71]:
mrgd['final_rev'] = np.where(mrgd['revenue'].notna(), mrgd['revenue'], mrgd['lifetime_gross'])

In [72]:
mrgd['final_rev'].notna().mean()

0.17685146724656095

In [73]:
movie_metadata['revenue'].notna().mean()

0.1027758407653442

## Add budgets info

In [34]:
bdgts = pd.read_csv("Data/movies_metadata.csv")

  bdgts = pd.read_csv("Data/movies_metadata.csv")


In [35]:
bdgts = bdgts[['budget', 'revenue', 'original_title']]

In [37]:
bdgts.shape

(45466, 3)

In [38]:
bdgts["budget"] = pd.to_numeric(bdgts['budget'], errors='coerce')

In [39]:
bdgts.dropna(subset=['budget'], inplace=True)

In [40]:
bdgts.shape

(45463, 3)

In [41]:
bdgts["budget"].value_counts()

0.0            36573
5000000.0        286
10000000.0       259
20000000.0       243
2000000.0        242
               ...  
270000000.0        1
923.0              1
72500000.0         1
2160000.0          1
1254040.0          1
Name: budget, Length: 1223, dtype: int64

In [42]:
movie_metadata.head()

Unnamed: 0,movie_wikipedia_id,movie_freebase_id,movie_name,release_date,revenue,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [55]:
mrgd = pd.merge(movie_metadata, bdgts, left_on = 'movie_name', right_on = 'original_title', how='left') 

In [56]:
movie_metadata[]['movie_name'].nunique()

75478

In [57]:
movie_metadata[movie_metadata['revenue'] > 0]['movie_name'].nunique()

8207

In [58]:
mrgd[mrgd['budget'] > 0]['movie_name'].nunique()

5755

In [59]:
mrgd['final_rev'] = np.where(mrgd['revenue_x'].notna(), mrgd['revenue_x'], mrgd['revenue_y'])

In [60]:
mrgd[mrgd['final_rev'] > 0]['movie_name'].nunique()

9473