In [2]:
import pandas as pd

# Notebook to create a meaningful movie data set
Two data sources
- the **IMDb data sets**, available for download at this [link](https://datasets.imdbws.com/) (not stored on GitHub; download the relevant files before running the notebook). They contain a lot of information (ratings, etc.) but no box-office figures, and they come in an inconvenient, SQL-like layout (one table per file).
- **boxoffice stats**, for now from [link](https://www.boxofficemojo.com/year/2015/)

**Goal**: for each 2015-2020 US movie get ratings, boxoffice, director names, main actor names... 

## Processing the IMDb data sets
The data set is organized like a relational (SQL) database, each table being a compressed TSV file. We are going to filter the data and merge the files to gather all the information needed about a movie into a single CSV file.
### Filter relevant years only

In [64]:
# Load the IMDb title table (tab-separated, `\N` marks a missing value).
# quoting=3 is csv.QUOTE_NONE: the file is parsed as raw tab-separated text, so a title
# that starts with a literal double quote is not treated as the opening of a quoted
# field (which can merge several rows into one value).
# low_memory=False infers each column dtype from the whole file rather than chunk by
# chunk, which avoids mixed-type columns and the DtypeWarning they trigger.
df1 = pd.read_csv(
    './data/title.basics.tsv.gz',
    compression='gzip',
    header=0,
    sep='\t',
    na_values='\\N',
    quoting=3,
    low_memory=False,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [65]:
# Sanity check on the raw table: every row whose primary title contains "Star Wars: Episode V".
star_wars_mask = df1["primaryTitle"].str.contains("Star Wars: Episode V", na=False)
df1.loc[star_wars_mask]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
78976,tt0080684,movie,Star Wars: Episode V - The Empire Strikes Back,Star Wars: Episode V - The Empire Strikes Back,0.0,1980.0,,124.0,"Action,Adventure,Fantasy"
84322,tt0086190,movie,Star Wars: Episode VI - Return of the Jedi,Star Wars: Episode VI - Return of the Jedi,0.0,1983.0,,131.0,"Action,Adventure,Fantasy"
1318730,tt10618318,tvEpisode,Star Wars: Episode V - The Empire Strikes Back,Star Wars: Episode V - The Empire Strikes Back,0.0,2019.0,,,Comedy
1326464,tt10631820,tvEpisode,Star Wars: Episode VI - Return of the Jedi,Star Wars: Episode VI - Return of the Jedi,0.0,2019.0,,,Comedy
1486139,tt10910686,tvEpisode,Star Wars: Episode V - The Empire Strikes Back,Star Wars: Episode V - The Empire Strikes Back,0.0,2019.0,,127.0,Talk-Show
1515793,tt10962828,tvEpisode,Everything GREAT About Star Wars: Episode VII ...,Everything GREAT About Star Wars: Episode VII ...,0.0,2016.0,,15.0,Comedy
1535270,tt10996576,tvEpisode,Star Wars: Episode VI - Return of the Jedi,Star Wars: Episode VI - Return of the Jedi,0.0,2019.0,,133.0,Talk-Show
1603022,tt11114174,tvEpisode,RICK REACTS ~ Star Wars: Episode VII - The For...,RICK REACTS ~ Star Wars: Episode VII - The For...,0.0,2019.0,,70.0,Reality-TV
1685186,tt11261064,tvEpisode,Everything GREAT About Star Wars: Episode V - ...,Everything GREAT About Star Wars: Episode V - ...,0.0,2017.0,,22.0,Comedy
1685193,tt11261078,tvEpisode,Everything GREAT About Star Wars: Episode VI -...,Everything GREAT About Star Wars: Episode VI -...,0.0,2017.0,,24.0,Comedy


In [66]:
# Keep feature films only, with a start year inside [2014, 2021]
# (the 2015-2020 target window plus a one-year margin on each side).
in_year_window = df1['startYear'].between(2014, 2021)  # inclusive on both ends
is_movie = df1['titleType'].eq("movie")
df1 = df1.loc[in_year_window & is_movie]
df1.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
11059,tt0011216,movie,Spanish Fiesta,La fête espagnole,0.0,2019.0,,67.0,Drama
11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0.0,2019.0,,,"Action,Crime"
16657,tt0016906,movie,Frivolinas,Frivolinas,0.0,2014.0,,80.0,"Comedy,Musical"
61124,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0.0,2020.0,,70.0,Drama
64098,tt0065392,movie,Bucharest Memories,Amintiri bucurestene,0.0,2020.0,,,Documentary


In [67]:
# Same title search on the current df1: rows whose primary title contains "Star Wars: Episode V".
episode_mask = df1["primaryTitle"].str.contains("Star Wars: Episode V", na=False)
df1.loc[episode_mask]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
4993277,tt2488496,movie,Star Wars: Episode VII - The Force Awakens,Star Wars: Episode VII - The Force Awakens,0.0,2015.0,,138.0,"Action,Adventure,Sci-Fi"
5008541,tt2527336,movie,Star Wars: Episode VIII - The Last Jedi,Star Wars: Episode VIII - The Last Jedi,0.0,2017.0,,152.0,"Action,Adventure,Fantasy"


In [68]:
# get titles table (alternate / localized titles, one row per title and region)
# quoting=3 is csv.QUOTE_NONE: parse the file as raw tab-separated text so that a title
# beginning with a literal double quote is not read as the start of a quoted field.
# low_memory=False infers dtypes from the whole file, avoiding mixed-type columns and
# the DtypeWarning they trigger.
df = pd.read_csv(
    './data/title.akas.tsv.gz',
    compression='gzip',
    header=0,
    sep='\t',
    na_values='\\N',
    quoting=3,
    low_memory=False,
)
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0.0
1,tt0000001,2,Carmencita,DE,,,literal title,0.0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0.0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0.0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0.0


In [69]:
# Select only US and UK titles.
# The region column holds ISO 3166-1 alpha-2 country codes, in which the United Kingdom
# is "GB"; filtering on "UK" therefore does not select the British rows.
df2 = df[df["region"].isin(["US", "GB"])]

In [70]:
# Inner-join the filtered movies with the selected alternate-title rows on the IMDb title id,
# so only movies that have at least one selected-region title remain.
df_data = df1.merge(df2, how="inner", left_on="tconst", right_on="titleId")
df_data.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0.0,2020.0,,70.0,Drama,tt0062336,5,The Tango of the Widower and Its Distorting Mi...,US,,imdbDisplay,,0.0
1,tt0065392,movie,Bucharest Memories,Amintiri bucurestene,0.0,2020.0,,,Documentary,tt0065392,3,Bucharest Memories,US,,imdbDisplay,,0.0
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0.0,2018.0,,122.0,Drama,tt0069049,3,The Other Side of the Wind,US,,imdbDisplay,,0.0
3,tt0100275,movie,The Wandering Soap Opera,La telenovela errante,0.0,2017.0,,80.0,"Comedy,Drama,Fantasy",tt0100275,6,The Wandering Soap Opera,US,,imdbDisplay,,0.0
4,tt0112502,movie,Bigfoot,Bigfoot,0.0,2017.0,,,"Horror,Thriller",tt0112502,3,Bigfoot,US,,imdbDisplay,,0.0


## Box-office data from the web

In [71]:
# Scrape the yearly box-office tables (2015 to 2020 inclusive).
# The tables are collected in a list and concatenated once, so the code no longer
# depends on a hard-coded "first year" to initialise df_box and does not re-copy
# the accumulated frame on every iteration.
yearly_tables = []
for year in range(2015, 2021):
    # read_html returns every <table> on the page; the first one is used here.
    year_table = pd.read_html(f"https://www.boxofficemojo.com/year/{year}/")[0]
    year_table["Year"] = year
    yearly_tables.append(year_table)
# Each table keeps its own 0-based index, as before.
df_box = pd.concat(yearly_tables)

print(df_box.shape)
df_box.head()

(1200, 12)


Unnamed: 0,Rank,Release,Genre,Budget,Running Time,Gross,Theaters,Total Gross,Release Date,Distributor,Estimated,Year
0,1,Jurassic World,-,-,-,"$652,270,625",4291,"$652,270,625",Jun 12,Universal Pictures,False,2015
1,2,Star Wars: Episode VII - The Force Awakens,-,-,-,"$651,967,269",4134,"$936,662,225",Dec 18,Walt Disney Studios Motion Pictures,False,2015
2,3,Avengers: Age of Ultron,-,-,-,"$459,005,868",4276,"$459,005,868",May 1,Walt Disney Studios Motion Pictures,False,2015
3,4,Inside Out,-,-,-,"$356,461,711",4158,"$356,461,711",Jun 19,Walt Disney Studios Motion Pictures,False,2015
4,5,Furious 7,-,-,-,"$353,007,020",4022,"$353,007,020",Apr 3,Universal Pictures,False,2015


In [74]:
# Unmatched movies: box-office releases whose title has no exact match
# among the IMDb primary titles.
title_is_known = df_box["Release"].isin(df_data["primaryTitle"])
df_box.loc[~title_is_known]

Unnamed: 0,Rank,Release,Genre,Budget,Running Time,Gross,Theaters,Total Gross,Release Date,Distributor,Estimated,Year
150,151,The Green Inferno,-,-,-,"$7,192,291",1543,"$7,192,291",Sep 25,BH Tilt,False,2015
170,171,Hubble 3D,-,-,-,"$4,449,681",151,"$52,522,904",Mar 19,Warner Bros.,False,2015
198,199,The Oscar Nominated Short Films 2015: Live Action,-,-,-,"$2,412,593",280,"$2,412,593",Jan 30,Shorts International,False,2015
153,154,The Lobster2016 Re-release,-,-,-,"$8,700,374",560,"$8,700,374",May 13,A24,False,2016
173,174,The Meddler2016 Re-release,-,-,-,"$4,367,218",464,"$4,267,218",Apr 22,Sony Pictures Classics,False,2016
180,181,Kabali,-,-,-,"$3,903,095",236,"$3,903,095",Jul 21,-,False,2016
100,101,Leap!2017 Re-release,-,-,-,"$21,858,070",2705,"$21,858,070",Aug 25,The Weinstein Company,False,2017
185,186,Close Encounters of the Third Kind2017 Re-release,-,-,-,"$3,100,479",901,"$3,100,479",Sep 1,Sony Pictures Entertainment (SPE),False,2017
189,190,The Oscar Nominated Short Films 2017: Live Action,-,-,-,"$2,835,355",272,"$2,835,355",Feb 10,Shorts International,False,2017
195,196,Showtime Championship Boxing: Floyd Mayweather...,-,-,-,"$2,620,183",532,"$2,620,183",Aug 26,Fathom Events,False,2017


In [75]:
# merge boxoffice data with IMDb (exact title match)
candidates = (
    pd.merge(df_data, df_box, how="outer", left_on="primaryTitle", right_on="Release")
    # Rows without a box-office release are excluded by the groupby below anyway
    # (NaN keys are dropped), so discard them before sorting.
    .dropna(subset=["Release"])
    # Distance between the IMDb start year and the box-office year; NaN when the
    # release has no IMDb match.
    .assign(_year_gap=lambda frame: (frame["startYear"] - frame["Year"]).abs())
    # Stable sort: the closest year comes first, ties keep their original order,
    # and NaN gaps go last.
    .sort_values("_year_gap", kind="stable")
)
# remove duplicates: several IMDb rows can share one title (different films with the
# same name, or several alternate-title rows), so keep for each release the candidate
# whose start year is closest to the box-office year instead of an arbitrary first row.
df4 = candidates.groupby("Release").agg(lambda x: x.iloc[0]).drop(columns="_year_gap")
print(df4.shape)
df4.head()

(1090, 28)


Unnamed: 0_level_0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,titleId,...,Genre,Budget,Running Time,Gross,Theaters,Total Gross,Release Date,Distributor,Estimated,Year
Release,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Cloverfield Lane,tt1179933,movie,10 Cloverfield Lane,10 Cloverfield Lane,0.0,2016.0,,103.0,"Action,Drama,Horror",tt1179933,...,-,-,-,"$72,082,998",3427,"$72,082,998",Mar 11,Paramount Pictures,False,2016.0
100% Wolf,tt8774798,movie,100% Wolf,100% Wolf,0.0,2020.0,,96.0,"Adventure,Animation,Comedy",tt8774798,...,-,-,-,"$514,957",124,"$658,764",Oct 9,Viva Pictures,False,2020.0
12 Strong,tt1413492,movie,12 Strong,12 Strong,0.0,2018.0,,130.0,"Action,Drama,History",tt1413492,...,-,-,-,"$45,819,713",3018,"$45,819,713",Jan 19,Warner Bros.,False,2018.0
13 Hours,tt4172430,movie,13 Hours,13 Hours,0.0,2016.0,,144.0,"Action,Drama,History",tt4172430,...,-,-,-,"$52,853,219",2917,"$52,853,219",Jan 15,Paramount Pictures,False,2016.0
1917,tt8579674,movie,1917,1917,0.0,2019.0,,119.0,"Action,Drama,War",tt8579674,...,-,-,-,"$157,901,466",3987,"$159,227,644",Dec 25,Universal Pictures,False,2020.0


## Back to IMDb to get the names associated with the movie (director and main actors names)

In [76]:
# NOTE(review): this load is disabled, so the table displayed under this cell comes from
# an earlier run. Director ids are presumably taken from title.principals instead (its
# `category` column includes "director") - confirm before deleting this cell.
#df_crew = pd.read_csv('./data/title.crew.tsv.gz', compression='gzip', header=0, sep='\t', na_values='\\N')
#df_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
2,tt0000003,nm0721526,
3,tt0000004,nm0721526,
4,tt0000005,nm0005690,


### Get list of people linked to the movie

In [78]:
# Load the principals table: one row per (title, person) pair, with the person's
# category (actor, director, ...) on that title. `\N` is read as missing.
principals_path = './data/title.principals.tsv.gz'
df_actors = pd.read_csv(
    principals_path,
    sep='\t',
    header=0,
    compression='gzip',
    na_values='\\N',
)
df_actors.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,,"[""Self""]"
1,tt0000001,2,nm0005690,director,,
2,tt0000001,3,nm0374658,cinematographer,director of photography,
3,tt0000002,1,nm0721526,director,,
4,tt0000002,2,nm1335271,composer,,


In [81]:
#df5 = pd.merge(df4,df_crew,how="left",left_on="tconst",right_on="tconst")
# Left-join the principals onto the movies by IMDb title id: every movie is kept and
# is repeated once per person attached to it.
df5 = df4.merge(df_actors, how="left", on="tconst")

In [82]:
# Preview the joined frame: each movie now appears once per principal (nconst) linked to it.
df5.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,titleId,...,Total Gross,Release Date,Distributor,Estimated,Year,ordering_y,nconst,category,job,characters
0,tt1179933,movie,10 Cloverfield Lane,10 Cloverfield Lane,0.0,2016.0,,103,"Action,Drama,Horror",tt1179933,...,"$72,082,998",Mar 11,Paramount Pictures,False,2016.0,10.0,nm6618222,producer,producer,
1,tt1179933,movie,10 Cloverfield Lane,10 Cloverfield Lane,0.0,2016.0,,103,"Action,Drama,Horror",tt1179933,...,"$72,082,998",Mar 11,Paramount Pictures,False,2016.0,1.0,nm0000422,actor,,"[""Howard""]"
2,tt1179933,movie,10 Cloverfield Lane,10 Cloverfield Lane,0.0,2016.0,,103,"Action,Drama,Horror",tt1179933,...,"$72,082,998",Mar 11,Paramount Pictures,False,2016.0,2.0,nm0935541,actress,,"[""Michelle""]"
3,tt1179933,movie,10 Cloverfield Lane,10 Cloverfield Lane,0.0,2016.0,,103,"Action,Drama,Horror",tt1179933,...,"$72,082,998",Mar 11,Paramount Pictures,False,2016.0,3.0,nm0302330,actor,,"[""Emmett""]"
4,tt1179933,movie,10 Cloverfield Lane,10 Cloverfield Lane,0.0,2016.0,,103,"Action,Drama,Horror",tt1179933,...,"$72,082,998",Mar 11,Paramount Pictures,False,2016.0,4.0,nm0341174,actor,,"[""Driver""]"


In [None]:
# NOTE(review): disabled - df5 only has a "directors" column when the title.crew merge
# (currently commented out) is enabled, so this cell cannot run as the notebook stands.
# director field to array
# df5["directors"] = df5["directors"].apply(lambda x:x.split(","))
# # explode directors
# df5 = df5.explode('directors')
# df5.head()

### Retrieve names from the ids

In [83]:
# Load the people table that maps each person id (nconst) to a name (`\N` marks missing).
# quoting=3 is csv.QUOTE_NONE: parse the file as raw tab-separated text so that names
# containing literal double quotes are kept verbatim instead of being interpreted as
# quoted fields.
df_name = pd.read_csv(
    './data/name.basics.tsv.gz',
    compression='gzip',
    header=0,
    sep='\t',
    na_values='\\N',
    quoting=3,
)

In [84]:
# Resolve each person id to a name. The inner join keeps only rows whose nconst
# is present in the people table.
df6 = df5.merge(df_name, how="inner", on="nconst")
df6.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,titleId,...,ordering_y,nconst,category,job,characters,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,tt1179933,movie,10 Cloverfield Lane,10 Cloverfield Lane,0.0,2016.0,,103.0,"Action,Drama,Horror",tt1179933,...,10.0,nm6618222,producer,producer,,Lindsey Weber,,,producer,"tt1179933,tt2660888,tt4530422,tt2548396"
1,tt4530422,movie,Overlord,Overlord,0.0,2018.0,,110.0,"Action,Horror,Sci-Fi",tt4530422,...,9.0,nm6618222,producer,producer,,Lindsey Weber,,,producer,"tt1179933,tt2660888,tt4530422,tt2548396"
2,tt1179933,movie,10 Cloverfield Lane,10 Cloverfield Lane,0.0,2016.0,,103.0,"Action,Drama,Horror",tt1179933,...,1.0,nm0000422,actor,,"[""Howard""]",John Goodman,1952.0,,"actor,soundtrack,producer","tt0101410,tt1024648,tt1179933,tt1907668"
3,tt2406566,movie,Atomic Blonde,Atomic Blonde,0.0,2017.0,,115.0,"Action,Thriller",tt2406566,...,3.0,nm0000422,actor,,"[""Emmett Kurzfeld""]",John Goodman,1952.0,,"actor,soundtrack,producer","tt0101410,tt1024648,tt1179933,tt1907668"
4,tt5968394,movie,Captive State,Captive State,0.0,2019.0,,109.0,"Action,Horror,Sci-Fi",tt5968394,...,1.0,nm0000422,actor,,"[""William Mulligan""]",John Goodman,1952.0,,"actor,soundtrack,producer","tt0101410,tt1024648,tt1179933,tt1907668"


In [85]:
# Write the final movie/person table to disk in the working directory.
# The row index is written too, as the first (unnamed) column.
output_csv = "movie_data_2015_2020.csv"
df6.to_csv(output_csv)