In [89]:
import pandas as pd

# Notebook to create a meaningful movie data set
For now data comes from 2 sources
- the imdb data sets to download [link](https://datasets.imdbws.com/) (not stored on github, to run the notebook -> download relevant data). Contains many info ratings etc but not box-office and in a not convenient way (SQL storage)
- boxoffice stats, for now from [link](https://www.boxofficemojo.com/year/2015/)

**Goal**: for each 2015-2020 US movie get ratings, boxoffice, director names... 

## Preprocessing of some imdb data sets
### Keep relevant years

In [90]:
df1 = pd.read_csv('./data/title.basics.tsv.gz', compression='gzip', header=0, sep='\t', na_values='\\N')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [91]:
# keep only movies and start year >= 2015
df1 = df1[(df1['startYear']>=2015)&(df1['titleType']=="movie")]
df1.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
11059,tt0011216,movie,Spanish Fiesta,La fête espagnole,0.0,2019.0,,67.0,Drama
11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0.0,2019.0,,,"Action,Crime"
61124,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0.0,2020.0,,70.0,Drama
64098,tt0065392,movie,Bucharest Memories,Amintiri bucurestene,0.0,2020.0,,,Documentary
67677,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0.0,2018.0,,122.0,Drama


In [92]:
# get titles
df = pd.read_csv('./data/title.akas.tsv.gz', compression='gzip', header=0, sep='\t', na_values='\\N')
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0.0
1,tt0000001,2,Carmencita,DE,,,literal title,0.0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0.0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0.0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0.0


In [8]:
# Select only US
df2 = df[df["region"]=="US"]

In [93]:
# Merge
df_data = pd.merge(df1,df2,how="inner",left_on="tconst",right_on="titleId")
df_data.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0.0,2020.0,,70.0,Drama,tt0062336,5,The Tango of the Widower and Its Distorting Mi...,US,,imdbDisplay,,0.0
1,tt0065392,movie,Bucharest Memories,Amintiri bucurestene,0.0,2020.0,,,Documentary,tt0065392,3,Bucharest Memories,US,,imdbDisplay,,0.0
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0.0,2018.0,,122.0,Drama,tt0069049,3,The Other Side of the Wind,US,,imdbDisplay,,0.0
3,tt0100275,movie,The Wandering Soap Opera,La telenovela errante,0.0,2017.0,,80.0,"Comedy,Drama,Fantasy",tt0100275,6,The Wandering Soap Opera,US,,imdbDisplay,,0.0
4,tt0112502,movie,Bigfoot,Bigfoot,0.0,2017.0,,,"Horror,Thriller",tt0112502,3,Bigfoot,US,,imdbDisplay,,0.0


## Box-office data from the web

In [94]:
# scrap boxoffice
df_box = pd.read_html("https://www.boxofficemojo.com/year/2015/")[0]
df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0.0
1,tt0000001,2,Carmencita,DE,,,literal title,0.0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0.0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0.0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0.0


In [95]:
# merge boxoffice data
df4 = pd.merge(df_data,df_box,how="inner",left_on="primaryTitle",right_on="Release")
# remove duplicates: only keep first for now
df4 = df4.groupby("Release").agg(lambda x: x.iloc[0])
df4.head()

Unnamed: 0_level_0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,titleId,...,Rank,Genre,Budget,Running Time,Gross,Theaters,Total Gross,Release Date,Distributor,Estimated
Release,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
90 Minutes in Heaven,tt4337690,movie,90 Minutes in Heaven,90 Minutes in Heaven,0.0,2015.0,,121.0,Drama,tt4337690,...,165,-,-,-,"$4,842,699",899,"$4,842,699",Sep 11,The Samuel Goldwyn Company,False
A Walk in the Woods,tt1178665,movie,A Walk in the Woods,A Walk in the Woods,0.0,2015.0,,104.0,"Adventure,Biography,Comedy",tt1178665,...,84,-,-,-,"$29,474,282",2158,"$29,504,281",Sep 2,Broad Green Pictures,False
A la mala,tt4357170,movie,A la mala,A la mala,0.0,2015.0,,99.0,Comedy,tt4357170,...,178,-,-,-,"$3,629,842",384,"$3,629,842",Feb 27,Lionsgate,False
Aloha,tt1243974,movie,Aloha,Aloha,0.0,2015.0,,105.0,"Comedy,Drama,Romance",tt1243974,...,104,-,-,-,"$21,067,116",2815,"$21,067,116",May 29,Sony Pictures Entertainment (SPE),False
Alvin and the Chipmunks: The Road Chip,tt2974918,movie,Alvin and the Chipmunks: The Road Chip,Alvin and the Chipmunks: The Road Chip,0.0,2015.0,,92.0,"Adventure,Animation,Comedy",tt2974918,...,53,-,-,-,"$55,575,427",3705,"$85,886,987",Dec 18,Twentieth Century Fox,False


## Back to IMDB to find director names

### Find film directors with ids

In [96]:
df_crew = pd.read_csv('./data/title.crew.tsv.gz', compression='gzip', header=0, sep='\t', na_values='\\N')
df_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
2,tt0000003,nm0721526,
3,tt0000004,nm0721526,
4,tt0000005,nm0005690,


In [97]:
df5 = pd.merge(df4,df_crew,how="left",left_on="tconst",right_on="tconst")
# director field to array
df5["directors"] = df5["directors"].apply(lambda x:x.split(","))
# explode directors
df5 = df5.explode('directors')
df5.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,titleId,...,Budget,Running Time,Gross,Theaters,Total Gross,Release Date,Distributor,Estimated,directors,writers
0,tt4337690,movie,90 Minutes in Heaven,90 Minutes in Heaven,0.0,2015.0,,121.0,Drama,tt4337690,...,-,-,"$4,842,699",899,"$4,842,699",Sep 11,The Samuel Goldwyn Company,False,nm0689187,"nm0689187,nm2926858,nm7714780"
1,tt1178665,movie,A Walk in the Woods,A Walk in the Woods,0.0,2015.0,,104.0,"Adventure,Biography,Comedy",tt1178665,...,-,-,"$29,474,282",2158,"$29,504,281",Sep 2,Broad Green Pictures,False,nm0477129,"nm1578335,nm2250139,nm0117445"
2,tt4357170,movie,A la mala,A la mala,0.0,2015.0,,99.0,Comedy,tt4357170,...,-,-,"$3,629,842",384,"$3,629,842",Feb 27,Lionsgate,False,nm0950426,"nm0411517,nm7053880"
3,tt1243974,movie,Aloha,Aloha,0.0,2015.0,,105.0,"Comedy,Drama,Romance",tt1243974,...,-,-,"$21,067,116",2815,"$21,067,116",May 29,Sony Pictures Entertainment (SPE),False,nm0001081,nm0001081
4,tt2974918,movie,Alvin and the Chipmunks: The Road Chip,Alvin and the Chipmunks: The Road Chip,0.0,2015.0,,92.0,"Adventure,Animation,Comedy",tt2974918,...,-,-,"$55,575,427",3705,"$85,886,987",Dec 18,Twentieth Century Fox,False,nm0065608,"nm0046564,nm0439739,nm0802020,nm1186373"


### Film directors: from id to names

In [98]:
df_name = pd.read_csv('./data/name.basics.tsv.gz', compression='gzip', header=0, sep='\t', na_values='\\N')

In [99]:
df6 = pd.merge(df5,df_name,how="inner",left_on="directors",right_on="nconst")
df6.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,titleId,...,Distributor,Estimated,directors,writers,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,tt4337690,movie,90 Minutes in Heaven,90 Minutes in Heaven,0.0,2015.0,,121.0,Drama,tt4337690,...,The Samuel Goldwyn Company,False,nm0689187,"nm0689187,nm2926858,nm7714780",nm0689187,Michael Polish,1970.0,,"director,producer,writer","tt0162830,tt0261755,tt0322659,tt1462411"
1,tt1178665,movie,A Walk in the Woods,A Walk in the Woods,0.0,2015.0,,104.0,"Adventure,Biography,Comedy",tt1178665,...,Broad Green Pictures,False,nm0477129,"nm1578335,nm2250139,nm0117445",nm0477129,Ken Kwapis,1957.0,,"director,producer,writer","tt3095080,tt1178665,tt4947608,tt1430615"
2,tt4357170,movie,A la mala,A la mala,0.0,2015.0,,99.0,Comedy,tt4357170,...,Lionsgate,False,nm0950426,"nm0411517,nm7053880",nm0950426,Pitipol Ybarra,,,"director,producer,camera_department","tt6156346,tt1068899,tt9893062,tt4357170"
3,tt1243974,movie,Aloha,Aloha,0.0,2015.0,,105.0,"Comedy,Drama,Romance",tt1243974,...,Sony Pictures Entertainment (SPE),False,nm0001081,nm0001081,nm0001081,Cameron Crowe,1957.0,,"writer,producer,director","tt0116695,tt0105415,tt0181875,tt0259711"
4,tt2974918,movie,Alvin and the Chipmunks: The Road Chip,Alvin and the Chipmunks: The Road Chip,0.0,2015.0,,92.0,"Adventure,Animation,Comedy",tt2974918,...,Twentieth Century Fox,False,nm0065608,"nm0046564,nm0439739,nm0802020,nm1186373",nm0065608,Walt Becker,1968.0,,"director,producer,writer","tt0283111,tt0976238,tt0486946,tt1684734"


In [85]:
df6.to_csv("movie_data_2015.csv")