In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from helpers import *

import warnings

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
plt.rcdefaults()
%load_ext autoreload
%autoreload 2

In [2]:
DATA_FOLDER = "../Data/"

imdb_basics_raw = pd.read_csv(
    DATA_FOLDER + "IMDb/title.basics.tsv.gz",
    sep="\t",
    compression="gzip",
)
imdb_ratings_raw = pd.read_csv(
    DATA_FOLDER + "IMDb/title.ratings.tsv.gz",
    sep="\t",
    compression="gzip",
)
movies_raw = pd.read_csv(DATA_FOLDER + "MovieSummaries/movie.metadata.tsv", sep="\t")

In [3]:
movies_raw.columns = [
    "wikiID",
    "fbID",
    "name",
    "releaseDate",
    "boxOffice",
    "runtime",
    "languages",
    "countries",
    "genres",
]

imdb_basics_raw.rename(
    columns={
        "primaryTitle": "name",
        "startYear": "releaseYear",
        "runtimeMinutes": "runtime",
    },
    inplace=True
)

In [4]:
movies = movies_raw.copy()
imdb_basics = imdb_basics_raw.copy()
imdb_ratings = imdb_ratings_raw.copy()

In [5]:
# Merge both imdb datasets
imdb = pd.merge(imdb_basics, imdb_ratings, on='tconst', how='left')
imdb.rename(columns={"tconst": "imdbID"}, inplace=True)

In [6]:
imdb["releaseYear"] = pd.to_numeric(imdb["releaseYear"], errors="coerce")

movies["releaseMonth"] = pd.to_datetime(
    movies["releaseDate"],
    errors="coerce",
).dt.month

movies["releaseYear"] = pd.to_datetime(
    movies["releaseDate"], errors="coerce", format="mixed"
).dt.year

movies.drop(columns=["releaseDate"], inplace=True)

In [7]:
# keep only movies and shorts
print(imdb['titleType'].unique())
imdb = imdb[imdb['titleType'].isin(['movie', 'short', 'tvMovie', 'tvShort'])]

['short' 'movie' 'tvShort' 'tvMovie' 'tvSeries' 'tvEpisode' 'tvMiniSeries'
 'tvSpecial' 'video' 'videoGame' 'tvPilot']


In [8]:
# remove data for movies with no name
print(imdb.name.isna().sum())
print(movies.name.isna().sum())
imdb.dropna(subset='name', inplace=True)

4
0


In [9]:
# standardize movie titles
movies['name'] = movies['name'].apply(standardize_str)
imdb['name'] = imdb['name'].apply(standardize_str)

In [10]:
movies.drop_duplicates(subset=["name", "releaseMonth", "releaseYear"], inplace=True)
imdb.drop_duplicates(subset=["name", "releaseYear"], inplace=True)

In [11]:
imdb['runtime'] = pd.to_numeric(imdb['runtime'], errors="coerce")

In [12]:
# merge on runtime (bad)
movies_imdb_bad = pd.merge(
    movies,
    imdb,
    how="left",
    on=["name", "releaseYear", "runtime"],
    suffixes=["Cmu", "Imdb"],
)
print(movies_imdb_bad.shape, movies.shape)
print(f"proportion of IMDb IDs: {movies_imdb_bad.imdbID.notna().sum() / movies_imdb_bad.shape[0]:.3f}")
print(f"proportion of IMDb ratings: {movies_imdb_bad.averageRating.notna().sum() / movies_imdb_bad.shape[0]:.3f}")
print(f"proportion of non-null runtimes: {movies_imdb_bad.runtime.notna().sum() / movies_imdb_bad.shape[0]:.3f}")

(81626, 18) (81626, 10)
proportion of IMDb IDs: 0.381
proportion of IMDb ratings: 0.337
proportion of non-null runtimes: 0.750


In [15]:
movies_imdb = pd.merge(
    movies,
    imdb,
    how="left",
    on=["name", "releaseYear"],
    suffixes=["Cmu", "Imdb"],
)
print(movies_imdb.shape, movies.shape)

(81626, 19) (81626, 10)


In [16]:
# filter out imdb data for movies where runtime doesn't match (with tolerance)
runtime_tol = 5  # allow for 5 minutes of leeway in runtime

# when only one of the runtimes is nan, use the other
movies_imdb['runtimeCmu'] = np.where(
    # if runtimeCmu is nan but runtimeImdb is not, use runtimeImdb else keep runtimeCmu
    pd.isna(movies_imdb['runtimeCmu']) & pd.notna(movies_imdb['runtimeImdb']),
    movies_imdb['runtimeImdb'],
    movies_imdb['runtimeCmu']
)

# identify rows where both runtimes are not NaN and the difference is not within tolerance
mask = pd.notna(movies_imdb['runtimeCmu']) & pd.notna(movies_imdb['runtimeImdb']) & (
        abs(movies_imdb['runtimeCmu'] - movies_imdb['runtimeImdb']) > runtime_tol)

# remove IMDb data when the runtimes are not within tolerance
movies_imdb.loc[mask, ['imdbID', 'titleType', 'originalTitle', 'isAdult', 'endYear', 'genresImdb', 'averageRating',
                       'numVotes']] = np.nan

# remove runtimeImdb column
movies_imdb.drop(['runtimeImdb'], axis=1, inplace=True)
movies_imdb.rename(columns={'runtimeCmu': 'runtime'}, inplace=True)

print(f"proportion of IMDb IDs: {movies_imdb.imdbID.notna().sum() / movies_imdb.shape[0]:.3f}")
print(f"proportion of IMDb ratings: {movies_imdb.averageRating.notna().sum() / movies_imdb.shape[0]:.3f}")
print(f"proportion of non-null runtimes: {movies_imdb.runtime.notna().sum() / movies_imdb.shape[0]:.3f}")

proportion of IMDb IDs: 0.607
proportion of IMDb ratings: 0.539
proportion of non-null runtimes: 0.844


In [17]:
movies_imdb.query("`name`.str.contains('potter')")[['name', 'releaseYear', 'runtime', 'averageRating']]

Unnamed: 0,name,releaseYear,runtime,averageRating
7355,harry potter and the halfblood prince,2009.0,153.0,7.6
10817,harry potter and the goblet of fire,2005.0,156.0,7.7
17918,harry potter and the order of the phoenix,2007.0,137.0,7.5
30262,harry potter and the deathly hallows part 2,2011.0,130.0,8.1
41171,harry potter and the chamber of secrets,2002.0,174.0,
41222,harry potter and the prisoner of azkaban,2004.0,141.0,7.9
44049,miss potter,2006.0,93.0,7.0
46421,private potter,1962.0,89.0,
48798,at the potters wheel,1914.0,11.0,
59521,harry potter and the deathly hallows part i,2010.0,146.0,


In [32]:
# better merge w/ fuzzy matching
from rapidfuzz import process, fuzz
from tqdm import tqdm


# function to find the best match with a given minimum similarity threshold for a given movie title
def get_best_match(row, grouped_df, scorer, threshold=95):
    # filter the imdb dataframe to check only for movies with the same release year
    same_year_movies = grouped_df.get_group(row['releaseYear'])['name'] if (row[
                                                                                'releaseYear'] in grouped_df.groups) else []
    # find the best match for the given movie title
    best_match = process.extractOne(row['name'], same_year_movies, scorer=scorer)
    # return the best match if its similarity is above the threshold
    return best_match[0] if best_match and best_match[1] >= threshold else np.nan


# group the imdb dataframe by releaseYear to speed up the search
grouped_imdb = imdb.groupby('releaseYear')
# enable tqdm pandas integration
tqdm.pandas()
# compute the best match for each title in movies
movies['matched_title'] = movies.progress_apply(get_best_match, args=(grouped_imdb, fuzz.WRatio), axis=1, desc="Finding best match")

# merge on the matched titles and releaseYear
movies_imdb = pd.merge(movies, imdb, left_on=['matched_title', 'releaseYear'], right_on=['name', 'releaseYear'],
                       how='left', suffixes=['Cmu', 'Imdb'])
# drop extra columns
movies_imdb.drop(['matched_title', 'nameImdb'], axis=1, inplace=True)
movies_imdb.rename(columns={'nameCmu': 'name'}, inplace=True)

print(movies_imdb.shape, movies.shape)

100%|██████████| 81626/81626 [11:07<00:00, 122.24it/s]


(81626, 19) (81626, 11)


In [33]:
# filter out imdb data for movies where runtime doesn't match (with tolerance)
runtime_tol = 5  # allow for 5 minutes of leeway in runtime

# when only one of the runtimes is nan, use the other
movies_imdb['runtimeCmu'] = np.where(
    # if runtimeCmu is nan but runtimeImdb is not, use runtimeImdb else keep runtimeCmu
    pd.isna(movies_imdb['runtimeCmu']) & pd.notna(movies_imdb['runtimeImdb']),
    movies_imdb['runtimeImdb'],
    movies_imdb['runtimeCmu']
)

# identify rows where both runtimes are not NaN and the difference is not within tolerance
mask = pd.notna(movies_imdb['runtimeCmu']) & pd.notna(movies_imdb['runtimeImdb']) & (
        abs(movies_imdb['runtimeCmu'] - movies_imdb['runtimeImdb']) > runtime_tol)

# remove IMDb data when the runtimes are not within tolerance
movies_imdb.loc[mask, ['imdbID', 'titleType', 'originalTitle', 'isAdult', 'endYear', 'genresImdb', 'averageRating',
                       'numVotes']] = np.nan

# remove runtimeImdb column
movies_imdb.drop(['runtimeImdb'], axis=1, inplace=True)
movies_imdb.rename(columns={'runtimeCmu': 'runtime'}, inplace=True)

print(f"proportion of IMDb IDs: {movies_imdb.imdbID.notna().sum() / movies_imdb.shape[0]:.3f}")
print(f"proportion of IMDb ratings: {movies_imdb.averageRating.notna().sum() / movies_imdb.shape[0]:.3f}")
print(f"proportion of non-null runtimes: {movies_imdb.runtime.notna().sum() / movies_imdb.shape[0]:.3f}")

movies_imdb.head(10)

proportion of IMDb IDs: 0.621
proportion of IMDb ratings: 0.559
proportion of non-null runtimes: 0.849


Unnamed: 0,wikiID,fbID,name,boxOffice,runtime,languages,countries,genresCmu,releaseMonth,releaseYear,imdbID,titleType,originalTitle,isAdult,endYear,genresImdb,averageRating,numVotes
0,3196793,/m/08yl5d,getting away with murder the jonbenet ramsey m...,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",2.0,2000.0,tt0245916,tvMovie,Getting Away with Murder: The JonBenet Ramsey ...,0.0,\N,Drama,6.0,69.0
1,28463795,/m/0crgdbh,brun bitter,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",,1988.0,tt0094806,movie,Brun bitter,0.0,\N,"Crime,Drama",5.6,40.0
2,9363483,/m/0285_cd,white of the eye,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",,1987.0,tt0094320,movie,White of the Eye,0.0,\N,"Horror,Mystery,Thriller",6.1,2885.0
3,261236,/m/01mrr1,a woman in flames,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",,1983.0,tt0083949,movie,Die flambierte Frau,0.0,\N,Drama,6.0,621.0
4,13696889,/m/03cfc81,the gangsters,,35.0,"{""/m/06ppq"": ""Silent film"", ""/m/02h40lc"": ""Eng...","{""/m/09c7w0"": ""United States of America""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen...",5.0,1913.0,tt0002894,short,The Gangsters,0.0,\N,"Comedy,Short",6.8,16.0
5,18998739,/m/04jcqvw,the sorcerers apprentice,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa""}","{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant...",,2002.0,,,,,,,,
6,10408933,/m/02qc0j7,alexanders ragtime band,3600000.0,106.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/04t36"": ""Musical"", ""/m/01z4y"": ""Comedy"", ...",8.0,1938.0,tt0029852,movie,Alexander's Ragtime Band,0.0,\N,"Drama,Music,Musical",6.8,2264.0
7,9997961,/m/06_y2j7,contigo y aqui,,70.0,"{""/m/06nm1"": ""Spanish Language""}","{""/m/0jgd"": ""Argentina""}","{""/m/04t36"": ""Musical"", ""/m/07s9rl0"": ""Drama"",...",,1974.0,tt0200545,movie,Contigo y aquí,0.0,\N,"Comedy,Drama,Musical",,
8,2345652,/m/075f66,city of the dead,,76.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/03npn"": ""Horror"", ""/m/0fdjb"": ""Supernatur...",,1960.0,tt0053719,movie,The City of the Dead,0.0,\N,"Horror,Mystery,Thriller",6.7,8261.0
9,175026,/m/017n1p,sarah and son,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama"", ""/m/01g6gs"": ""Black-an...",,1930.0,tt0021335,movie,Sarah and Son,0.0,\N,"Drama,Romance",5.4,298.0


In [34]:
movies_imdb.query("`name`.str.contains('harry potter')")[
    ['name', 'releaseYear', 'runtime', 'averageRating']]

Unnamed: 0,name,releaseYear,runtime,averageRating
7355,harry potter and the halfblood prince,2009.0,153.0,7.6
10817,harry potter and the goblet of fire,2005.0,156.0,7.7
17918,harry potter and the order of the phoenix,2007.0,137.0,7.5
30262,harry potter and the deathly hallows part 2,2011.0,130.0,8.1
41171,harry potter and the chamber of secrets,2002.0,174.0,
41222,harry potter and the prisoner of azkaban,2004.0,141.0,7.9
59521,harry potter and the deathly hallows part i,2010.0,146.0,7.7
60922,harry potter and the secret chamberpot of azer...,,16.0,
71364,harry potter and the philosophers stone,2001.0,153.0,


In [35]:
movies_imdb['languages'] = movies_imdb['languages'].apply(parse_dict)
movies_imdb['countries'] = movies_imdb['countries'].apply(parse_dict)
movies_imdb['genresCmu'] = movies_imdb['genresCmu'].apply(parse_dict)

movies_imdb['genresImdb'] = movies_imdb['genresImdb'].apply(
    lambda x: x.split(',') if not (isinstance(x, float) or x == "\\N") else np.nan)

In [36]:
print(movies.shape)
print(movies_imdb.shape)
movies_imdb.head(10)

(81626, 11)
(81626, 18)


Unnamed: 0,wikiID,fbID,name,boxOffice,runtime,languages,countries,genresCmu,releaseMonth,releaseYear,imdbID,titleType,originalTitle,isAdult,endYear,genresImdb,averageRating,numVotes
0,3196793,/m/08yl5d,getting away with murder the jonbenet ramsey m...,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]",2.0,2000.0,tt0245916,tvMovie,Getting Away with Murder: The JonBenet Ramsey ...,0.0,\N,[Drama],6.0,69.0
1,28463795,/m/0crgdbh,brun bitter,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]",,1988.0,tt0094806,movie,Brun bitter,0.0,\N,"[Crime, Drama]",5.6,40.0
2,9363483,/m/0285_cd,white of the eye,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri...",,1987.0,tt0094320,movie,White of the Eye,0.0,\N,"[Horror, Mystery, Thriller]",6.1,2885.0
3,261236,/m/01mrr1,a woman in flames,,106.0,[German Language],[Germany],[Drama],,1983.0,tt0083949,movie,Die flambierte Frau,0.0,\N,[Drama],6.0,621.0
4,13696889,/m/03cfc81,the gangsters,,35.0,"[Silent film, English Language]",[United States of America],"[Short Film, Silent film, Indie, Black-and-whi...",5.0,1913.0,tt0002894,short,The Gangsters,0.0,\N,"[Comedy, Short]",6.8,16.0
5,18998739,/m/04jcqvw,the sorcerers apprentice,,86.0,[English Language],[South Africa],"[Family Film, Fantasy, Adventure, World cinema]",,2002.0,,,,,,,,
6,10408933,/m/02qc0j7,alexanders ragtime band,3600000.0,106.0,[English Language],[United States of America],"[Musical, Comedy, Black-and-white]",8.0,1938.0,tt0029852,movie,Alexander's Ragtime Band,0.0,\N,"[Drama, Music, Musical]",6.8,2264.0
7,9997961,/m/06_y2j7,contigo y aqui,,70.0,[Spanish Language],[Argentina],"[Musical, Drama, Comedy]",,1974.0,tt0200545,movie,Contigo y aquí,0.0,\N,"[Comedy, Drama, Musical]",,
8,2345652,/m/075f66,city of the dead,,76.0,[English Language],[United Kingdom],"[Horror, Supernatural]",,1960.0,tt0053719,movie,The City of the Dead,0.0,\N,"[Horror, Mystery, Thriller]",6.7,8261.0
9,175026,/m/017n1p,sarah and son,,86.0,[English Language],[United States of America],"[Drama, Black-and-white]",,1930.0,tt0021335,movie,Sarah and Son,0.0,\N,"[Drama, Romance]",5.4,298.0


In [13]:
characters_raw = pd.read_csv(DATA_FOLDER + "MovieSummaries/character.metadata.tsv", sep='\t')
oscars_raw = pd.read_csv(DATA_FOLDER + "Oscars/the_oscar_award.csv", sep=',')

In [14]:
characters_raw.columns = [
    "wikiID",
    "fbID",
    "releaseDate",
    "characterName",
    "DOB",
    "gender",
    "height",
    "ethnicity",
    "name",
    "ageAtMovieRelease",
    "fbCharacterMap",
    "fbCharacterID",
    "fbActorID",
]

In [15]:
oscars_raw = oscars_raw.drop(columns=["ceremony", "year_ceremony", "film"])
oscars_raw.rename(columns={"year_film": "releaseYear", "category": "oscarCategory", "winner": "oscarWinner"},
                  inplace=True)

In [16]:
characters = characters_raw.copy()
oscars = oscars_raw.copy()

In [17]:
characters["releaseYear"] = pd.to_datetime(
    characters["releaseDate"],
    errors="coerce",
    yearfirst=True,
).dt.year
characters["releaseMonth"] = pd.to_datetime(
    characters["releaseDate"],
    errors="coerce",
    yearfirst=True,
).dt.month
characters.drop(columns=["releaseDate"], inplace=True)

In [18]:
characters_oscars = pd.merge(characters, oscars, on=["name", 'releaseYear'], how="left")
print(characters_oscars.shape)
characters_oscars.head(10)

(450713, 16)


Unnamed: 0,wikiID,fbID,characterName,DOB,gender,height,ethnicity,name,ageAtMovieRelease,fbCharacterMap,fbCharacterID,fbActorID,releaseYear,releaseMonth,oscarCategory,oscarWinner
0,975900,/m/03vyhn,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4,2001.0,8.0,,
1,975900,/m/03vyhn,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l,2001.0,8.0,,
2,975900,/m/03vyhn,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc,2001.0,8.0,,
3,975900,/m/03vyhn,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,2001.0,8.0,,
4,975900,/m/03vyhn,Commander Helena Braddock,1949-05-26,F,1.727,/m/0x67,Pam Grier,52.0,/m/02vdcfp,/m/0bgchnd,/m/0418ft,2001.0,8.0,,
5,975900,/m/03vyhn,Whitlock,1945-08-02,F,1.753,,Joanna Cassidy,56.0,/m/02vd6kw,/m/0bgchmx,/m/06lj1m,2001.0,8.0,,
6,975900,/m/03vyhn,Big Daddy Mars,,M,,,Richard Cetrone,,/m/0bgchsy,/m/0bgcht0,/m/0bgcht7,2001.0,8.0,,
7,975900,/m/03vyhn,Michael Descanso,1971-03-20,M,1.892,,Liam Waite,30.0,/m/03jqhb0,/m/0bgchs4,/m/0ks8b0,2001.0,8.0,,
8,975900,/m/03vyhn,Uno,,M,,,Duane Davis,,/m/0bgchtj,/m/0bgchtm,/m/03nrwdy,2001.0,8.0,,
9,975900,/m/03vyhn,Dos,,M,,,Lobo Sebastian,,/m/0bgchsc,/m/0bgchsg,/m/0bgchsp,2001.0,8.0,,


In [37]:
budget_raw = pd.read_csv(
    DATA_FOLDER + "Budget/movies_metadata.csv.zip",
    compression="zip",
)

In [38]:
budget = budget_raw[
    [
        "budget",
        "imdb_id",
        "original_title",
        "popularity",
        "revenue",
        "vote_average",
        "vote_count",
        "release_date",
    ]
].copy()

In [39]:
budget.columns = [
    "budget",
    "imdbID",
    "name",
    "popularity",
    "revenue",
    "voteAverage",
    "voteCount",
    "releaseDate",
]

In [40]:
budget.drop(columns=["releaseDate", "name", "revenue", "voteAverage", "voteCount"], inplace=True)

In [41]:
budget["budget"] = pd.to_numeric(budget["budget"], errors="coerce")

In [42]:
budget.head()

Unnamed: 0,budget,imdbID,popularity
0,30000000.0,tt0114709,21.946943
1,65000000.0,tt0113497,17.015539
2,0.0,tt0113228,11.7129
3,16000000.0,tt0114885,3.859495
4,0.0,tt0113041,8.387519


In [43]:
# remove imdbID nan entries to avoid issues when merging
print(budget.imdbID.isnull().sum())
budget.dropna(subset=['imdbID'], inplace=True)
print(budget.shape)

17
(45449, 3)


In [44]:
# remove duplicate imdbID entries
print(budget.imdbID.duplicated().sum())
budget.drop_duplicates(subset='imdbID', inplace=True)
print(budget.shape)

32
(45417, 3)


In [45]:
# replace all zero budget entries to nan
print((budget['budget'] == 0).sum())
budget.loc[budget['budget'] == 0, 'budget'] = np.nan

36538


In [46]:
# check imdbID matches
print(budget['imdbID'].isin(movies_imdb['imdbID']).sum())
# check how many movies we can populate with budget data
budget.loc[budget['imdbID'].isin(movies_imdb['imdbID']), 'budget'].notnull().sum()

19159


5121

In [47]:
budget.head(10)

Unnamed: 0,budget,imdbID,popularity
0,30000000.0,tt0114709,21.946943
1,65000000.0,tt0113497,17.015539
2,,tt0113228,11.7129
3,16000000.0,tt0114885,3.859495
4,,tt0113041,8.387519
5,60000000.0,tt0113277,17.924927
6,58000000.0,tt0114319,6.677277
7,,tt0112302,2.561161
8,35000000.0,tt0114576,5.23158
9,58000000.0,tt0113189,14.686036


In [48]:
movies_imdb = pd.merge(movies_imdb, budget, on='imdbID', how='left')
movies_imdb.shape

(81626, 20)

In [49]:
# save preprocessed dataframe to csv
movies_imdb.to_csv(DATA_FOLDER + "movies_imdb.csv", index=False)

In [50]:
# retrieve dataframe as such
movies_imdb = pd.read_csv(DATA_FOLDER + 'movies_imdb.csv',
                          converters={'languages': parse_list, 'countries': parse_list, 'genresCmu': parse_list,
                                      'genresImdb': parse_list})
movies_imdb.head(10)

Unnamed: 0,wikiID,fbID,name,boxOffice,runtime,languages,countries,genresCmu,releaseMonth,releaseYear,imdbID,titleType,originalTitle,isAdult,endYear,genresImdb,averageRating,numVotes,budget,popularity
0,3196793,/m/08yl5d,getting away with murder the jonbenet ramsey m...,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]",2.0,2000.0,tt0245916,tvMovie,Getting Away with Murder: The JonBenet Ramsey ...,0.0,\N,[Drama],6.0,69.0,,
1,28463795,/m/0crgdbh,brun bitter,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]",,1988.0,tt0094806,movie,Brun bitter,0.0,\N,"[Crime, Drama]",5.6,40.0,,
2,9363483,/m/0285_cd,white of the eye,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri...",,1987.0,tt0094320,movie,White of the Eye,0.0,\N,"[Horror, Mystery, Thriller]",6.1,2885.0,,3.121105
3,261236,/m/01mrr1,a woman in flames,,106.0,[German Language],[Germany],[Drama],,1983.0,tt0083949,movie,Die flambierte Frau,0.0,\N,[Drama],6.0,621.0,,
4,13696889,/m/03cfc81,the gangsters,,35.0,"[Silent film, English Language]",[United States of America],"[Short Film, Silent film, Indie, Black-and-whi...",5.0,1913.0,tt0002894,short,The Gangsters,0.0,\N,"[Comedy, Short]",6.8,16.0,,
5,18998739,/m/04jcqvw,the sorcerers apprentice,,86.0,[English Language],[South Africa],"[Family Film, Fantasy, Adventure, World cinema]",,2002.0,,,,,,,,,,
6,10408933,/m/02qc0j7,alexanders ragtime band,3600000.0,106.0,[English Language],[United States of America],"[Musical, Comedy, Black-and-white]",8.0,1938.0,tt0029852,movie,Alexander's Ragtime Band,0.0,\N,"[Drama, Music, Musical]",6.8,2264.0,2000000.0,0.632261
7,9997961,/m/06_y2j7,contigo y aqui,,70.0,[Spanish Language],[Argentina],"[Musical, Drama, Comedy]",,1974.0,tt0200545,movie,Contigo y aquí,0.0,\N,"[Comedy, Drama, Musical]",,,,
8,2345652,/m/075f66,city of the dead,,76.0,[English Language],[United Kingdom],"[Horror, Supernatural]",,1960.0,tt0053719,movie,The City of the Dead,0.0,\N,"[Horror, Mystery, Thriller]",6.7,8261.0,,1.514972
9,175026,/m/017n1p,sarah and son,,86.0,[English Language],[United States of America],"[Drama, Black-and-white]",,1930.0,tt0021335,movie,Sarah and Son,0.0,\N,"[Drama, Romance]",5.4,298.0,,
