In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

warnings.filterwarnings("ignore")

In [2]:
DATA_FOLDER = "../Data/"

imdb_basics_raw = pd.read_csv(
    DATA_FOLDER + "IMDb/title.basics.tsv.gz",
    sep="\t",
    compression="gzip",
)
imdb_ratings_raw = pd.read_csv(
    DATA_FOLDER + "IMDb/title.ratings.tsv.gz",
    sep="\t",
    compression="gzip",
)
movies_raw = pd.read_csv(DATA_FOLDER + "MovieSummaries/movie.metadata.tsv", sep="\t")

In [3]:
movies_raw.columns = [
    "wikiID",
    "fbID",
    "name",
    "releaseDate",
    "boxOffice",
    "runtime",
    "languages",
    "countries",
    "genres",
]

imdb_basics_raw.rename(
    columns={
        "primaryTitle": "name",
        "startYear": "releaseYear",
        "runtimeMinutes": "runtime",
    },
    inplace=True
)

In [4]:
movies = movies_raw.copy()
imdb_basics = imdb_basics_raw.copy()
imdb_ratings = imdb_ratings_raw.copy()

In [5]:
# Merge both imdb datasets
imdb = pd.merge(imdb_basics, imdb_ratings, on='tconst', how='left')

In [6]:
imdb.head()

Unnamed: 0,tconst,titleType,name,originalTitle,isAdult,releaseYear,endYear,runtime,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,2004.0
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",5.8,269.0
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance",6.5,1900.0
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short",5.5,178.0
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",6.2,2685.0


In [7]:
imdb["releaseYear"] = pd.to_numeric(imdb["releaseYear"], errors="coerce")

movies["releaseMonth"] = pd.to_datetime(
    movies["releaseDate"],
    errors="coerce",
).dt.month

movies["releaseYear"] = pd.to_datetime(
    movies["releaseDate"], errors="coerce", format="mixed"
).dt.year

movies.drop(columns=["releaseDate"], inplace=True)

In [8]:
movies.drop_duplicates(subset=["name", "releaseMonth", "releaseYear"], inplace=True)
imdb.drop_duplicates(subset=["name", "releaseYear"], inplace=True)

In [9]:
imdb.runtime = pd.to_numeric(imdb.runtime, errors="coerce")

In [10]:
movies_imdb = pd.merge(movies, imdb, how="left", on=["name", "releaseYear", "runtime"], suffixes=["Cmu", "Imdb"])

In [11]:
# Function to extract genres from the string representation of dictionary-like data
def parse_dict(s):
    try:
        genres_dict = eval(s.replace('null', 'None'))
        return list(genres_dict.values())
    except:
        return None

movies_imdb['languages'] = movies_imdb['languages'].apply(parse_dict)
movies_imdb['countries'] = movies_imdb['countries'].apply(parse_dict)
movies_imdb['genresCmu'] = movies_imdb['genresCmu'].apply(parse_dict)

movies_imdb['genresImdb'] = movies_imdb['genresImdb'].apply(lambda x: x.split(',') if not isinstance(x, float) else np.nan)
movies_imdb.head(10)

Unnamed: 0,wikiID,fbID,name,boxOffice,runtime,languages,countries,genresCmu,releaseMonth,releaseYear,tconst,titleType,originalTitle,isAdult,endYear,genresImdb,averageRating,numVotes
0,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]",2.0,2000.0,,,,,,,,
1,28463795,/m/0crgdbh,Brun bitter,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]",,1988.0,tt0094806,movie,Brun bitter,0.0,\N,"[Crime, Drama]",5.6,40.0
2,9363483,/m/0285_cd,White Of The Eye,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri...",,1987.0,,,,,,,,
3,261236,/m/01mrr1,A Woman in Flames,,106.0,[German Language],[Germany],[Drama],,1983.0,tt0083949,movie,Die flambierte Frau,0.0,\N,[Drama],6.0,621.0
4,13696889,/m/03cfc81,The Gangsters,,35.0,"[Silent film, English Language]",[United States of America],"[Short Film, Silent film, Indie, Black-and-whi...",5.0,1913.0,tt0002894,short,The Gangsters,0.0,\N,"[Comedy, Short]",6.8,16.0
5,18998739,/m/04jcqvw,The Sorcerer's Apprentice,,86.0,[English Language],[South Africa],"[Family Film, Fantasy, Adventure, World cinema]",,2002.0,,,,,,,,
6,10408933,/m/02qc0j7,Alexander's Ragtime Band,3600000.0,106.0,[English Language],[United States of America],"[Musical, Comedy, Black-and-white]",8.0,1938.0,tt0029852,movie,Alexander's Ragtime Band,0.0,\N,"[Drama, Music, Musical]",6.8,2264.0
7,9997961,/m/06_y2j7,Contigo y aquí,,,[Spanish Language],[Argentina],"[Musical, Drama, Comedy]",,1974.0,,,,,,,,
8,2345652,/m/075f66,City of the Dead,,76.0,[English Language],[United Kingdom],"[Horror, Supernatural]",,1960.0,,,,,,,,
9,175026,/m/017n1p,Sarah and Son,,86.0,[English Language],[United States of America],"[Drama, Black-and-white]",,1930.0,tt0021335,movie,Sarah and Son,0.0,\N,"[Drama, Romance]",5.4,298.0


In [12]:
print(movies.shape)
print(movies_imdb.shape)
movies_imdb.head()

(81630, 10)
(81630, 18)


Unnamed: 0,wikiID,fbID,name,boxOffice,runtime,languages,countries,genresCmu,releaseMonth,releaseYear,tconst,titleType,originalTitle,isAdult,endYear,genresImdb,averageRating,numVotes
0,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]",2.0,2000.0,,,,,,,,
1,28463795,/m/0crgdbh,Brun bitter,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]",,1988.0,tt0094806,movie,Brun bitter,0.0,\N,"[Crime, Drama]",5.6,40.0
2,9363483,/m/0285_cd,White Of The Eye,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri...",,1987.0,,,,,,,,
3,261236,/m/01mrr1,A Woman in Flames,,106.0,[German Language],[Germany],[Drama],,1983.0,tt0083949,movie,Die flambierte Frau,0.0,\N,[Drama],6.0,621.0
4,13696889,/m/03cfc81,The Gangsters,,35.0,"[Silent film, English Language]",[United States of America],"[Short Film, Silent film, Indie, Black-and-whi...",5.0,1913.0,tt0002894,short,The Gangsters,0.0,\N,"[Comedy, Short]",6.8,16.0


In [13]:
movies_imdb.to_csv(DATA_FOLDER + "movies_imdb.tsv", sep='\t', index=False)