In [1]:
import pandas as pd
import numpy as np

from ada_config.config import CONFIG

# Analyzing the IMDB Datasets

In order to merge the IMDB non-commercial datasets with our CMU and TMDB datasets at hand, we first need to bring the IMDB datasets into a cleaner format. Thus, our first step will be to analyze two separate IMDB datasets and select records from them that we are interested in.

**1. IMDB Basics Dataset (title.basics.tsv)**\
This dataset incorporates many useful aspects about movies, TV series, and other media recorded on the official IMDB website. We will only filter out the movies since our analysis involves movies only.

In [2]:
imdb_basics_df = pd.read_csv(CONFIG["imdb_path"] / "title.basics.tsv", sep="\t", header=0, na_values="\\N")

  imdb_basics_df = pd.read_csv(CONFIG["imdb_path"] / "title.basics.tsv", sep="\t", header=0, na_values="\\N")


In [3]:
imdb_basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1.0,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5.0,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0.0,1892.0,,5.0,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12.0,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1.0,"Comedy,Short"


In [4]:
# Check the number of entries per each category such as short film, movie, TV series
imdb_basics_df["titleType"].value_counts()

titleType
tvEpisode       8668387
short           1031702
movie            699425
video            301442
tvSeries         273556
tvMovie          149061
tvMiniSeries      58248
tvSpecial         50476
videoGame         40801
tvShort           10475
tvPilot               1
Name: count, dtype: int64

In [5]:
# Drop the endYear column since it is only useful for TV-series type of entries
imdb_basics_df.drop(columns=["endYear"], inplace=True)

In [6]:
# Filter out movies
filter_values = ["short", "movie", "tvMovie"]
movie_basics_df = imdb_basics_df[imdb_basics_df["titleType"].isin(filter_values)].copy(deep=True)

movie_basics_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,1.0,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,5.0,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0.0,1892.0,5.0,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,12.0,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,1.0,"Comedy,Short"
...,...,...,...,...,...,...,...,...
11283514,tt9916730,movie,6 Gunn,6 Gunn,0.0,2017.0,116,Drama
11283524,tt9916754,movie,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,0.0,2013.0,49,Documentary
11283525,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0.0,2019.0,,Short
11283529,tt9916764,short,38,38,0.0,2018.0,,Short


In [7]:
# Display the percentage of NaN values in each column
movie_basics_df.isnull().sum() / len(movie_basics_df) * 100

tconst             0.000000
titleType          0.000000
primaryTitle       0.000319
originalTitle      0.000319
isAdult            0.000000
startYear          7.699390
runtimeMinutes    35.933109
genres             4.725964
dtype: float64

In [8]:
# Map the values under isAdult column from {0, 1} to {False, True}
movie_basics_df["isAdult"] = movie_basics_df["isAdult"].apply(lambda x: False if x == 0 else True)

# Rename the columns in a format more suitable to the ones in CMU and TMDB datasets
movie_basics_df.rename(columns={"startYear": "imdb_year"}, inplace=True)

movie_basics_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,imdb_year,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,False,1894.0,1.0,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,False,1892.0,5.0,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,False,1892.0,5.0,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,False,1892.0,12.0,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,False,1893.0,1.0,"Comedy,Short"
...,...,...,...,...,...,...,...,...
11283514,tt9916730,movie,6 Gunn,6 Gunn,False,2017.0,116,Drama
11283524,tt9916754,movie,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,False,2013.0,49,Documentary
11283525,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,False,2019.0,,Short
11283529,tt9916764,short,38,38,False,2018.0,,Short


**2. IMDB Ratings Dataset (title.ratings.tsv)** \
This dataset includes information about the rating an IMDB entry has and the number of voters that have rated in this process.

In [9]:
imdb_ratings_df = pd.read_csv(CONFIG["imdb_path"] / "title.ratings.tsv", sep="\t", header=0)

In [12]:
imdb_ratings_df

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2108
1,tt0000002,5.6,283
2,tt0000003,6.5,2131
3,tt0000004,5.4,182
4,tt0000005,6.2,2860
...,...,...,...
1507417,tt9916730,7.0,12
1507418,tt9916766,7.1,24
1507419,tt9916778,7.2,37
1507420,tt9916840,6.9,11


In [10]:
# Display the percentage of NaN values in each column
imdb_ratings_df.isnull().sum() / len(imdb_ratings_df) * 100

tconst           0.0
averageRating    0.0
numVotes         0.0
dtype: float64

Now, since our main purpose of cleaning the IMDB datasets is to enrich those columns in CMU and TMDB datasets under which there are a large number of NaN values, and since the **averageRating** and **numVotes** are the two columns with highest proportion of NaN values in TMDB dataset, we are going to perform a right-outer join between two dataframes, namely *movie_basics_df* and *imdb_ratings_df*. 

In [14]:
merged_imdb_df = pd.merge(movie_basics_df, imdb_ratings_df, how="right", on="tconst")

merged_imdb_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,imdb_year,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,Carmencita,False,1894.0,1.0,"Documentary,Short",5.7,2108
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,False,1892.0,5.0,"Animation,Short",5.6,283
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,False,1892.0,5.0,"Animation,Comedy,Romance",6.5,2131
3,tt0000004,short,Un bon bock,Un bon bock,False,1892.0,12.0,"Animation,Short",5.4,182
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,False,1893.0,1.0,"Comedy,Short",6.2,2860
...,...,...,...,...,...,...,...,...,...,...
1507417,tt9916730,movie,6 Gunn,6 Gunn,False,2017.0,116,Drama,7.0,12
1507418,tt9916766,,,,,,,,7.1,24
1507419,tt9916778,,,,,,,,7.2,37
1507420,tt9916840,,,,,,,,6.9,11


In [15]:
merged_imdb_df.to_csv(CONFIG["imdb_path"] / "imdb_movies_data.csv", index=False)