# Filtering of IMDB dataset

In this notebook, we filter the [IMDB Non-Commercial Datasets](https://developer.imdb.com/non-commercial-datasets/) to remove all the titles that are not movies (we keep only the `movie` and `tvMovie` title types) and all the people that did not work on any movie.\
Since the complete dataset does not fit in memory, after processing each part we deallocate it before importing new data.

In [1]:
import csv

import pandas as pd

# Directory (relative to the notebook's working directory) that the IMDb TSV files
# are read from and that the filtered "*.onlymovies.tsv" files are written to.
DATA_PATH = "./../../Data/"

In [2]:
# Filter out non-movie entries from "title.basics.tsv"

# Read options for the raw IMDb dump:
# - quoting=csv.QUOTE_NONE: IMDb fields are not quoted, so a title that starts with '"'
#   must not be parsed as the start of a quoted field (with default quoting such a field
#   can swallow the following tabs/newlines and merge several rows into one).
# - keep_default_na=False: only "\N" marks a missing value; with the pandas defaults,
#   literal titles such as "NA", "N/A" or "null" would be read as NaN and written out empty.
title_basics = pd.read_csv(
    DATA_PATH + "title.basics.tsv",
    sep='\t',
    na_values=["\\N"],
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
)
# Keep only rows whose titleType is "movie" or "tvMovie". Missing values are written as
# empty fields and values containing '"' are quoted, so the output must be re-read with
# the default pandas quoting.
title_basics[title_basics["titleType"].isin(["movie", "tvMovie"])].to_csv(
    DATA_PATH + "title.basics.onlymovies.tsv", sep='\t', index=False
)

In [51]:
# Define a set of imdbIDs of movies to use to filter the rest of the data files
# Filter "title.crew.tsv"

# Free the full title table before loading anything else.
del title_basics
# Only the ID column is needed to build the lookup set, so do not load the other columns.
# This file was written by pandas, hence the default quoting is the right one here.
title_basics = pd.read_csv(DATA_PATH + "title.basics.onlymovies.tsv", sep='\t', usecols=["tconst"])
titles = set(title_basics["tconst"])
del title_basics

# Raw IMDb file: fields are not quoted and only "\N" marks a missing value, so disable
# quote handling and the default pandas NA strings.
title_crew = pd.read_csv(
    DATA_PATH + "title.crew.tsv",
    sep='\t',
    na_values=["\\N"],
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
)
# Keep only the crew rows that belong to a retained movie.
title_crew[title_crew["tconst"].isin(titles)].to_csv(
    DATA_PATH + "title.crew.onlymovies.tsv", sep='\t', index=False
)

In [52]:
# Start defining a set of imdbIDs of people to use to filter "name.basics.tsv"
# Filter "title.principals.tsv"

# Free the full crew table before loading anything else.
del title_crew
# Only the two credit columns are needed. Missing values were written as empty fields
# and are read back as NaN by the default NA handling.
title_crew = pd.read_csv(
    DATA_PATH + "title.crew.onlymovies.tsv",
    sep='\t',
    na_values="\\N",
    usecols=["directors", "writers"],
)
# "directors" and "writers" each hold a comma-separated list of nconsts
# (e.g. "nm0000001,nm0000002"). Split the lists into individual IDs; adding the raw
# cell strings would never match a single nconst and would silently drop every
# co-director / co-writer from the people filter.
people = set()
for credit_column in ("directors", "writers"):
    credited_ids = title_crew[credit_column].dropna().astype(str).str.split(",").explode()
    people.update(credited_ids)
del title_crew

# Raw IMDb file: fields are not quoted and only "\N" marks a missing value, so disable
# quote handling and the default pandas NA strings.
title_principals = pd.read_csv(
    DATA_PATH + "title.principals.tsv",
    sep='\t',
    na_values=["\\N"],
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
)
# Keep only the principal-cast rows that belong to a retained movie.
title_principals[title_principals["tconst"].isin(titles)].to_csv(
    DATA_PATH + "title.principals.onlymovies.tsv", sep='\t', index=False
)

In [56]:
# Finish defining a set of imdbIDs of people to use to filter "name.basics.tsv"
# Filter "title.ratings.tsv"

# Free the full principals table before loading anything else.
del title_principals
# Only the person ID column is needed here, so do not load the other columns.
# This file was written by pandas, hence the default quoting is the right one here.
title_principals = pd.read_csv(
    DATA_PATH + "title.principals.onlymovies.tsv",
    sep='\t',
    na_values="\\N",
    usecols=["nconst"],
)
people = people.union(set(title_principals["nconst"]))
del title_principals

# Raw IMDb file: read with the same options as the other raw files (no quote handling,
# only "\N" treated as missing).
title_ratings = pd.read_csv(
    DATA_PATH + "title.ratings.tsv",
    sep='\t',
    na_values=["\\N"],
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
)
# Keep only the ratings that belong to a retained movie.
title_ratings[title_ratings["tconst"].isin(titles)].to_csv(
    DATA_PATH + "title.ratings.onlymovies.tsv", sep='\t', index=False
)

In [58]:
# Filter "name.basics.tsv"

# Free the ratings table before loading the (large) people table.
del title_ratings

# Raw IMDb file:
# - quoting=csv.QUOTE_NONE: fields are not quoted, so a name that starts with '"' must be
#   kept verbatim instead of being parsed as a quoted field.
# - keep_default_na=False: only "\N" marks a missing value; with the pandas defaults,
#   literal values such as "NA" or "null" would be read as NaN and written out empty.
name_basics = pd.read_csv(
    DATA_PATH + "name.basics.tsv",
    sep='\t',
    na_values=["\\N"],
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
)
# Keep only the people collected in `people` (credited on at least one retained movie).
name_basics[name_basics["nconst"].isin(people)].to_csv(
    DATA_PATH + "name.basics.onlymovies.tsv", sep='\t', index=False
)

In [3]:
# Filter "title.akas.tsv"

# Free the people table before loading the (large) alternative-titles table.
del name_basics

# Raw IMDb file:
# - quoting=csv.QUOTE_NONE: fields are not quoted, so a title that starts with '"' must
#   not be parsed as the start of a quoted field (which can merge several rows into one).
# - keep_default_na=False: only "\N" marks a missing value; with the pandas defaults,
#   literal values such as the region code "NA" (Namibia) or a title "null" would be
#   read as NaN and written out empty.
title_akas = pd.read_csv(
    DATA_PATH + "title.akas.tsv",
    sep='\t',
    na_values=["\\N"],
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
)
# Keep only the alternative titles that belong to a retained movie.
title_akas[title_akas["titleId"].isin(titles)].to_csv(
    DATA_PATH + "title.akas.onlymovies.tsv", sep='\t', index=False
)