Before running this notebook, download the datasets from Kaggle using the following commands from the Makefile:
- download-tmdb-movies-dataset
- download-full-tmdb-tv-shows-dataset

You may first need to set up your Kaggle API credentials: see [here](https://github.com/Kaggle/kaggle-api#api-credentials)

Alternatively, you can download the datasets directly from the Kaggle website:
- [tmdb-movies-dataset](https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies)
- [full-tmdb-tv-shows-dataset](https://www.kaggle.com/datasets/asaniczka/full-tmdb-tv-shows-dataset-2023-150k-shows)

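Then put the downloaded zip files into the ./data folder (the notebook reads it as `../data`, relative to the directory it runs from) and change the KAGGLE_TMDB_MOVIES_DATASET_NAME & KAGGLE_TMDB_TVSHOWS_DATASET_NAME values if the file names differ.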

In [6]:
import os
import zipfile
from pathlib import Path

import pandas as pd

In [7]:
# Root folder holding the downloaded archives; "../data" is resolved against the
# current working directory at the time this cell runs.
DATA_DIRECTORY = Path("../data").resolve()

# File names of the Kaggle zip archives expected inside DATA_DIRECTORY.
KAGGLE_TMDB_MOVIES_DATASET_NAME = "tmdb-movies-dataset-2023-930k-movies.zip"
KAGGLE_TMDB_TVSHOWS_DATASET_NAME = "full-tmdb-tv-shows-dataset-2023-150k-shows.zip"

KAGGLE_TMDB_MOVIES_DATASET_PATH = DATA_DIRECTORY / KAGGLE_TMDB_MOVIES_DATASET_NAME
KAGGLE_TMDB_TVSHOWS_DATASET_PATH = DATA_DIRECTORY / KAGGLE_TMDB_TVSHOWS_DATASET_NAME

# Output folders: one per extracted archive, plus one for the exported subsets.
EXTRACT_MOVIES_ZIP_TO = DATA_DIRECTORY / "tmdb_movies"
EXTRACT_TWSHOWS_ZIP_TO = DATA_DIRECTORY / "tmdb_tvshows"
EXPORT_TMDB_SUBSETS_TO = DATA_DIRECTORY / "subsets"

# Create every output folder (and missing parents); existing folders are left alone.
for output_directory in (EXTRACT_MOVIES_ZIP_TO, EXTRACT_TWSHOWS_ZIP_TO, EXPORT_TMDB_SUBSETS_TO):
    output_directory.mkdir(parents=True, exist_ok=True)

# Columns that must be non-null for a movie row to be kept.
MOVIES_COLUMNS_OF_INTEREST = [
    'title',
    'original_title',
    'release_date',
    'production_countries',
    'genres',
    'production_companies',
    'vote_average',
    'revenue',
]
# Maximum number of rows exported in the movies subset.
NB_MOVIES_SUBSET = 5000

In [8]:
# Unpack the Kaggle movies archive and load its CSV into a DataFrame.
with zipfile.ZipFile(KAGGLE_TMDB_MOVIES_DATASET_PATH, 'r') as movies_archive:
    movies_archive.extractall(path=EXTRACT_MOVIES_ZIP_TO)
    # Pick the CSV from the archive's own member list rather than from a directory
    # listing: os.listdir() returns entries in arbitrary order and would also see
    # unrelated files already present in the extraction folder.
    movies_csv_members = sorted(
        member for member in movies_archive.namelist() if member.lower().endswith(".csv")
    )

# Fail with an explicit message instead of an IndexError when the archive has no CSV.
if not movies_csv_members:
    raise FileNotFoundError(f"No CSV file found in {KAGGLE_TMDB_MOVIES_DATASET_PATH}")

df_movies = pd.read_csv(EXTRACT_MOVIES_ZIP_TO / movies_csv_members[0])

In [9]:
# Unpack the Kaggle TV shows archive and load its CSV into a DataFrame.
with zipfile.ZipFile(KAGGLE_TMDB_TVSHOWS_DATASET_PATH, 'r') as tvshows_archive:
    tvshows_archive.extractall(path=EXTRACT_TWSHOWS_ZIP_TO)
    # Pick the CSV from the archive's own member list rather than from a directory
    # listing: os.listdir() returns entries in arbitrary order and would also see
    # unrelated files already present in the extraction folder.
    tvshows_csv_members = sorted(
        member for member in tvshows_archive.namelist() if member.lower().endswith(".csv")
    )

# Fail with an explicit message instead of an IndexError when the archive has no CSV.
if not tvshows_csv_members:
    raise FileNotFoundError(f"No CSV file found in {KAGGLE_TMDB_TVSHOWS_DATASET_PATH}")

df_tvshows = pd.read_csv(EXTRACT_TWSHOWS_ZIP_TO / tvshows_csv_members[0])

In [10]:
# Drop, in place, every row missing a value in any of the columns of interest.
df_movies.dropna(subset=MOVIES_COLUMNS_OF_INTEREST, how='any', axis=0, inplace=True)

# Row filters, built separately so each condition can be inspected on its own.
is_released = df_movies['status'] == 'Released'
# NOTE(review): `~` assumes 'adult' is a boolean column without missing values -- confirm.
is_not_adult = ~df_movies['adult']
# NOTE(review): assumes 'release_date' holds ISO "YYYY-MM-DD" strings, so that the
# string comparison and the sort below follow chronological order -- confirm.
is_before_cutoff = df_movies['release_date'] < '2024-03-01'

# Keep the NB_MOVIES_SUBSET most recent matching movies (newest first).
movies_newest_first = df_movies[is_released & is_not_adult & is_before_cutoff].sort_values(
    by='release_date', ascending=False
)
df_movies_subset = movies_newest_first.head(NB_MOVIES_SUBSET)

# Export the subset; the DataFrame index is written as the first CSV column.
df_movies_subset.to_csv(EXPORT_TMDB_SUBSETS_TO / "tmdb_movies_subset.csv")