Before running this notebook, make sure to download the datasets from Kaggle using the following commands from the Makefile:
- download-tmdb-movies-dataset
- download-full-tmdb-tv-shows-dataset

You may first need to get your Kaggle API credentials: [here](https://github.com/Kaggle/kaggle-api#api-credentials)

Alternatively, you can download the datasets directly from the Kaggle website:
- [tmdb-movies-dataset](https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies)
- [full-tmdb-tv-shows-dataset](https://www.kaggle.com/datasets/asaniczka/full-tmdb-tv-shows-dataset-2023-150k-shows)

Then put the downloaded zip files into the ./data folder and change the KAGGLE_TMDB_MOVIES_DATASET_PATH & KAGGLE_TMDB_TVSHOWS_DATASET_PATH values if needed.

In [1]:
import os
from zipfile import ZipFile
from pathlib import Path

import pandas as pd

In [2]:
# --- Notebook configuration -------------------------------------------------
# Every path is derived from the ../data folder (relative to the working
# directory) and resolved to an absolute path.
DATA_DIR = Path("../data")

# Zip archives downloaded from Kaggle.
KAGGLE_TMDB_MOVIES_DATASET_PATH = (DATA_DIR / "tmdb-movies-dataset-2023-930k-movies.zip").resolve()
KAGGLE_TMDB_TVSHOWS_DATASET_PATH = (DATA_DIR / "full-tmdb-tv-shows-dataset-2023-150k-shows.zip").resolve()

# Directories the archives are extracted into.
# NOTE: the second name is spelled "TWSHOWS"; later cells reference it under
# that exact spelling, so it is kept as is.
EXTRACT_MOVIES_ZIP_TO = (DATA_DIR / "tmdb_movies").resolve()
EXTRACT_TWSHOWS_ZIP_TO = (DATA_DIR / "tmdb_tvshows").resolve()

# Directory the exported CSV subsets are written to.
EXPORT_TMDB_SUBSETS_TO = (DATA_DIR / "tmdb_subsets").resolve()

# Movies: columns that must all be non-null for a row to be kept, and the
# maximum number of rows in the exported subset.
MOVIES_COLUMNS_OF_INTEREST = [
    'title',
    'original_title',
    'release_date',
    'production_countries',
    'genres',
    'production_companies',
]
NB_MOVIES_SUBSET = 5000

# TV shows: same two settings.
TVSHOWS_COLUMNS_OF_INTEREST = [
    'name',
    'original_name',
    'first_air_date',
    'production_countries',
    'genres',
    'production_companies',
]
NB_TVSHOWS_SUBSET = 5000

In [3]:
# Create the extraction and export directories (and any missing parents).
# exist_ok=True makes the cell safe to re-run.
for output_dir in (EXTRACT_MOVIES_ZIP_TO, EXTRACT_TWSHOWS_ZIP_TO, EXPORT_TMDB_SUBSETS_TO):
    output_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Unpack the Kaggle movies archive and load its CSV into `df_movies`.
#
# The CSV name is read from the archive listing instead of os.listdir():
# os.listdir() returns entries in arbitrary order and would also pick up any
# unrelated file already sitting in the extraction directory (leftovers from a
# previous run, hidden OS files, ...), so `[0]` could load the wrong file.
with ZipFile(KAGGLE_TMDB_MOVIES_DATASET_PATH, 'r') as movies_zip:
    movies_csv_names = [name for name in movies_zip.namelist() if name.lower().endswith('.csv')]
    movies_zip.extractall(path=EXTRACT_MOVIES_ZIP_TO)

# Fail with an explicit message rather than an IndexError when the archive
# does not contain what we expect.
if not movies_csv_names:
    raise FileNotFoundError(f"No CSV file found in archive {KAGGLE_TMDB_MOVIES_DATASET_PATH}")

# Archive member names are relative paths, so they resolve under the
# extraction directory once extractall() has run.
df_movies = pd.read_csv(EXTRACT_MOVIES_ZIP_TO / movies_csv_names[0])

In [13]:
# Unpack the Kaggle TV shows archive and load its CSV into `df_tvshows`.
#
# The CSV name is read from the archive listing instead of os.listdir():
# os.listdir() returns entries in arbitrary order and would also pick up any
# unrelated file already sitting in the extraction directory (leftovers from a
# previous run, hidden OS files, ...), so `[0]` could load the wrong file.
with ZipFile(KAGGLE_TMDB_TVSHOWS_DATASET_PATH, 'r') as tvshows_zip:
    tvshows_csv_names = [name for name in tvshows_zip.namelist() if name.lower().endswith('.csv')]
    tvshows_zip.extractall(path=EXTRACT_TWSHOWS_ZIP_TO)

# Fail with an explicit message rather than an IndexError when the archive
# does not contain what we expect.
if not tvshows_csv_names:
    raise FileNotFoundError(f"No CSV file found in archive {KAGGLE_TMDB_TVSHOWS_DATASET_PATH}")

# Archive member names are relative paths, so they resolve under the
# extraction directory once extractall() has run.
df_tvshows = pd.read_csv(EXTRACT_TWSHOWS_ZIP_TO / tvshows_csv_names[0])

In [26]:
# Check how well (original_title, release_year) works as a key for movies.
# Only released, non-adult movies with a release date are considered.
is_released = df_movies['status'] == 'Released'
is_not_adult = ~df_movies['adult']
has_release_date = df_movies['release_date'].notna()
df = df_movies.loc[is_released & is_not_adult & has_release_date].copy()

# The year is taken as the first four characters of release_date.
df['release_year'] = df['release_date'].map(lambda release_date: release_date[:4]).astype(int)

# Number of non-null ids for each (original_title, release_year) pair.
df_by_title_year = df.groupby(['original_title', 'release_year'])['id'].count()

# Number of pairs carried by more than one row. Note this counts pairs, not
# the individual movies inside those pairs.
nb_duplicates_title_year = len(df_by_title_year[df_by_title_year > 1])
nb_total_movies = len(df)
print(f"{nb_duplicates_title_year} movies are note uniquely identify by original_title & release_year on {nb_total_movies} movies ({(100 * nb_duplicates_title_year / nb_total_movies):.2f}%)")



6497 movies are note uniquely identify by original_title & release_year on 797541 movies (0.81%)


In [6]:
# Build and export the movies subset:
#   1. drop rows with a missing value in any column of interest;
#   2. keep released, non-adult movies whose release_date is before
#      2024-03-01 (compared as text) and whose original language is fr or en;
#   3. keep the NB_MOVIES_SUBSET rows with the latest release_date.
df_movies_subset = (
    df_movies
    .dropna(axis=0, how='any', subset=MOVIES_COLUMNS_OF_INTEREST)
    .loc[lambda movies: (movies['status'] == 'Released')
                        & ~movies['adult']
                        & (movies['release_date'] < '2024-03-01')
                        & movies['original_language'].isin(['fr', 'en'])]
    .sort_values(by='release_date', ascending=False)
    .head(NB_MOVIES_SUBSET)
)

# The DataFrame index is written as the first CSV column (to_csv default).
df_movies_subset.to_csv(EXPORT_TMDB_SUBSETS_TO / "tmdb_movies_subset.csv")

In [7]:
# Build and export the TV shows subset:
#   1. drop rows with a missing value in any column of interest;
#   2. keep ended, non-adult shows whose last_air_date is before 2024-03-01
#      (compared as text) and whose original language is fr or en;
#   3. keep the NB_TVSHOWS_SUBSET rows with the latest last_air_date.
df_tvshows_subset = (
    df_tvshows
    .dropna(axis=0, how='any', subset=TVSHOWS_COLUMNS_OF_INTEREST)
    .loc[lambda shows: (shows['status'] == 'Ended')
                       & ~shows['adult']
                       & (shows['last_air_date'] < '2024-03-01')
                       & shows['original_language'].isin(['fr', 'en'])]
    .sort_values(by='last_air_date', ascending=False)
    .head(NB_TVSHOWS_SUBSET)
)

# The DataFrame index is written as the first CSV column (to_csv default).
df_tvshows_subset.to_csv(EXPORT_TMDB_SUBSETS_TO / "tmdb_tvshows_subset.csv")