In [None]:
# first, we need to download the original data from grouplens
!wget https://files.grouplens.org/datasets/movielens/ml-20m.zip

# next, we unzip the downloaded data
!unzip ml-20m.zip

# then, we download the video links from grouplens
!wget https://files.grouplens.org/datasets/movielens/ml-20m-youtube.zip

# and unzip them as well
!unzip ml-20m-youtube.zip


--2026-01-24 17:19:59--  https://files.grouplens.org/datasets/movielens/ml-20m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.96.204
Connecting to files.grouplens.org (files.grouplens.org)|128.101.96.204|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 198702078 (189M) [application/zip]
Saving to: ‘ml-20m.zip’


2026-01-24 17:20:11 (16.9 MB/s) - ‘ml-20m.zip’ saved [198702078/198702078]



In [9]:
# now we load the ratings, the metadata, and the links for the items in ML-20M
import pandas as pd 

# the three tables are read in one pass, in this order:
# user ratings, movieId -> imdbId/tmdbId links, movieId -> YouTube video ids
ratings, links, videos = (
    pd.read_csv(csv_path)
    for csv_path in ('ml-20m/ratings.csv', 'ml-20m/links.csv', 'ml-youtube.csv')
)

In [10]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [11]:
links.head(5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [12]:
videos.head(5)

Unnamed: 0,youtubeId,movieId,title
0,K26_sDKnvMU,1,Toy Story (1995)
1,3LPANjHlPxo,2,Jumanji (1995)
2,rEnOoWs3FuA,3,Grumpier Old Men (1995)
3,j9xml1CxgXI,4,Waiting to Exhale (1995)
4,ltwvKLnj1B4,5,Father of the Bride Part II (1995)


In [13]:
# build a single metadata dataframe: every video row is enriched with the
# IMDb / TMDB identifiers of its movie, matched on movieId
meta = (
    videos
    .set_index('movieId')
    .join(links.set_index('movieId'), on='movieId')
    .reset_index()
)

In [14]:
meta.head(5)

Unnamed: 0,movieId,youtubeId,title,imdbId,tmdbId
0,1,K26_sDKnvMU,Toy Story (1995),114709,862.0
1,2,3LPANjHlPxo,Jumanji (1995),113497,8844.0
2,3,rEnOoWs3FuA,Grumpier Old Men (1995),113228,15602.0
3,4,j9xml1CxgXI,Waiting to Exhale (1995),114885,31357.0
4,5,ltwvKLnj1B4,Father of the Bride Part II (1995),113041,11862.0


In [16]:
# some movies have no TMDB ID
meta[meta['tmdbId'].isna()]

Unnamed: 0,movieId,youtubeId,title,imdbId,tmdbId
140,142,l25067DO9mo,Shadows (Cienie) (1988),94878,
699,720,1neY_Zh5mIs,Wallace & Gromit: The Best of Aardman Animatio...,118114,
708,730,7AG6yN9JP5g,Low Life (1994),125877,
744,770,cwH5h4htOFM,Costa Brava (1946),38426,
787,821,fVlKT6UT81c,"Crude Oasis, The (1995)",112746,
...,...,...,...,...,...
22813,114963,vuld5yCpynU,Fara (1999),322250,
22824,115038,19WqFL5YREg,"Silent One, The (1985)",88118,
22871,115254,_zNQn9wbyfI,Charlie Chan Carries On (1931),21733,
22960,115715,VfQ8f3FtpsQ,Learning to Ride (2014),3670792,


## Step 1: TMDB query

In [22]:
# useful imports

import requests
from dotenv import load_dotenv
import json
import time
from tqdm import tqdm
import pandas as pd 
import os
import random as rand

In [None]:
# the TMDB API key is kept out of the notebook: it is stored in a .env file
# containing the line
# TMDB_API_KEY=<your_api>

load_dotenv()

# --- CONFIGURATION ---
# key read from the environment (None when the variable is not set)
TMDB_API_KEY = os.environ.get("TMDB_API_KEY")
# prefix of the movie endpoints; the TMDB id is appended to it
TMDB_BASE_URL = "https://api.themoviedb.org/3/movie/"
# prefix prepended to a poster path to obtain the image URL
IMAGE_BASE_URL = "https://image.tmdb.org/t/p/original"

In [20]:
# function we use to collect data from TMDB querying
def get_movie_data_from_tmdb(tmdb_id: int, timeout: float = 10.0):
    """Fetch basic movie info, poster URL and a trailer/video URL from TMDb.

    Two requests are sent: one for the movie details and one for its videos.

    Args:
        tmdb_id: TMDb identifier of the movie.
        timeout: seconds `requests` waits on each call before raising, so a
            stalled connection cannot block a long crawl forever.

    Returns:
        A dict with the keys "title", "overview", "poster" and "trailer".
        "poster" is IMAGE_BASE_URL followed by the poster path, or None when
        the details carry no poster path. "trailer" is the URL built from the
        first usable trailer/teaser/clip entry, or None when there is none.
    """
    query = {"api_key": TMDB_API_KEY, "language": "en-US"}

    # Get movie details
    details = requests.get(
        f"{TMDB_BASE_URL}{tmdb_id}", params=query, timeout=timeout
    ).json()

    # Get videos (trailers, teasers, clips, etc.)
    videos = requests.get(
        f"{TMDB_BASE_URL}{tmdb_id}/videos", params=query, timeout=timeout
    ).json()

    # URL patterns of the platforms we know how to link to
    url_templates = {
        "youtube": "https://www.youtube.com/watch?v={}",
        "vimeo": "https://vimeo.com/{}",
        "dailymotion": "https://www.dailymotion.com/video/{}",
    }

    trailer_url = None
    for v in videos.get("results") or []:
        # `or ""` also covers fields that are present but null, which would
        # otherwise make .lower() fail
        site = (v.get("site") or "").lower()
        video_type = (v.get("type") or "").lower()
        key = v.get("key")

        if not key or video_type not in ("trailer", "teaser", "clip"):
            continue

        if site in url_templates:
            trailer_url = url_templates[site].format(key)
        elif key.startswith("http"):
            # fallback: unknown site, but the key is already a full URL
            trailer_url = key

        if trailer_url:
            break  # take the first available valid video

    poster_path = details.get("poster_path")
    return {
        "title": details.get("title"),
        "overview": details.get("overview"),
        "poster": IMAGE_BASE_URL + poster_path if poster_path else None,
        "trailer": trailer_url
    }


In [None]:
# loop: for each movie, we query TMDB and get:
# - movie plot (plain text)
# - movie poster URL
# - movie trailer URL
# the result of each query is stored as JSON in tmdb_data/<movieId>.txt

# directory in which we will store all the data
os.makedirs('tmdb_data', exist_ok=True)

# loop start
for i, row in tqdm(meta.iterrows(), total=len(meta)):

    movie_id = row['movieId']

    # optional resume logic (disabled in this tutorial): skip the movies whose
    # file was already written by a previous run
    # if os.path.exists(f'tmdb_data/{movie_id}.txt'):

    #     try:
    #         movie_dict = json.load(open(f'tmdb_data/{movie_id}.txt', 'r', encoding='utf-8'))
    #         if 'overview' in movie_dict:
    #             continue
    #     except:
    #         print(f'retry {movie_id}')

    try:

        # query TMDB and try getting info
        # (a missing TMDB ID is NaN: int() raises and the movie is logged and
        #  skipped by the except branch, like any failed request)
        tmdb_id = int(row['tmdbId'])
        movie_info = get_movie_data_from_tmdb(tmdb_id)

    except Exception as e:
        # nothing is written for this movie: log the error and move on
        print(f'{movie_id}:\t{str(e)}')
        continue

    # keep both identifiers next to the collected data
    movie_info['movieId'] = movie_id
    movie_info['tmdbId'] = tmdb_id

    # after querying, save info
    with open(f'tmdb_data/{movie_id}.txt', 'w', encoding='utf-8') as fout:
        json.dump(movie_info, fout)

    # be polite with the API: wait 1s + random ms before the next query
    time.sleep(1 + rand.random())

    # we break the loop for this tutorial
    break

  0%|          | 0/25623 [00:02<?, ?it/s]


In [24]:
# let's see what has been downloaded
print(os.listdir('tmdb_data/'))

['1.txt']


In [26]:
# let's load 1.txt
with open('tmdb_data/1.txt', 'r') as fin:
    movie_data = json.load(fin)

In [27]:
movie_data

{'title': 'Toy Story',
 'overview': "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
 'poster': 'https://image.tmdb.org/t/p/original/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg',
 'trailer': 'https://www.youtube.com/watch?v=CxwTLktovTU',
 'movieId': 1,
 'tmdbId': 862}

In [None]:
# now we have, for Toy Story:
# - plot saved in overview
# - movie poster URL
# - movie trailer URL

# let's download this raw data (poster and trailer)

## Step 2: Download multimodal raw data

In [43]:
# function used to download movie posters
def download_poster(item_id, url, destination_folder, timeout=30):
    """Download the image at `url` and save it as a poster file.

    The file is written to `<destination_folder>/<item_id><ext>`, where
    `<ext>` is `.png` or `.jpg` when the URL path ends with that extension
    and empty otherwise.

    Args:
        item_id: identifier used as the file name.
        url: address of the poster image.
        destination_folder: existing directory in which the file is written.
        timeout: seconds `requests` waits before raising.

    Raises:
        requests.exceptions.HTTPError: when the server answers with an error
            status; in that case no file is written, so an error page is
            never stored as if it were an image.
    """
    # look at the end of the URL path only: drop query string and fragment,
    # ignore case
    url_path = url.split('?', 1)[0].split('#', 1)[0].lower()

    if url_path.endswith('.png'):
        extension = '.png'
    elif url_path.endswith('.jpg'):
        extension = '.jpg'
    else:
        extension = ''

    filename = f'{destination_folder}/{item_id}{extension}'

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    with open(filename, "wb") as file:
        file.write(response.content)

# # function used to creat a reasonable delay
# def wait_delay():
#     base = rand.uniform(5-8)      # 5-8 seconds
#     jitter = rand.uniform(-0.5, 0.5) # little jitter
#     time.sleep(base + jitter)

In [44]:
# loop over all downloaded TMDB data
# (only 1 in this tutorial)

# get all downloaded files with movie information
json_files = [f'tmdb_data/{x}' for x in os.listdir('tmdb_data/') if x.endswith('.txt')]

# posters will be saved in the 'poster/' directory
os.makedirs('poster', exist_ok=True)

# loop start
for json_file in tqdm(json_files, total=len(json_files), dynamic_ncols=True):

    # get dict (the context manager closes the file right after parsing)
    with open(json_file, 'r') as fin:
        movie_data = json.load(fin)

    # skip movies without a poster URL
    poster_url = movie_data.get('poster')
    if poster_url is None:
        continue

    movie_id = int(movie_data['movieId'])

    # skip existing files (same names download_poster can produce)
    if any(os.path.exists(f'poster/{movie_id}{ext}') for ext in ('.jpg', '.png', '')):
        continue

    # download the poster after a short pause, or log the error
    try:
        # set timeout
        time.sleep(1)
        print(f'Downloading poster from {poster_url}... ', end='')
        download_poster(movie_id, poster_url, 'poster')
        print('Downloaded')

    except Exception as e:
        print(f'Error during poster downloading: {e}')




  0%|          | 0/1 [00:00<?, ?it/s]

Downloading poster from https://image.tmdb.org/t/p/original/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg... 

100%|██████████| 1/1 [00:01<00:00,  1.45s/it]

Downloaded





In [None]:
# to download video trailer, we used yt_dlp
!pip install yt_dlp

In [46]:
# import the library to download videos
import yt_dlp

# function that downloads the movie trailer
def download_trailer(item_id, url, destination_folder):
    """Download the video at `url` with yt_dlp.

    The output template handed to yt_dlp is `<destination_folder>/<item_id>`.
    The best video and audio streams are requested, with mp4 as the merge
    format, and browser-like HTTP headers are sent along with the requests.

    Args:
        item_id: identifier used as the output name.
        url: address of the video page.
        destination_folder: directory used in the output template.
    """
    # headers sent with the HTTP requests; they mimic a desktop Chrome browser
    browser_headers = {
        "User-Agent": " ".join((
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/127.0.0.0 Safari/537.36",
        )),
        "Accept-Language": "en-US,en;q=0.9",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Dest": "document",
    }

    # download options
    # a cookie file ('cookiefile': 'cookies.txt') is suggested, but not
    # mandatory, so it is not set here
    options = dict(
        outtmpl=f'{destination_folder}/{item_id}',
        format='bestvideo+bestaudio/best',
        merge_output_format='mp4',
        quiet=True,
        http_headers=browser_headers,
        nopart=True,
        noprogress=True,
        concurrent_fragment_downloads=1,
    )

    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])

In [47]:
# loop over all downloaded TMDB data
# (only 1 in this tutorial)

# get all downloaded files with movie information
json_files = [f'tmdb_data/{x}' for x in os.listdir('tmdb_data/') if x.endswith('.txt')]

# trailers will be saved in the 'trailer/' directory
os.makedirs('trailer', exist_ok=True)

# loop start
for json_file in tqdm(json_files, total=len(json_files), dynamic_ncols=True):

    # get dict (the context manager closes the file right after parsing)
    with open(json_file, 'r') as fin:
        movie_data = json.load(fin)

    # skip movies without a trailer URL
    trailer_url = movie_data.get('trailer')
    if trailer_url is None:
        continue

    movie_id = int(movie_data['movieId'])

    # skip existing files
    if any(os.path.exists(f'trailer/{movie_id}{ext}') for ext in ('.mkv', '.mp4', '.webm', '')):
        continue

    # download the trailer after a short pause, or log the error
    try:

        # set timeout
        time.sleep(1)
        print(f'Downloading trailer from {trailer_url}... ', end='')
        # save into 'trailer/', the directory created above and inspected by
        # the skip-existing check
        download_trailer(movie_id, trailer_url, 'trailer')
        print('Downloaded')

    except Exception as e:
        print(f'Error during trailer downloading: {e}')




  0%|          | 0/1 [00:00<?, ?it/s]

Downloading trailer from https://www.youtube.com/watch?v=CxwTLktovTU... 

         player = https://www.youtube.com/s/player/c9168c90/player_es6.vflset/en_US/base.js
         n = dbv7sjkU7uNyfDzJ ; player = https://www.youtube.com/s/player/c9168c90/player_es6.vflset/en_US/base.js
100%|██████████| 1/1 [01:33<00:00, 93.14s/it]

Downloaded





### Done!

By looping over all the TMDB data we collected, it is possible to download all the multimodal raw files.

This tutorial served to show how to do this, and how we did this. 