# Upcoming Movie Posters Dataset generation

In [1]:
import pandas as pd
import requests
import os
from dotenv import load_dotenv
import numpy as np
import tqdm
import ast
from datetime import datetime, date

## Setup: project root and API key

In [2]:
# Project root: the parent directory of the current working directory.
root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

In [3]:
# Load the variables defined in <root>/secrets.env into the process environment.
load_dotenv(dotenv_path=os.path.join(root, "secrets.env"))

True

In [4]:
# API key read from the TMDB_API environment variable (None when it is not set).
# The variable name's spelling (TMBD) is kept because other cells reference it.
TMBD_API = os.environ.get('TMDB_API')

## Get the TMDB Genre List

In [5]:
def query_genres():
    """Fetch TMDB's movie genre list and return it as an ``{id: name}`` dict.

    Calls the ``/3/genre/movie/list`` endpoint with the module-level
    ``TMBD_API`` key.

    Returns:
        dict: Maps every genre ``id`` in the response to its ``name``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within the timeout.
    """
    url = f"https://api.themoviedb.org/3/genre/movie/list?api_key={TMBD_API}&language=en-US"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    # Surface the real HTTP error instead of a KeyError on "genres" below.
    response.raise_for_status()
    return {entry["id"]: entry["name"] for entry in response.json()["genres"]}

In [6]:
# Fetch the genre id -> name mapping once; later cells reuse `genres`.
genres = query_genres()

In [7]:
def query_upcoming(genres: dict, page: int = 1):
    """Fetch one page of TMDB's "upcoming movies" listing as a DataFrame.

    Args:
        genres: Kept for interface compatibility; not used by this function.
        page: Page number forwarded to the API's ``page`` query parameter.

    Returns:
        pandas.DataFrame: Built from the ``results`` list of the response.
        An empty DataFrame is returned when the API does not answer with
        HTTP 200 or the payload cannot be processed; the problem is printed.
    """
    url = f"https://api.themoviedb.org/3/movie/upcoming?api_key={TMBD_API}&language=en-US&page={page}"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    req = requests.get(url, timeout=30)
    # Default result so that every failure path below still returns a frame;
    # previously `df` was unbound on those paths and `return df` raised
    # UnboundLocalError.
    df = pd.DataFrame()
    try:
        if req.status_code == 200:
            df = pd.DataFrame.from_dict(req.json()["results"])
        else:
            # Show the API's error payload to explain the missing page.
            print(req.json())
    except Exception as e:
        # Best effort: report the problem and fall back to the empty frame.
        print(f"{e}")
    return df

In [8]:
# Start with an empty DataFrame named `data`.
data = pd.DataFrame()

In [9]:
# Fetch result pages 1-3 and combine them with a single concat.
# Building `data` from scratch here (instead of appending to its previous
# value inside the loop) keeps the cell idempotent: re-running it no longer
# duplicates rows, and no empty frame takes part in the concatenation.
pages = [query_upcoming(genres, page) for page in tqdm.tqdm(range(1, 4))]
data = pd.concat(pages, ignore_index=True)

100%|██████████| 3/3 [00:00<00:00,  7.77it/s]


In [10]:
# Keep only rows without any missing value. Rebinding the name instead of
# using inplace=True avoids mutating the frame object in place.
data = data.dropna(how="any")

In [11]:
# Convert the release_date column to datetime values so it can be compared
# against timestamps later on.
data = data.assign(release_date=pd.to_datetime(data["release_date"]))

In [12]:
# Preview the first rows only instead of rendering the whole frame.
data.head(10)

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/9n2tJBplPbgR2ca05hS5CKXwP2c.jpg,"[16, 10751, 12, 14, 35]",502356,en,The Super Mario Bros. Movie,"While working underground to fix a water main,...",2993.842,/qNBAXBIQlnOThrVvA6mA2B5ggV6.jpg,2023-04-05,The Super Mario Bros. Movie,False,7.4,1989
1,False,/fI5RsaM0NSU6TqztRhA2pal5ezv.jpg,"[28, 80, 53]",385687,en,Fast X,Over many missions and against impossible odds...,868.981,/1E5baAaEse26fej7uHcjOgEE2t2.jpg,2023-05-17,Fast X,False,5.9,12
2,False,/eEF40Xk2twM3WjRNZftfo771gjv.jpg,"[878, 12, 53]",700391,en,65,"65 million years ago, the only 2 survivors of ...",514.855,/rzRb63TldOKdKydCvWJM8B6EkPM.jpg,2023-03-02,65,False,6.3,991
3,False,/cZzZlwGvxiByXam0lZ57J9IN233.jpg,[27],1008005,es,La niña de la comunión,"Spain, late 1980s. Newcomer Sara tries to fit ...",440.717,/uYxrWr9o44yO0HvVfFFHGu01gfX.jpg,2023-02-10,The Communion Girl,False,6.1,96
4,False,/exI61quYa7xMfcIDSp674UnvrhG.jpg,"[10752, 28]",840326,fi,Sisu,"Deep in the wilderness of Lapland, Aatami Korp...",375.55,/dHx5yuBb05U9vNaNhIBD7jWyxPk.jpg,2023-01-27,Sisu,False,7.3,44
5,False,/69HgAYpbJ04w6lUUVainsYxLabb.jpg,"[14, 28, 12]",455476,en,Knights of the Zodiac,"When a headstrong street orphan, Seiya, in sea...",334.127,/1nbFEdIFprRNSWcCM0RUjQqZifF.jpg,2023-04-27,Knights of the Zodiac,False,6.5,34
6,False,/hJoMSAltRx5xvlAXvKyDdugjucJ.jpg,"[12, 10751, 14, 10749]",447277,en,The Little Mermaid,"The youngest of King Triton’s daughters, and t...",309.798,/ym1dxyOk4jFcSl4Q2zmRrA5BEEN.jpg,2023-05-18,The Little Mermaid,False,5.7,6
7,False,/vnPTLSBk95XKdahOaMkTlAck5Rc.jpg,"[28, 18, 10752]",882569,en,Guy Ritchie's The Covenant,"After an ambush, Afghan interpreter Ahmed goes...",252.812,/aX0H63vho7rZ9Rm3I567Zf00Z1t.jpg,2023-04-19,Guy Ritchie's The Covenant,False,7.8,67
8,False,/e7FzphKs5gzoghDotAEp2FeP46u.jpg,"[27, 35, 14]",649609,en,Renfield,Having grown sick and tired of his centuries a...,168.321,/p6yUjhvNGQpFZilKwOKbxQ1eHlo.jpg,2023-04-07,Renfield,False,7.2,339
9,False,/b4YwJaIBUsidiFi1ljN5c4AQcR1.jpg,"[878, 53]",942199,en,Simulant,Faye attempts to replace her newly deceased hu...,160.206,/kmDJhZIq1xnu5ZiMOlb0nHJH1qb.jpg,2023-03-29,Simulant,False,6.3,37


In [13]:
def convert_genre_ids(ids: list, genres: dict):
    """Translate genre ids into genre names.

    Args:
        ids: Genre ids to look up.
        genres: Mapping from genre id to genre name.

    Returns:
        list: One entry per id, in the same order; ids missing from
        ``genres`` yield ``None``.
    """
    return [genres.get(genre_id) for genre_id in ids]

In [14]:
# Add a "genres" column holding the genre names for each row's genre_ids.
data["genres"] = data["genre_ids"].apply(convert_genre_ids, args=(genres,))

In [15]:
def extract_top_genre_id(x):
    """Return a one-element list holding the first item of ``x``.

    Args:
        x: Sequence of genres for one movie.

    Returns:
        list: ``[x[0]]`` when ``x`` has at least one item, otherwise ``[]``.
        An empty input previously raised ``IndexError``.
    """
    # Slicing never raises on an empty sequence, unlike x[0].
    return list(x[:1])

In [16]:
# Add a "genres_single" column holding only the first listed genre per row.
data["genres_single"] = data["genres"].apply(extract_top_genre_id)

In [17]:
# Remove every "/" from poster_path so the value can be used as a file name.
data["poster_path"] = data["poster_path"].str.replace("/", "", regex=False)

In [18]:
# Reference date for the "upcoming" filter; pinned to a fixed day rather than
# read from the system clock.
today = pd.Timestamp('2023-04-10')


In [19]:
# Keep rows whose original_language matches "en" and whose release date is
# on or after `today`.
is_english = data["original_language"].str.contains("en")
not_yet_released = data["release_date"] >= today
upcoming = data[is_english & not_yet_released]

In [20]:
# Preview the first rows only instead of rendering the whole frame.
upcoming.head(10)

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,genres,genres_single
1,False,/fI5RsaM0NSU6TqztRhA2pal5ezv.jpg,"[28, 80, 53]",385687,en,Fast X,Over many missions and against impossible odds...,868.981,1E5baAaEse26fej7uHcjOgEE2t2.jpg,2023-05-17,Fast X,False,5.9,12,"[Action, Crime, Thriller]",[Action]
5,False,/69HgAYpbJ04w6lUUVainsYxLabb.jpg,"[14, 28, 12]",455476,en,Knights of the Zodiac,"When a headstrong street orphan, Seiya, in sea...",334.127,1nbFEdIFprRNSWcCM0RUjQqZifF.jpg,2023-04-27,Knights of the Zodiac,False,6.5,34,"[Fantasy, Action, Adventure]",[Fantasy]
6,False,/hJoMSAltRx5xvlAXvKyDdugjucJ.jpg,"[12, 10751, 14, 10749]",447277,en,The Little Mermaid,"The youngest of King Triton’s daughters, and t...",309.798,ym1dxyOk4jFcSl4Q2zmRrA5BEEN.jpg,2023-05-18,The Little Mermaid,False,5.7,6,"[Adventure, Family, Fantasy, Romance]",[Adventure]
7,False,/vnPTLSBk95XKdahOaMkTlAck5Rc.jpg,"[28, 18, 10752]",882569,en,Guy Ritchie's The Covenant,"After an ambush, Afghan interpreter Ahmed goes...",252.812,aX0H63vho7rZ9Rm3I567Zf00Z1t.jpg,2023-04-19,Guy Ritchie's The Covenant,False,7.8,67,"[Action, Drama, War]",[Action]
12,False,/9xfDWXAUbFXQK585JvByT5pEAhe.jpg,"[16, 28, 12, 878]",569094,en,Spider-Man: Across the Spider-Verse,"After reuniting with Gwen Stacy, Miles Morales...",119.826,zPoqAu4gxZRmcPzSLFJ9b0VciaL.jpg,2023-05-31,Spider-Man: Across the Spider-Verse,False,0.0,0,"[Animation, Action, Adventure, Science Fiction]",[Animation]
18,False,/igXrblWrU1uaC09VKyquHHSebr.jpg,"[28, 53]",717930,en,Kandahar,"After his mission is exposed, an undercover CI...",69.786,gVh7d9n9WtUS6VSEaRXpGSTyhHW.jpg,2023-05-25,Kandahar,False,0.0,0,"[Action, Thriller]",[Action]
21,False,/tbQ7bEg5qnk7zR1bsi4xMqnaIRu.jpg,"[80, 9648, 53]",536437,en,Hypnotic,A detective becomes entangled in a mystery inv...,54.847,mv362kHXz4Yr33wtQhqfcJWFM8G.jpg,2023-05-11,Hypnotic,False,6.0,1,"[Crime, Mystery, Thriller]",[Crime]
22,False,/rfhrG37ikF5pOm9pJTbgfIlGK5g.jpg,"[10749, 18, 35]",758336,en,Love Again,"Mira Ray, dealing with the loss of her fiancé,...",54.415,usfDx1g1eN9eZ8kxfrbLRkKVjit.jpg,2023-05-04,Love Again,False,7.0,4,"[Romance, Drama, Comedy]",[Romance]
23,False,/gGZxCy9keK0D6TxbmS8GJj85Ut4.jpg,"[80, 35]",809787,en,Mafia Mamma,A suburban American woman inherits her grandfa...,52.446,zi0Lrmtvhrz52VXUjwEKlXItF9q.jpg,2023-04-13,Mafia Mamma,False,7.2,36,"[Crime, Comedy]",[Crime]
25,False,/pS4oSxn9g0PAzkWD5zrSOuRIgT6.jpg,"[35, 12, 14]",798286,en,Beau Is Afraid,A paranoid man embarks on an epic odyssey to g...,50.792,wgVkkjigF31r1nZV80uV0xNIoun.jpg,2023-04-14,Beau Is Afraid,False,7.3,196,"[Comedy, Adventure, Fantasy]",[Comedy]


In [21]:
# Renumber the rows from 0; drop=True discards the old index labels instead
# of keeping them as a column.
upcoming = upcoming.reset_index(drop=True)

In [22]:
# Output location of the dataset: <root>/data/upcoming.csv.
upcoming_csv = os.path.join(root, "data", "upcoming.csv")

In [23]:
# Write the filtered frame to disk. With to_csv's default settings the row
# index is written as the first (unnamed) column.
upcoming.to_csv(upcoming_csv)

In [24]:
# URL prefix that download_poster() prepends to a poster file name.
image_url = "https://image.tmdb.org/t/p/original/"

In [25]:
def download_poster(poster_path):
    """Download one poster image into ``<root>/images/raw/``.

    Args:
        poster_path: Poster file name; it is appended to ``image_url`` to
            build the download URL and reused as the local file name.

    Returns:
        None. A response with a non-success HTTP status is reported with
        ``print`` and skipped, so an error body is never saved under an
        image file name.
    """
    to_download = f"{image_url}{poster_path}"
    # A timeout keeps a stalled connection from blocking the download loop.
    response = requests.get(to_download, timeout=60)
    if not response.ok:
        print(f"Skipping {poster_path}: HTTP {response.status_code}")
        return
    target_dir = os.path.join(root, "images", "raw")
    # Create the target folder on first use so open() cannot fail on a
    # missing directory.
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, poster_path), 'wb') as handler:
        handler.write(response.content)

In [26]:
# Download the poster of every row in `upcoming`.
# Original note: roughly ~30 minutes for a full download.
for poster_file in tqdm.tqdm(upcoming["poster_path"], total=len(upcoming)):
    download_poster(poster_file)

100%|██████████| 22/22 [00:05<00:00,  4.14it/s]
