# Movie Posters Dataset generation

In [1]:
import pandas as pd
import requests
import os
from dotenv import load_dotenv
import numpy as np
import tqdm
import ast

## Setup: Paths and API Key

In [2]:
# Parent directory of the current working directory; later cells build the
# secrets.env and data/raw_data.csv paths from it.
root = os.path.dirname(os.path.abspath(os.getcwd()))

In [3]:
# Load environment variables from the secrets.env file under `root`.
load_dotenv(os.path.join(root, "secrets.env"))

True

In [4]:
# API key read from the TMDB_API environment variable (None when it is unset).
# NOTE: the Python name is spelled TMBD_API; the query functions below refer
# to it by that spelling.
TMBD_API = os.getenv('TMDB_API')

## Get TMDB's Genre List

In [5]:
def query_genres():
    """Fetch TMDB's movie genre list and return it as an ``{id: name}`` dict.

    Returns:
        dict: genre id -> genre name, built from the entries under the
        ``"genres"`` key of the API response.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if the API does not answer within the timeout.
    """
    url = f"https://api.themoviedb.org/3/genre/movie/list?api_key={TMBD_API}&language=en-US"
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    req = requests.get(url, timeout=30)
    # Surface the HTTP error itself instead of a KeyError on "genres" below.
    req.raise_for_status()
    return {entry["id"]: entry["name"] for entry in req.json()["genres"]}

In [6]:
# Fetch the genre id -> name mapping once; later cells reuse it.
genres = query_genres()

In [7]:
# Display the genre mapping.
genres

{28: 'Action',
 12: 'Adventure',
 16: 'Animation',
 35: 'Comedy',
 80: 'Crime',
 99: 'Documentary',
 18: 'Drama',
 10751: 'Family',
 14: 'Fantasy',
 36: 'History',
 27: 'Horror',
 10402: 'Music',
 9648: 'Mystery',
 10749: 'Romance',
 878: 'Science Fiction',
 10770: 'TV Movie',
 53: 'Thriller',
 10752: 'War',
 37: 'Western'}

## Get TMDB's Popular Movies (First 500 Result Pages)

In [8]:
# Number of result pages to request from the popular-movies endpoint.
MAX_PAGE_RESULTS = 500

In [9]:
def query_popular(genres: dict, page: int = 1):
    """Fetch one page of TMDB's popular-movies listing as a DataFrame.

    Args:
        genres: Not used by this function; kept so existing calls that pass
            the genre mapping keep working.
        page: Page number sent to the API as the ``page`` query parameter.

    Returns:
        pandas.DataFrame: one row per entry of the response's ``"results"``
        list. An empty DataFrame is returned when the API answers with a
        non-200 status or the payload cannot be parsed, so callers can
        still concatenate the result.

    Raises:
        requests.Timeout: if the API does not answer within the timeout.
    """
    url = f"https://api.themoviedb.org/3/movie/popular?api_key={TMBD_API}&language=en-US&page={page}"
    # A timeout keeps the download loop from hanging on a stalled connection.
    req = requests.get(url, timeout=30)
    # Default result: without it, a failed request left `df` unbound and the
    # final `return df` raised UnboundLocalError.
    df = pd.DataFrame()
    try:
        if req.status_code == 200:
            df = pd.DataFrame.from_dict(req.json()["results"])
        else:
            # Show the API's error payload for the failed page.
            print(req.json())
    except Exception as e:
        # Best effort: report the problem and fall through to the empty frame.
        print(f"{e}")
    return df

In [10]:
# Start with an empty DataFrame for the downloaded movies.
data = pd.DataFrame()

In [11]:
# Download every page first and concatenate once: growing `data` with
# pd.concat inside the loop re-copies all earlier rows on each iteration and
# appends duplicates when the cell is run a second time.
pages = [query_popular(genres, page) for page in tqdm.tqdm(range(1, MAX_PAGE_RESULTS + 1))]
# pd.concat raises on an empty list, so fall back to an empty frame.
data = pd.concat(pages, ignore_index=True) if pages else pd.DataFrame()

100%|██████████| 500/500 [01:01<00:00,  8.19it/s]


In [12]:
# Display the downloaded table.
data

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/9n2tJBplPbgR2ca05hS5CKXwP2c.jpg,"[16, 12, 10751, 14, 35]",502356,en,The Super Mario Bros. Movie,"While working underground to fix a water main,...",10058.912,/qNBAXBIQlnOThrVvA6mA2B5ggV6.jpg,2023-04-05,The Super Mario Bros. Movie,False,7.5,410
1,False,/5i6SjyDbDWqyun8klUuCxrlFbyw.jpg,"[18, 28]",677179,en,Creed III,"After dominating the boxing world, Adonis Cree...",7413.386,/vJU3rXSP9hwUuLeq8IpfsJShLOk.jpg,2023-03-01,Creed III,False,7.3,870
2,False,/ovM06PdF3M8wvKb06i4sjW3xoww.jpg,"[878, 12, 28]",76600,en,Avatar: The Way of Water,Set more than a decade after the events of the...,6686.292,/t6HIqrRAclMCA60NsSmeqe9RmNV.jpg,2022-12-14,Avatar: The Way of Water,False,7.7,6873
3,False,/vSUls0b7dNhC7tJoExF1MBYWWyh.jpg,"[16, 35, 10751, 12, 14]",816904,es,Momias,"Through a series of unfortunate events, three ...",3562.319,/qVdrYN8qu7xUtsdEFeGiIVIaYd.jpg,2023-01-05,Mummies,False,7.1,125
4,False,/bT3IpP7OopgiVuy6HCPOWLuaFAd.jpg,"[35, 9648, 28]",638974,en,Murder Mystery 2,"After starting their own detective agency, Nic...",4873.950,/swzMoIVn6xjB857ziYJ8KBV440g.jpg,2023-03-28,Murder Mystery 2,False,6.4,564
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,False,/3WKrovKHfJTJzYOqpYtCftr9iDX.jpg,[35],121342,it,Giovannona Coscialunga disonorata con onore,When a judge shuts down a high profile cheese ...,14.619,/vkF8VLrazGtk9OjdEhihG6kKAhP.jpg,1973-04-12,Giovannona Long-Thigh,False,4.9,31
9996,False,/vzjgieFzT8ySkMdxdu3kIf4caZS.jpg,[53],11983,en,Proof of Life,Alice hires a professional negotiator to obtai...,13.690,/yw8x2i3vaHZZzpvqvF75E8q2N6M.jpg,2000-12-08,Proof of Life,False,6.1,682
9997,False,/tdgce7OwwUwF7atRiar1y1AS6eh.jpg,"[18, 28, 35, 53, 80]",26715,en,Bitch Slap,"Three bad girls (a down-and-out stripper, a dr...",15.441,/bFOmE3zCFU01TuomOOwClAWdvOD.jpg,2009-05-16,Bitch Slap,False,4.7,252
9998,False,/6ZNqWfUXdxiNVqwgO2doLlkmgQc.jpg,"[28, 80, 18, 53]",139567,en,Fire with Fire,A fireman takes an unexpected course of action...,15.442,/kziBJGQFo9f0Vkj9s37qI0G9I0I.jpg,2012-08-31,Fire with Fire,False,5.7,600


## Check whether any rows have an empty genre_ids list

In [37]:
# Show the rows whose genre_ids list is empty (an empty list casts to False).
data[~data["genre_ids"].astype(bool)]


Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,genres
1605,False,/4hPKEU0DfkPSGgDRg0mBje5BfJ0.jpg,[],185708,ja,団鬼六　美女縄化粧,Miki is the daughter of an affluent family. Sh...,57.925,/kuIf5deP5PsasS4rdaC6eA4AyDW.jpg,1983-12-02,Beauty Rope Cosmetology,False,1.0,1,[]
2121,False,/ek4BlZ4ZCdVannkLz7VYjvWDKZb.jpg,[],887074,zh,屠魔：王者征途,Lord of Darkness Artha was born from human str...,37.755,/174uwJ0uhlWHbp8FPZRth7SMv6d.jpg,2021-10-18,The Sword,False,5.3,3,[]
2618,False,/fsBnJV0K5bcmzUAwL47A9FqoKae.jpg,[],105812,cn,青樓十二房,Madam Five and carpenter Kong work together ma...,36.173,/53AdAWdL7pJnWzvfSfDq30MJaFw.jpg,1994-09-15,Ancient Chinese Whorehouse,False,5.0,1,[]
2980,False,/hhDGtayED4HqhTrDRO4yqz9MVGG.jpg,[],687558,ko,여자전쟁:도기의난,There is a visitor for local herb gatherer Dog...,29.904,/pUnZK0uumzFD068kdcHMBTj9YQq.jpg,2016-03-24,Female War: Doggie's Uprising,False,0.0,0,[]
3939,False,/klmGn6q0fQNfBFNLDK4FY59pv8l.jpg,[],902870,zh,狙击之王,The Salvation Society is the most powerful cri...,18.311,/ecIRwkWHuAA830HlO7u5ofRp4uM.jpg,2021-11-19,The Sniper,False,4.0,2,[]
4456,False,/2F4SaH5pgD0zEzdZD8Dx1LMqptP.jpg,[],592742,ja,私の奴隷になりなさい 第2章 「ご主人様と呼ばせてください」,"Meguro, who works for the advertising design c...",17.081,/isxWYeKRNaJ84hN46CCPyNtxXuW.jpg,2018-09-29,Be My Master,False,6.9,7,[]
5255,False,/rBHk5eFF4Flc5dOriPwBqp03qmQ.jpg,[],658494,ja,ホームジャック　トライアングル,A female police officer arrests serial killer/...,20.76,/v9SV2VwBwg401wUZkfq8F2EAMWR.jpg,2018-03-02,Homejack Triangle,False,6.5,2,[]
5502,False,/vFYgAENLOeD2sHCAEQfx5rDPjRt.jpg,[],312750,ml,Rathinirvedam,The story takes place in a small village surro...,22.206,/sTkMzvqmCYAYVqZxJhlxubvBYJl.jpg,1978-03-08,Rathinirvedam,False,9.5,2,[]
5522,False,/llLsvucZRrpjTMpz7Fczp7vZut4.jpg,[],449302,zh,嬌妻四艷鬼,Some female ghosts seduce and kill trespassers...,31.049,/nODjK3FePbOQvQrLxInHoCIqAEA.jpg,1994-03-26,Snake Beauty,False,0.0,0,[]
5547,False,/pAEBusvtIgzQ5eTgV54PgN0a3qp.jpg,[],729791,ja,美女奉行　おんな牢秘抄,A woman calling herself Princess Oryu enters t...,22.104,/5rUqxpyE3YbnBK721euV0IkbNq3.jpg,1995-03-24,Musume Bugyo On-na Ro Hisho,False,2.5,2,[]


In [45]:
# Keep only rows whose genre_ids list is non-empty. The explicit copy makes
# `data` an independent frame, so the column assignments in later cells do
# not act on a slice of the unfiltered frame (SettingWithCopyWarning).
data = data[data["genre_ids"].astype(bool)].copy()


## Data Cleaning

In [46]:
# Count the missing values in each column.
data.isna().sum()

adult                0
backdrop_path        0
genre_ids            0
id                   0
original_language    0
original_title       0
overview             0
popularity           0
poster_path          0
release_date         0
title                0
video                0
vote_average         0
vote_count           0
genres               0
dtype: int64

In [47]:
# drop rows that contain null values
# (reassigned rather than inplace=True: `data` is a filtered selection of an
# earlier frame, where in-place modification can trigger SettingWithCopyWarning)
data = data.dropna()

In [48]:
# (rows, columns) of the table at this point.
data.shape

(9739, 15)

In [49]:
# convert list of genre ids to list of genre names
# [53, 35, 80] -> ["Thriller", "Comedy", "Crime"]

In [50]:
def convert_genre_ids(ids: list, genres: dict):
    """Translate a list of genre ids into the matching genre names.

    Names come back in the same order as ``ids``; an id that is missing
    from ``genres`` yields ``None`` at its position.
    """
    return [genres.get(genre_id) for genre_id in ids]

In [51]:
# Add a `genres` column holding the genre names for each row's genre_ids.
data["genres"] = data["genre_ids"].map(lambda id_list: convert_genre_ids(id_list, genres))

In [52]:
# drop rows that contain null values
# (reassigned rather than inplace=True so a filtered selection is never
# modified in place)
# NOTE: a None inside a `genres` list is not a null cell, so rows with an
# unknown genre id are not removed here.
data = data.dropna()

In [67]:
def extract_top_genre_id(x):
    """Return a one-element list holding the first item of ``x``.

    Despite the name, the notebook applies this to lists of genre names,
    e.g. ``["Thriller", "Comedy", "Crime"] -> ["Thriller"]``.
    An empty input yields ``[]`` instead of raising IndexError.
    """
    # Slicing never raises on an empty sequence; list() keeps the return
    # type a list whatever sequence type was passed in.
    return list(x[:1])

In [68]:
# convert list of genre names to top genre
# ["Thriller", "Comedy", "Crime"] -> ["Thriller"]

In [69]:
# Add a `genres_single` column: a one-element list with each row's first genre.
data["genres_single"] = data["genres"].apply(extract_top_genre_id)

In [70]:
# Preview the first five rows, including the new genre columns.
data.head(5)

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,genres,genres_single
0,False,/9n2tJBplPbgR2ca05hS5CKXwP2c.jpg,"[16, 12, 10751, 14, 35]",502356,en,The Super Mario Bros. Movie,"While working underground to fix a water main,...",10058.912,qNBAXBIQlnOThrVvA6mA2B5ggV6.jpg,2023-04-05,The Super Mario Bros. Movie,False,7.5,410,"[Animation, Adventure, Family, Fantasy, Comedy]",[Animation]
1,False,/5i6SjyDbDWqyun8klUuCxrlFbyw.jpg,"[18, 28]",677179,en,Creed III,"After dominating the boxing world, Adonis Cree...",7413.386,vJU3rXSP9hwUuLeq8IpfsJShLOk.jpg,2023-03-01,Creed III,False,7.3,870,"[Drama, Action]",[Drama]
2,False,/ovM06PdF3M8wvKb06i4sjW3xoww.jpg,"[878, 12, 28]",76600,en,Avatar: The Way of Water,Set more than a decade after the events of the...,6686.292,t6HIqrRAclMCA60NsSmeqe9RmNV.jpg,2022-12-14,Avatar: The Way of Water,False,7.7,6873,"[Science Fiction, Adventure, Action]",[Science Fiction]
3,False,/vSUls0b7dNhC7tJoExF1MBYWWyh.jpg,"[16, 35, 10751, 12, 14]",816904,es,Momias,"Through a series of unfortunate events, three ...",3562.319,qVdrYN8qu7xUtsdEFeGiIVIaYd.jpg,2023-01-05,Mummies,False,7.1,125,"[Animation, Comedy, Family, Adventure, Fantasy]",[Animation]
4,False,/bT3IpP7OopgiVuy6HCPOWLuaFAd.jpg,"[35, 9648, 28]",638974,en,Murder Mystery 2,"After starting their own detective agency, Nic...",4873.95,swzMoIVn6xjB857ziYJ8KBV440g.jpg,2023-03-28,Murder Mystery 2,False,6.4,564,"[Comedy, Mystery, Action]",[Comedy]


In [71]:
# remove leading '/' from poster_path
# (str.lstrip touches only the start of the string; the previous
# replace("/", "") removed every '/' in the path, not just the leading one)
data["poster_path"] = data["poster_path"].str.lstrip("/")

## Save Raw CSV

In [72]:
# Renumber the rows from 0; drop=True discards the old index instead of
# keeping it as a column.
data = data.reset_index(drop=True)

In [73]:
# Output location: <root>/data/raw_data.csv
data_csv = os.path.join(root, "data", "raw_data.csv")

In [74]:
# Create the target directory first; to_csv does not create missing folders.
os.makedirs(os.path.dirname(data_csv), exist_ok=True)
# The row index is written as an unnamed first column (default index=True).
data.to_csv(data_csv)