# Fetching The Posters Metadata And Images

In [1]:
!pip install IMDbPY
!pip install tmdbsimple

Collecting IMDbPY
  Downloading IMDbPY-2021.4.18-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 8.5 MB/s eta 0:00:01
Installing collected packages: IMDbPY
Successfully installed IMDbPY-2021.4.18
Collecting tmdbsimple
  Downloading tmdbsimple-2.9.1-py3-none-any.whl (38 kB)
Installing collected packages: tmdbsimple
Successfully installed tmdbsimple-2.9.1


In [2]:
import json
import os
from io import BytesIO
from pathlib import Path

import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm_notebook as tqdm
import numpy as np

import imdb
import tmdbsimple as tmdb

In [None]:
# Root directory for the downloaded posters; the download cell creates one
# sub-directory per IMDb id below it.
posters_dir = Path('posters')
# exist_ok=True keeps this cell re-runnable: a bare mkdir() raises
# FileExistsError as soon as the directory is already there.
posters_dir.mkdir(parents=True, exist_ok=True)

In [None]:
def save_img(dir_path, save_path, url, timeout=30):
  """Download the image at `url` and store it as `dir_path/save_path`.

  Args:
    dir_path: Directory to write into; created (with parents) when missing.
    save_path: File name of the image inside `dir_path`.
    url: Address the image is fetched from.
    timeout: Seconds to wait for the server before giving up, so a stalled
      connection cannot hang a long download run.

  Raises:
    requests.RequestException: On connection failure, timeout, or a non-2xx
      HTTP status. (In requests this class derives from IOError/OSError, so
      callers catching OSError keep working.)
    OSError: If the payload cannot be opened as an image or the file cannot
      be written.
  """
  response = requests.get(url, timeout=timeout)
  # Fail on HTTP errors here instead of passing an error page to PIL.
  response.raise_for_status()
  img = Image.open(BytesIO(response.content))

  # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
  os.makedirs(dir_path, exist_ok=True)

  img.save(os.path.join(dir_path, save_path))

## Load latest IMDb data:

In [None]:
# Both IMDb dumps are tab-separated with a header row.
tsv_options = dict(sep='\t', header=0)

# Title metadata; the literal "\N" is treated as a missing value.
basics = pd.read_csv(
    'https://datasets.imdbws.com/title.basics.tsv.gz',
    na_values="\\N",
    **tsv_options,
)

# Ratings table; merged with the titles on `tconst` further down.
ratings = pd.read_csv(
    'https://datasets.imdbws.com/title.ratings.tsv.gz',
    **tsv_options,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Keep rows whose titleType is 'movie' and whose original title equals the
# primary title; both conditions are applied as a single boolean mask.
is_movie = basics["titleType"] == 'movie'
same_title = basics["originalTitle"] == basics["primaryTitle"]
basics = basics[is_movie & same_title]

In [None]:
# Join ratings with title metadata on the IMDb id and drop titles that have
# no start year.
data = pd.merge(ratings, basics, on='tconst').dropna(subset=["startYear"])

# Keep titles with more than 1000 votes, highest average rating first.
reduced = data[data["numVotes"] > 1000].sort_values("averageRating", ascending=False)

# Split the comma-separated genre string into a list. Missing genres stay
# NaN after .str.split, so replace anything that is not a list with [].
# Checking the type is safer than `x is np.nan`: the identity test only
# matches the np.nan singleton, and any other NaN object would reach the
# `in` check below and raise TypeError.
reduced["genres"] = reduced["genres"].str.split(",").apply(
    lambda genres: genres if isinstance(genres, list) else []
)

# Drop every title tagged as Animation.
reduced = reduced[reduced["genres"].apply(lambda genres: "Animation" not in genres)]

# Persist the selected IMDb ids.
reduced['tconst'].to_csv('reduced-ids.csv', index=False)

In [None]:
# Sample run: restrict `reduced` to three known titles.
# Comment out this cell's code to process the full data set.
sample_ids = ['tt0133093', 'tt1392170', 'tt1431045']
in_sample = reduced['tconst'].isin(sample_ids)
reduced = reduced.loc[in_sample]

In [None]:
# TMDB API key. Taken from the TMDB_API_KEY environment variable (or paste
# your key in place of the '' default); tmdb.txt overrides it when present.
tmdb_key = os.environ.get('TMDB_API_KEY', '') # your tmdb api key

tmdb_key_file = Path("tmdb.txt")
# Only read the file when it exists: opening it unconditionally raised
# FileNotFoundError and made the key assigned above unusable.
if tmdb_key_file.is_file():
  tmdb_key = tmdb_key_file.read_text().strip()

if not tmdb_key:
  # Say so here rather than letting every later TMDB request fail.
  print("No TMDB API key found: set TMDB_API_KEY or create tmdb.txt")

## Download Poster Images

In [None]:
imdbPy = imdb.IMDb()
base_tmdb_img_url = 'https://image.tmdb.org/t/p/w780'
tmdb.API_KEY = tmdb_key

# Resume support: curr_id.txt stores the position (within reduced['tconst'])
# of the first movie that still has to be processed, in case the program
# shuts down before finishing everything.
checkpoint_file = Path('curr_id.txt')
try:
    curr = int(checkpoint_file.read_text())
except (FileNotFoundError, ValueError):
    # No checkpoint yet, or an empty/corrupt one: start from the beginning.
    curr = 0

# .iloc makes explicit that `curr` is a position, not an index label.
for i, imdb_id in enumerate(tqdm(reduced['tconst'].iloc[curr:])):
    try:
        # imdbPy.get_movie is given the id without its 'tt' prefix.
        number_imdb_id = imdb_id.partition('tt')[2]
        movie = imdbPy.get_movie(number_imdb_id)
        dir_path = posters_dir/imdb_id

        # Save images from IMDB
        if "full-size cover url" in movie.keys():
            save_img(dir_path, f'{imdb_id}-imdbpy.jpg', movie["full-size cover url"])
        else:
            print(f'{imdb_id} does not have a poster')

        # Save images from TMDB + images metadata
        imgs_metadata = tmdb.Movies(imdb_id).images()['posters']

        for img in imgs_metadata:
            # file_path starts with '/', which is stripped for the local file name.
            save_img(dir_path, f'{img["file_path"][1:]}', f'{base_tmdb_img_url}{img["file_path"]}')

        with (dir_path/'metadata.json').open("w") as f:
            json.dump(imgs_metadata, f)

        with (dir_path/'movie_name.txt').open("w") as f:
            f.write(f"{movie.get('title')}")

    except (UnicodeEncodeError, OSError, imdb.IMDbDataAccessError, KeyError) as err:
        # Best effort: report which movie failed and why, then move on.
        print(f"{type(err).__name__} for {imdb_id}: {err}")

    # Reached only when the movie was finished or deliberately skipped, so
    # the checkpoint points at the NEXT movie. An interruption (or any
    # unexpected error) leaves the previous checkpoint in place and the
    # interrupted movie is retried on resume.
    checkpoint_file.write_text(f"{curr+i+1}")



In [None]:
# Create posters dataframe from collected metadata
poster_frames = []

# Only directories hold movie data; stray files inside posters_dir would
# otherwise raise NotADirectoryError. Sorting gives a stable row order and
# lets tqdm show a total.
movie_dirs = sorted(path for path in posters_dir.iterdir() if path.is_dir())

for movie_dir in tqdm(movie_dirs):
  try:
    with (movie_dir/"metadata.json").open("r") as f:
      movie_metadata = json.load(f)
    # read_text() closes the file, unlike a bare open().read().
    title = (movie_dir/"movie_name.txt").read_text()
  except FileNotFoundError:
    # Report directories with missing files and skip them.
    print(movie_dir.name)
    continue

  df = pd.DataFrame(movie_metadata)
  df["movie"] = title
  df["tconst"] = movie_dir.name
  poster_frames.append(df)

if poster_frames:
  movie_posters = pd.concat(poster_frames, ignore_index=True)
else:
  # pd.concat raises ValueError on an empty list; fall back to an empty
  # frame that still carries the columns this cell adds to every frame.
  movie_posters = pd.DataFrame(columns=["movie", "tconst"])

# Attach the IMDb rating/title columns of each movie to its posters.
movie_posters = movie_posters.merge(reduced, on='tconst')

movie_posters.to_pickle("movie_posters.pkl")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


0it [00:00, ?it/s]

In [None]:
tmdb.API_KEY = tmdb_key
tmdb_data = []

# Query every selected movie. The download cell's resume offset `curr` must
# not be applied here: tmdb_data starts empty on each run, so slicing by
# `curr` would silently drop the first movies after a resumed download.
for imdb_id in tqdm(reduced['tconst']):
  try:
    tmdb_data.append(tmdb.Movies(imdb_id).info())
  except requests.HTTPError as err:
    # Keep going, but say which movie has no TMDB record and why.
    print(f"{imdb_id}: TMDB info request failed ({err})")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Tabulate the collected TMDB records and persist the table as a pickle.
tmdb_df = pd.DataFrame(data=tmdb_data)
tmdb_df.to_pickle(path="tmdb_data.pkl")