# Fetching The Actors Metadata And Images

In [7]:
! pip3 freeze > requirements_actor_fetcher.txt

In [1]:
!pip install IMDbPY
!pip install tmdbsimple
!pip install unidecode
!pip install imagehash

Collecting IMDbPY
  Downloading IMDbPY-2021.4.18-py3-none-any.whl (298 kB)
[?25l[K     |█                               | 10 kB 20.8 MB/s eta 0:00:01[K     |██▏                             | 20 kB 12.2 MB/s eta 0:00:01[K     |███▎                            | 30 kB 9.1 MB/s eta 0:00:01[K     |████▍                           | 40 kB 8.1 MB/s eta 0:00:01[K     |█████▌                          | 51 kB 4.5 MB/s eta 0:00:01[K     |██████▋                         | 61 kB 5.3 MB/s eta 0:00:01[K     |███████▊                        | 71 kB 5.5 MB/s eta 0:00:01[K     |████████▉                       | 81 kB 5.7 MB/s eta 0:00:01[K     |█████████▉                      | 92 kB 6.4 MB/s eta 0:00:01[K     |███████████                     | 102 kB 5.2 MB/s eta 0:00:01[K     |████████████                    | 112 kB 5.2 MB/s eta 0:00:01[K     |█████████████▏                  | 122 kB 5.2 MB/s eta 0:00:01[K     |██████████████▎                 | 133 kB 5.2 MB/s eta 0:00:01[

In [2]:
from PIL import Image
import imdb
import tmdbsimple as tmdb
from io import BytesIO
import requests
import pickle
from pathlib import Path
from tqdm import tqdm
import pandas as pd
from unidecode import unidecode

In [None]:
# Working directories: one pickle per movie, one sub-folder per actor.
movies_dir = Path('movies')
actors_dir = Path('actors')

movies_dir.mkdir(exist_ok=True)
actors_dir.mkdir(exist_ok=True)

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load posters_with_dup.pkl when it comes from a trusted source.
with Path("posters_with_dup.pkl").open("rb") as f:
  posters_with_dup = pickle.load(f)

# Remove only the literal leading "tt". str.strip("tt") treats its argument as
# a set of characters and strips them from both ends of the string.
# (presumably posters_with_dup is a pandas DataFrame with a string "tconst"
# column - TODO confirm against the notebook that produced the pickle)
posters_with_dup["id"] = posters_with_dup["tconst"].str.replace(r"^tt", "", regex=True)

# Unique IDs in sorted order so the download order is the same on every run;
# missing values are dropped because they cannot be looked up.
movie_ids = sorted(posters_with_dup["id"].dropna().unique())

In [None]:
# Shared IMDbPY access object; save_actors_images and get_movie_actors below
# call ia.get_person / ia.get_movie on this module-level instance.
ia = imdb.IMDb()

def save_movie(movie, movies_dir):
  """Pickle ``movie`` into ``movies_dir`` as ``<title>-<movie id>.pkl``.

  Args:
    movie: object supporting ``movie["title"]`` and ``movie.getID()``.
    movies_dir: existing directory (``str`` or ``Path``) to write into.

  Raises:
    OSError: if the file cannot be created (e.g. the directory is missing).
  """
  # Titles such as "Face/Off" contain path separators, which would otherwise
  # be read as a sub-directory and make open() fail. The "-<id>.pkl" suffix is
  # left untouched so lookups by movie ID keep working.
  safe_title = str(movie["title"]).replace("/", "_").replace("\\", "_")
  target = Path(movies_dir) / f'{safe_title}-{movie.getID()}.pkl'

  #save to pickle
  with target.open('wb') as f:
    pickle.dump(movie, f)

def save_actors_images(movie, actors_dir, timeout=30):
    """Store the IMDb person record and headshot of every cast member of ``movie``.

    For each cast member a folder ``<name>-<person id>`` is created inside
    ``actors_dir``. When that folder has no ``actor.pkl`` yet, the person is
    fetched through the module-level ``ia`` client, the full-size headshot (if
    any) is saved as ``imdb.jpg`` and the person object is pickled to
    ``actor.pkl``. An existing ``actor.pkl`` marks the actor as done, so the
    function can be re-run without fetching the same person again.

    Args:
        movie: object supporting ``movie.get('cast')``; each cast entry must
            support ``entry["name"]`` and ``entry.getID()``.
        actors_dir: existing directory (``str`` or ``Path``) holding the
            per-actor folders.
        timeout: seconds to wait for the headshot download before giving up.

    Returns:
        None.
    """
    cast = movie.get('cast')
    if not cast:
        return

    actors_dir = Path(actors_dir)

    for actor in cast:
        actor_id = actor.getID()
        # A "/" (or "\") in the name would be read as a nested path and make
        # mkdir fail; the trailing "-<id>" is kept so folder names still parse.
        safe_name = str(actor["name"]).replace("/", "_").replace("\\", "_")
        actor_dir = actors_dir / f'{safe_name}-{actor_id}'
        actor_dir.mkdir(exist_ok=True)

        person_file = actor_dir / 'actor.pkl'
        if person_file.exists():
            continue

        person = ia.get_person(actor_id)

        try:
            url = person['full-size headshot']

            if url:
                # Without a timeout a stalled connection blocks the whole run.
                response = requests.get(url, timeout=timeout)
                # Turn HTTP error pages into an exception instead of trying to
                # decode them as an image.
                response.raise_for_status()
                img = Image.open(BytesIO(response.content))
                # JPEG cannot store alpha/palette modes; convert those first.
                if img.mode not in ("RGB", "L"):
                    img = img.convert("RGB")
                img.save(actor_dir / 'imdb.jpg')
        except KeyError:
            # The person has no headshot; only the metadata is stored.
            pass
        except OSError:
            # Covers download errors (requests exceptions derive from OSError)
            # as well as undecodable or unwritable images.
            print("OSError")

        # Written even when the image failed, so the actor is not retried.
        with person_file.open('wb') as f:
            pickle.dump(person, f)

def get_movie_actors(movie_id):
    """Fetch one movie from IMDb and store it together with its cast.

    Does nothing when ``movies_dir`` already holds a pickle for ``movie_id``.
    Otherwise the movie is fetched with the module-level ``ia`` client, its
    cast is stored via ``save_actors_images`` and the movie itself via
    ``save_movie``. The movie pickle is written last, so a run that fails
    half-way is retried on the next call.

    Args:
        movie_id: IMDb movie ID without the "tt" prefix.

    Returns:
        None. An ``imdb.IMDbDataAccessError`` is reported with a print and
        swallowed, so one failing movie does not stop a batch.
    """
    # Anchor on the "-" that save_movie writes before the ID. A bare
    # "*{movie_id}.pkl" would also match a longer ID ending in the same digits
    # and wrongly skip this movie.
    if any(movies_dir.glob(f"*-{movie_id}.pkl")):
        return

    try:
        movie = ia.get_movie(movie_id)
        save_actors_images(movie, actors_dir)
        save_movie(movie, movies_dir)
    except imdb.IMDbDataAccessError:
        print("IMDbDataAccessError")

In [None]:
# Fetch every movie in movie_ids one at a time, with a tqdm progress bar.
# get_movie_actors skips IDs that already have a pickle in movies_dir, so
# re-running this cell continues where a previous run stopped.
for movie_id in tqdm(movie_ids):
  get_movie_actors(movie_id)

100%|██████████| 3/3 [05:30<00:00, 110.25s/it]


In [None]:
# Build the actor index from the folder names, which follow "<name>-<imdb id>":
# the text after the last "-" is the ID, everything before it is the name.
#
# The folders are listed once with iterdir() and sorted, so IDs and names are
# guaranteed to line up and the row order is reproducible. The previous
# rglob("")[1:] was evaluated twice and assumed the root folder comes first.
# NOTE(review): an empty glob pattern is not documented usage and some pathlib
# versions reject it with ValueError - confirm on the target Python.
actor_dirs_found = sorted(entry for entry in actors_dir.iterdir() if entry.is_dir())
folder_name_parts = [entry.name.rpartition('-') for entry in actor_dirs_found]

actors_ids = [parts[2] for parts in folder_name_parts]
actors_names = [parts[0] for parts in folder_name_parts]
actors_df = pd.DataFrame({"imdb_id": actors_ids, 'name': actors_names})

In [None]:
# Fetch TMDB actors profile pictures
#
# For every actor in actors_df the IMDb ID is mapped to a TMDB person, and up
# to `limit` profile pictures are saved as 0.jpg, 1.jpg, ... inside the
# actor's existing folder.

limit = 3                 # maximum number of profile pictures per actor
last_success = 0          # rows with a smaller index are skipped (manual resume)
base_url = 'https://image.tmdb.org/t/p/w780'
request_timeout = 30      # seconds to wait for each image download

# The API key is read from a local file so it is not stored in the notebook.
with Path("tmdb.txt").open() as  f:
  tmdb_key = f.read().strip()
tmdb.API_KEY = tmdb_key

for idx, row in actors_df.iterrows():
    if idx < last_success:
        continue

    try:
        # [0] raises IndexError when TMDB has no person for this IMDb ID.
        tmdb_id = tmdb.Find(f'nm{row["imdb_id"]}').info(external_source='imdb_id')["person_results"][0]["id"]
        people = tmdb.People(tmdb_id)
        urls = [profile['file_path'] for profile in people.images()['profiles']][:limit]
        if not urls:
            continue

        # Looked up once per actor; calling people.info() inside the loop
        # repeated the same API request for every image.
        tmdb_name = people.info()["name"]

        for i, url in enumerate(urls):
            try:
                response = requests.get(f'{base_url}{url}', timeout=request_timeout)
                response.raise_for_status()
                img = Image.open(BytesIO(response.content))
                # JPEG cannot store alpha/palette modes; convert those first.
                if img.mode not in ("RGB", "L"):
                    img = img.convert("RGB")
                img.save(f'{actors_dir}/{row["name"]}-{row["imdb_id"]}/{i}.jpg')
            except OSError as err:
                # A single broken download or image should not abort the run
                # (requests exceptions derive from OSError as well).
                print(f'failed image {i} for {row["imdb_id"]} - {idx}: {err}')
                continue
            print('saved', tmdb_name, i)
    except IndexError:
        print(f'failed for {row["imdb_id"]} - {idx}')
        continue

# Create actors data frame

In [None]:
# Rename actor folders to ASCII-only names, then index every image on disk.
actor_images = list(actors_dir.rglob("*.jpg"))
# Only folders that contain at least one image are considered; sorted so the
# processing order is the same on every run.
actor_folders = sorted({img.parent for img in actor_images})

for folder in tqdm(actor_folders):
    # Transliterate the folder name only, never the parent path. unidecode can
    # emit "/" for some symbols, which must not become a path separator.
    ascii_name = unidecode(folder.name).replace("/", "_")
    if ascii_name == folder.name:
        continue

    target = folder.with_name(ascii_name)
    if target.exists():
        # The ASCII folder is already there (e.g. from an earlier run): move
        # the files over instead of failing on a non-empty destination.
        for item in folder.iterdir():
            item.replace(target / item.name)
        folder.rmdir()
    else:
        folder.replace(target)

# Re-scan after renaming so the stored paths point at the new folder names.
actor_images = list(actors_dir.rglob("*.jpg"))
actors_df = pd.DataFrame({"img_path": actor_images})

# Folder names follow "<name>-<imdb id>"; split on the last "-".
actors_df["actor_name"] = actors_df.img_path.apply(lambda x: x.parent.name.rpartition("-")[0])
actors_df["imdb_id"] = actors_df.img_path.apply(lambda x: x.parent.name.rpartition("-")[2])

100%|██████████| 172/172 [00:00<00:00, 151845.99it/s]


In [None]:
# Persist the per-image actor table (img_path, actor_name, imdb_id) to
# actors_df.pkl in the working directory.
actors_df.to_pickle("actors_df.pkl")

In [None]:
# Option for concurrent run for fetching actors images
# (disabled: uncomment to use instead of the sequential download loop).
#
# NOTE(review): `import concurrent` alone does not load the `futures`
# submodule, so the import below names it explicitly. `movie_ids` is the list
# built when posters_with_dup.pkl is loaded; it is reused here instead of
# being rebuilt from a `df` that this notebook never defines.

# from tqdm.auto import tqdm
# import concurrent.futures

# max_workers = 8
# with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
#     for _ in tqdm(executor.map(get_movie_actors, movie_ids), total=len(movie_ids)):
#         pass