We use the imdb api (see [documentation](https://imdbapi.dev/))in order to retrieve some basic informations about the titles represented in the time series dataset.

In [1]:
import requests
import csv
from pathlib import Path
from itertools import batched
from tqdm import tqdm

In [2]:
data_dir = Path("../../data")
results_dir = Path("../../results")
data_path = data_dir / 'imdb_ts.csv'

In [3]:
base_url = 'https://api.imdbapi.dev/'
endpoint = "titles/"

In [4]:
title_id = 'tt0062622'

## Example with one record

In [5]:
response = requests.get(base_url + endpoint + title_id)
response.status_code

200

In [6]:
data = response.json()
data

{'id': 'tt0062622',
 'type': 'movie',
 'primaryTitle': '2001: A Space Odyssey',
 'primaryImage': {'url': 'https://m.media-amazon.com/images/M/MV5BNjU0NDFkMTQtZWY5OS00MmZhLTg3Y2QtZmJhMzMzMWYyYjc2XkEyXkFqcGc@._V1_.jpg',
  'width': 1380,
  'height': 2044},
 'startYear': 1968,
 'runtimeSeconds': 8940,
 'genres': ['Adventure', 'Sci-Fi'],
 'rating': {'aggregateRating': 8.3, 'voteCount': 758947},
 'metacritic': {'score': 84, 'reviewCount': 25},
 'plot': 'When a mysterious artifact is uncovered on the Moon, a spacecraft manned by two humans and one supercomputer is sent to Jupiter to find its origins.',
 'originCountries': [{'code': 'GB', 'name': 'United Kingdom'},
  {'code': 'US', 'name': 'United States'}],
 'spokenLanguages': [{'code': 'eng', 'name': 'English'},
  {'code': 'rus', 'name': 'Russian'},
  {'code': 'fra', 'name': 'French'}]}

In [7]:
movie = dict(
    id=title_id,
    title=data.get('primaryTitle'),
    startYear=data.get('startYear'),
    runtimeMinutes=round(data.get('runtimeSeconds') / 60),
    numVotes=data.get('rating', {}).get('voteCount'),
    countryOfOrigin=','.join(country.get('code', '') for country in data.get('originCountries', []))
)

In [8]:
movie

{'id': 'tt0062622',
 'title': '2001: A Space Odyssey',
 'startYear': 1968,
 'runtimeMinutes': 149,
 'numVotes': 758947,
 'countryOfOrigin': 'GB,US'}

It is also possible to get multiple (up to 10) records at the same time.

In [9]:
ids = ['tt0062622','tt0064816']
response = requests.get(base_url + 'titles:batchGet?titleIds=' + '&titleIds='.join(ids))

In [10]:
response.status_code
response.json()

{'titles': [{'id': 'tt0062622',
   'type': 'movie',
   'primaryTitle': '2001: A Space Odyssey',
   'primaryImage': {'url': 'https://m.media-amazon.com/images/M/MV5BNjU0NDFkMTQtZWY5OS00MmZhLTg3Y2QtZmJhMzMzMWYyYjc2XkEyXkFqcGc@._V1_.jpg',
    'width': 1380,
    'height': 2044},
   'startYear': 1968,
   'runtimeSeconds': 8940,
   'genres': ['Adventure', 'Sci-Fi'],
   'rating': {'aggregateRating': 8.3, 'voteCount': 758947},
   'metacritic': {'score': 84, 'reviewCount': 25},
   'plot': 'When a mysterious artifact is uncovered on the Moon, a spacecraft manned by two humans and one supercomputer is sent to Jupiter to find its origins.',
   'originCountries': [{'code': 'GB', 'name': 'United Kingdom'},
    {'code': 'US', 'name': 'United States'}],
   'spokenLanguages': [{'code': 'eng', 'name': 'English'},
    {'code': 'rus', 'name': 'Russian'},
    {'code': 'fra', 'name': 'French'}]},
  {'id': 'tt0064816',
   'type': 'movie',
   'primaryTitle': 'The Swimming Pool',
   'originalTitle': 'La piscin

Also we query for informations relative to awards and nominations. In this case we only check if the json of the response has some content in order to determine if the title was awarded nominations or awards.

In [11]:
response = requests.get(base_url + endpoint + title_id + '/awardNominations')
response.status_code

200

In [12]:
response.json()

{'stats': {'nominationCount': 14, 'winCount': 18},
 'awardNominations': [{'nominees': [{'id': 'nm0706937',
     'displayName': 'Douglas Rain',
     'primaryImage': {'url': 'https://m.media-amazon.com/images/M/MV5BOGRlNDUxNDAtY2YwMS00N2YyLTk4OGEtZGZjZDJjMzg3Y2NhXkEyXkFqcGc@._V1_.jpg',
      'width': 658,
      'height': 819},
     'birthDate': {'year': 1928, 'month': 5, 'day': 9},
     'deathDate': {'year': 2018, 'month': 11, 'day': 11}}],
   'event': {'id': 'ev0002704',
    'name': 'Online Film & Television Association'},
   'year': 2024,
   'text': 'OFTA Film Hall of Fame',
   'category': 'Character',
   'isWinner': True,
   'winnerRank': 1},
  {'event': {'id': 'ev0000004',
    'name': 'Academy of Science Fiction, Fantasy & Horror Films, USA'},
   'year': 2024,
   'text': 'Saturn Award',
   'category': 'Best Film Home Media Collection Release'},
  {'nominees': [{'id': 'nm0000040',
     'displayName': 'Stanley Kubrick',
     'primaryImage': {'url': 'https://m.media-amazon.com/images/M/

## And now the rest

In [13]:
title_ids = []

with open(data_path) as infile:
    reader = csv.DictReader(infile)
    for row in reader:
        title_ids.append(row.get('id'))

In [14]:
len(title_ids)

1134

In [15]:
def get_movies_data(title_ids):
    movies = []
    for batch in tqdm(list(batched(title_ids, 10))):
        response = requests.get(base_url + 'titles:batchGet?titleIds=' + '&titleIds='.join(batch))
        if response.ok:
            movies += [dict(
                id=movie.get('id'),
                title=movie.get('primaryTitle'),
                startYear=movie.get('startYear'),
                runtimeMinutes=round(movie.get('runtimeSeconds') / 60 if data.get('runtimeSeconds') else None),
                numVotes=movie.get('rating', {}).get('voteCount'),
                countryOfOrigin=','.join(country.get('code', '') for country in movie.get('originCountries', []))
                ) for movie in response.json().get('titles', [])]
    return movies

In [16]:
movies = get_movies_data(title_ids)

100%|██████████| 114/114 [00:23<00:00,  4.77it/s]


In [17]:
len(movies)

1134

In [18]:
def get_awards_info(movies):
    for movie in tqdm(movies):
        response = requests.get(base_url + endpoint + movie['id'] + '/awardNominations')
        if response.ok:
            movie['awardsOrNominations'] = True if response.json() else False
    return movies

In [None]:
movies = get_awards_info(movies)
movies

100%|██████████| 1134/1134 [18:39<00:00,  1.01it/s]


[{'id': 'tt0359950',
  'title': 'The Secret Life of Walter Mitty',
  'startYear': 2013,
  'runtimeMinutes': 114,
  'numVotes': 358680,
  'countryOfOrigin': 'US,GB',
  'awardsOrNominations': True},
 {'id': 'tt0369610',
  'title': 'Jurassic World',
  'startYear': 2015,
  'runtimeMinutes': 124,
  'numVotes': 710065,
  'countryOfOrigin': 'JP,US',
  'awardsOrNominations': True},
 {'id': 'tt0062622',
  'title': '2001: A Space Odyssey',
  'startYear': 1968,
  'runtimeMinutes': 149,
  'numVotes': 758947,
  'countryOfOrigin': 'GB,US',
  'awardsOrNominations': True},
 {'id': 'tt0435651',
  'title': 'The Giver',
  'startYear': 2014,
  'runtimeMinutes': 97,
  'numVotes': 128925,
  'countryOfOrigin': 'CA,ZA,US',
  'awardsOrNominations': True},
 {'id': 'tt0437086',
  'title': 'Alita: Battle Angel',
  'startYear': 2019,
  'runtimeMinutes': 122,
  'numVotes': 314534,
  'countryOfOrigin': 'JP,CA,US',
  'awardsOrNominations': True},
 {'id': 'tt0088178',
  'title': 'Stop Making Sense',
  'startYear': 198

In [26]:
with open(results_dir / 'ts_additional_feats.csv', 'w') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=movies[0].keys())
    writer.writerows(movies)