In [1]:
import os
import json
import sys
import requests
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Select the streaming platforms you want.
# Keys are the platform names used throughout the notebook; values are the
# codes that setStreaming places in the query's `packages` filter.
streamings = {
    'amazon':'amp',
    'disney':'dnp',
    'hbo':'hbm',
    'netflix':'nfx',
    'paramount':'pmp',
    'appletv':'atp'
}

In [3]:
# GraphQL endpoint that every title request is POSTed to.
url = "https://apis.justwatch.com/graphql"

In [4]:
# HTTP headers sent with each POST to `url`: a JSON body, a desktop Chrome
# user-agent string, and the accepted response encodings.
headers = {
    "content-type": "application/json",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "accept-encoding": "gzip, deflate, br"
}

In [5]:
# Load the JSON request body template. Later cells fill in its 'query' entry
# and adjust values under 'variables' before each request.
with open('src/postData.json','r',encoding='utf-8') as file:
    postData = json.load(file)

In [6]:
# Read the GraphQL query text that is sent inside the request body.
with open('src/query.graphql', 'r', encoding='utf-8') as file:
    query = file.read()

In [7]:
# Attach the query text to the request body under the 'query' key.
postData['query'] = query

In [8]:
def setStreaming(plataform):
    """Point the shared request payload at a single streaming platform.

    Looks up the platform's code in `streamings` and stores it, as a
    one-element list, in the `packages` entry of the payload's
    `popularTitlesFilter` variables. The module-level `postData` is
    modified in place; nothing is returned.

    Raises:
        KeyError: if `plataform` is not a key of `streamings`.
    """
    package_code = streamings[plataform]
    title_filter = postData['variables']['popularTitlesFilter']
    title_filter['packages'] = [package_code]

In [9]:
# Release-year boundaries. getPlataformTitles runs one query per consecutive
# pair, filtering from (boundary + 1) up to the next boundary.
releasedYear =  [1899, 1950, 1980, 1990, 2000, 2010, 2012, 2014, 2016, 2018, 2020, 2022, 2023]

In [10]:
def getTitles(plataform, cursor = None, titles = None, start = True, timeout = 60):
    """Get all titles available on a platform for the current query filter.

    Pages through the `popularTitles` connection of the GraphQL response,
    following `pageInfo.endCursor` for as long as `pageInfo.hasNextPage`
    is true. Pagination is iterative, so the number of pages is not
    limited by Python's recursion depth.

    Args:
        plataform: key of `streamings` selecting the platform to query.
        cursor: cursor to resume from; only honoured when `start` is False.
        titles: optional list that the fetched edges are appended to in
            place. A new list is created when it is None.
        start: when True (the default) pagination begins at the first page
            and `cursor` is ignored.
        timeout: seconds to wait for each HTTP response before giving up.

    Returns:
        The list of title edges (the same object as `titles` when given).

    Raises:
        requests.ConnectionError: if the server answers with a status
            other than 200.
        requests.Timeout: if a response does not arrive within `timeout`.
    """

    # `is None` rather than falsiness: a caller-supplied empty list must be
    # filled in place, not silently swapped for a new one.
    if titles is None:
        titles = []

    setStreaming(plataform)

    # A cursor is only used when explicitly resuming (start=False).
    after = cursor if (cursor and not start) else ""

    while True:
        postData['variables']['popularAfterCursor'] = after

        ret = requests.post(url, data=json.dumps(postData), headers=headers, timeout=timeout)
        if ret.status_code != 200:
            raise requests.ConnectionError('Connection failed')

        results = ret.json()['data']['popularTitles']
        titles.extend(results['edges'])

        pageInfo = results['pageInfo']
        if not pageInfo['hasNextPage']:
            break

        after = pageInfo['endCursor']
        if not after:
            # Without a cursor the next request would start over from the
            # first page and never terminate.
            break

    return titles

In [11]:
def titleContentParser(title):
    """Flatten one raw title edge into a title record and its credit rows.

    Args:
        title: a mapping with a 'node' entry holding the title's 'id',
            'objectType', optional 'totalSeasonCount' and a 'content'
            mapping with the descriptive fields read below.

    Returns:
        A `(content, credits)` tuple: `content` is a flat dict describing
        the title ('seasons' is None when the node has no
        'totalSeasonCount'); `credits` is a list with one dict per entry
        of the title's credits, each carrying the title's id.
    """
    node = title['node']
    title_id = node['id']
    details = node['content']

    record = {
        'id': title_id,
        'title': details['title'],
        'type': node['objectType'],
        'description': details['shortDescription'],
        'release_year': details['originalReleaseYear'],
        'age_certification': details['ageCertification'],
        'runtime': details['runtime'],
        'genres': [genre['technicalName'] for genre in details['genres']],
        'production_countries': details['productionCountries'],
        'seasons': node.get('totalSeasonCount'),
        'imdb_id': details['externalIds']['imdbId'],
    }

    # Output column -> key inside the 'scoring' mapping, in output order.
    scoring = details['scoring']
    score_columns = (
        ('imdb_score', 'imdbScore'),
        ('imdb_votes', 'imdbVotes'),
        ('tmdb_popularity', 'tmdbPopularity'),
        ('tmdb_score', 'tmdbScore'),
    )
    for column, source_key in score_columns:
        record[column] = scoring[source_key]

    cast_rows = []
    for person in details['credits']:
        cast_rows.append({
            'person_id': person['personId'],
            'id': title_id,
            'name': person['name'],
            'character': person['characterName'],
            'role': person['role'],
        })

    return record, cast_rows

In [12]:
def saveData(data, save = True, path = ''):
    """Parse raw title edges and optionally write them out as CSV files.

    Args:
        data: iterable of raw title edges, each accepted by
            titleContentParser.
        save: when True, write 'titles.csv' and 'credits.csv'.
        path: string prepended verbatim to both file names, so a directory
            must include its trailing separator.

    Returns:
        A `(titles, credits)` tuple of lists of dicts. It is returned
        whether or not the files are written, so the parsed data can be
        inspected with save=False.
    """
    parsed = [titleContentParser(entry) for entry in data]
    titles = [content for content, _ in parsed]
    credits = [row for _, rows in parsed for row in rows]

    if not save:
        return titles, credits

    # Build both frames before writing either file.
    titlesDf = pd.DataFrame(titles)
    creditsDf = pd.DataFrame(credits)
    titlesDf.to_csv(path+'titles.csv', index=False)
    creditsDf.to_csv(path+'credits.csv', index=False)

    return titles, credits

In [13]:
def getPlataformTitles(plataform, save=True):
    """Get all titles available on a given platform.

    Runs one paginated query per release-year range derived from
    `releasedYear`, then parses the collected edges and, when `save` is
    True, writes them as CSV files under '../data/<plataform>/'.

    The module-level `postData` is modified: its 'releaseYear' filter is
    left set to the last range queried.

    Args:
        plataform: key of `streamings` selecting the platform.
        save: when True, create the output directory if needed and write
            the CSV files. When False nothing is written.

    Returns:
        A `(titles, credits)` tuple of lists of dicts, as returned by
        saveData, so the data can be inspected even with save=False.
    """

    raw = []
    # Consecutive boundaries form non-overlapping ranges: each one starts
    # the year after the previous boundary.
    # NOTE(review): presumably split by year to keep each query's result
    # set small — confirm against the API's limits.
    for lower, upper in zip(releasedYear, releasedYear[1:]):
        filterRange = {'min': lower + 1, 'max': upper}

        postData['variables']['popularTitlesFilter']['releaseYear'] = filterRange  # Set the filter

        raw.extend(getTitles(plataform=plataform))

    # Always bound, so save=False no longer raises UnboundLocalError below.
    filePath = ''
    if save:
        filePath = f'../data/{plataform}/' # Force the data into the 'data' directory
        # makedirs also creates a missing '../data' parent and tolerates an
        # existing directory.
        os.makedirs(filePath, exist_ok=True)

    titles, credits = saveData(data=raw, save=save, path=filePath)

    return titles, credits # Returned so the data can be inspected when save=False

In [14]:
def main():
    """Scrape every platform in `streamings`, save it and print the counts.

    For each platform the titles and credits are fetched and written to
    disk by getPlataformTitles (save=True), then the number of titles and
    credits is printed. Progress is shown with a tqdm bar.
    """
    data = {}  # Per-platform results, kept in case the data is needed afterwards
    progress = tqdm(streamings.keys())
    for plataform in progress:
        platformTitles, platformCredits = getPlataformTitles(plataform=plataform, save=True)
        data[plataform] = dict(titles=platformTitles, credits=platformCredits)

        print(f"Titles in {plataform}: {len(data[plataform]['titles'])}")
        print(f"Credits in {plataform}: {len(data[plataform]['credits'])}",end='\n\n')
        
            

In [15]:
# Run the scrape for every configured platform (writes CSVs under ../data/).
main()

  0%|          | 0/6 [00:00<?, ?it/s]

Titles in amazon: 10877
Credits in amazon: 140657

Titles in disney: 1855
Credits in disney: 30693

Titles in hbo: 3032
Credits in hbo: 64926

Titles in netflix: 6135
Credits in netflix: 81402

Titles in paramount: 3181
Credits in paramount: 51214

Titles in appletv: 171
Credits in appletv: 1869

