# IMDB Search and crawl

In [17]:
import requests
from lxml import html
import pandas as pd
import re
import json
#import logging
#logging.basicConfig(level = logging.DEBUG)

# Request headers handed to parse_page; the user-agent string is that of a
# desktop Chrome browser.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/111.0.0.0 Safari/537.36'
    ),
    'accept-language': 'en-US,en;q=0.9',
}

def parse_page(url, headers, timeout=10):
    """Fetch *url* and return its parsed HTML tree.

    Args:
        url: Address of the page to download.
        headers: HTTP request headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The tree built by ``lxml.html.fromstring`` from the response body, or
        ``None`` when the request fails or the status code is not 200.
    """
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Network failures are reported the same way as a non-200 status so
        # callers only have to check for None.
        return None

    if res.status_code != 200:
        return None
    return html.fromstring(res.content)
    

def imdb_search_res(tree):
    """Extract the title search results from a parsed IMDb "find" page.

    The data is read from the JSON held in the ``<script id="__NEXT_DATA__">``
    element, under ``props.pageProps.titleResults.results``.

    Args:
        tree: Parsed page exposing ``xpath`` (as returned by ``parse_page``).

    Returns:
        A list with one dict per result (keys ``search_res_title``,
        ``search_res_content_type``, ``search_res_release_year`` and
        ``search_res_url``), or ``None`` when the page carries no
        ``__NEXT_DATA__`` script. Text fields missing from a result are
        ``None``; results without an ``id`` are skipped.
    """
    next_data_scripts = tree.xpath("//script[@id='__NEXT_DATA__']/text()")
    if not next_data_scripts:
        return None

    page_data = json.loads(next_data_scripts[0])
    results = page_data['props']['pageProps']['titleResults']['results']

    search_res_data = []
    for result in results:
        title_id = result.get('id')
        if not title_id:
            # Without an id the URL would read ".../title/None/", which
            # cannot be crawled, so the entry is of no use to the caller.
            continue
        search_res_data.append({
            'search_res_title': result.get('titleNameText'),
            'search_res_content_type': result.get('titleTypeText'),
            'search_res_release_year': result.get('titleReleaseText'),
            'search_res_url': f"https://www.imdb.com/title/{title_id}/",
        })
    return search_res_data
        

def _dig(data, *keys):
    """Follow *keys* through nested dicts; return None if a level is absent or null."""
    for key in keys:
        if not isinstance(data, dict):
            return None
        data = data.get(key)
    return data


def imdb_program_crawl(tree):
    """Extract programme metadata from a parsed IMDb title page.

    The data is read from the JSON held in the ``<script id="__NEXT_DATA__">``
    element, under ``props.pageProps.aboveTheFoldData`` and
    ``props.pageProps.mainColumnData``.

    Args:
        tree: Parsed page exposing ``xpath`` (as returned by ``parse_page``).

    Returns:
        A dict with the keys ``titleText``, ``titleType``, ``genres``,
        ``Languages``, ``originalTitleText``, ``YearRange``, ``runtime``,
        ``aggregateRating``, ``voteCount``, ``meterRanking`` and
        ``total_episodes``, or ``None`` when the page carries no
        ``__NEXT_DATA__`` script. A scalar field whose section is missing or
        null is ``None``; ``genres``/``Languages`` fall back to ``''``;
        ``total_episodes`` is ``''`` when there is no episode count.
    """
    next_data_scripts = tree.xpath("//script[@id='__NEXT_DATA__']/text()")
    if not next_data_scripts:
        return None

    page_props = _dig(json.loads(next_data_scripts[0]), 'props', 'pageProps')
    above_fold = _dig(page_props, 'aboveTheFoldData')
    main_column = _dig(page_props, 'mainColumnData')

    release_year = _dig(above_fold, 'releaseYear', 'year')
    end_year = _dig(above_fold, 'releaseYear', 'endYear')

    # The episodes section is only populated for some titles; keep '' as the
    # "no episodes" marker that callers test for truthiness.
    total = _dig(main_column, 'episodes', 'episodes', 'total')
    total_episodes = '' if total is None else total

    genres = _dig(above_fold, 'genres', 'genres') or []
    languages = _dig(main_column, 'spokenLanguages', 'spokenLanguages') or []

    return {
        'titleText': _dig(above_fold, 'titleText', 'text'),
        'titleType': _dig(above_fold, 'titleType', 'text'),
        'genres': ','.join(genre['text'] for genre in genres),
        'Languages': ','.join(language['text'] for language in languages),
        'originalTitleText': _dig(above_fold, 'originalTitleText', 'text'),
        # NOTE(review): endYear is None when the JSON carries a null end year,
        # so this renders as e.g. "2022 - None"; kept as-is for callers.
        'YearRange': f"{release_year} - {end_year}",
        'runtime': _dig(above_fold, 'runtime', 'seconds'),
        'aggregateRating': _dig(above_fold, 'ratingsSummary', 'aggregateRating'),
        'voteCount': _dig(above_fold, 'ratingsSummary', 'voteCount'),
        'meterRanking': _dig(above_fold, 'meterRanking', 'currentRank'),
        'total_episodes': total_episodes,
    }

def display_series_metadata(program_data, series_episodes_data=None):
    """Display the selected programme metadata and, for series, the episode data.

    Relies on the notebook-provided ``display`` function.

    Args:
        program_data: Mapping of metadata fields; only the keys listed in
            ``metadata_keys_lst`` that are present are shown.
        series_episodes_data: Optional episode data. It is shown only when
            ``program_data`` reports a non-empty ``total_episodes``. A flat
            dict of scalar values is rendered as key/value rows; any other
            value is handed to ``pd.DataFrame`` unchanged.
    """
    metadata_keys_lst = ['titleText', 'titleType', 'genres', 'Languages', 'runtime', 'YearRange', 'originalTitleText', 'total_episodes']
    selected_program_data = {key: program_data[key] for key in metadata_keys_lst if key in program_data}

    # program metadata Dataframe print
    with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
        display(pd.DataFrame.from_dict(selected_program_data, orient='index', columns=['Metadata']))

    # 'total_episodes' may be absent, '' or None for titles without episodes,
    # so test truthiness via .get() instead of indexing and comparing to None.
    if not selected_program_data.get('total_episodes') or series_episodes_data is None:
        return

    is_flat_mapping = isinstance(series_episodes_data, dict) and not any(
        isinstance(value, (list, tuple, dict)) for value in series_episodes_data.values()
    )
    if is_flat_mapping:
        # pd.DataFrame() rejects a dict made only of scalars (it needs an
        # index), so show it as key/value rows like the metadata table above.
        episodes_frame = pd.DataFrame.from_dict(series_episodes_data, orient='index', columns=['Metadata'])
    else:
        episodes_frame = pd.DataFrame(series_episodes_data)

    # Series episodes Dataframe print
    with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
        display(episodes_frame)

        
def imdb_series_episodes_crawl(tree):
    """Extract series data from a parsed IMDb episodes page.

    Args:
        tree: Parsed page exposing ``xpath`` (as returned by ``parse_page``).

    Returns:
        A dict with the key ``titleText``, or ``None`` when the page carries
        no ``__NEXT_DATA__`` script. ``titleText`` is ``None`` when the JSON
        has no ``props.pageProps.aboveTheFoldData.titleText.text`` entry.
    """
    next_data_scripts = tree.xpath("//script[@id='__NEXT_DATA__']/text()")
    if not next_data_scripts:
        return None

    # Read from the JSON parsed here; the previous code referenced
    # ``program_json_data``, which is not defined in this function.
    node = json.loads(next_data_scripts[0])
    # NOTE(review): this path is the one used for title pages; confirm that
    # the episodes page exposes the same structure.
    for key in ('props', 'pageProps', 'aboveTheFoldData', 'titleText', 'text'):
        node = node.get(key) if isinstance(node, dict) else None

    return {'titleText': node}

        
def main():
    """Interactive entry point: search IMDb, pick a title, display its metadata.

    Prompts for a search term, lists the matching titles, asks which one to
    crawl and displays the result. For a title that reports episodes, the
    episodes page is crawled as well. Each failure (page not retrievable,
    no results, invalid selection, no data) prints a message and returns.
    """
    # Local import so this block brings its own dependency into scope.
    from urllib.parse import quote_plus

    print("***** IMDB Data crawl crom search *****")
    # quote_plus turns spaces into '+' (as before) and also escapes characters
    # such as '&' or '#' that would otherwise corrupt the query string.
    search_keyword = quote_plus(input("Enter Movie/Series Name:"))
    search_url = f"https://www.imdb.com/find?q={search_keyword}"

    search_tree = parse_page(search_url, headers)
    if search_tree is None:
        print("Access Denied")
        return

    search_res_data = imdb_search_res(search_tree)
    if not search_res_data:
        print("No search results found")
        return

    print("", "*** Search Result Titles ***", sep="\n")
    for row, search_res in enumerate(search_res_data, start=1):
        print(f"{row}. {search_res['search_res_title']}")

    print("-----------------------------")
    try:
        selection = int(input("Select the number from above list:"))
    except ValueError:
        print("Invalid selection")
        return
    # Reject 0 and negatives explicitly: they would silently index from the
    # end of the list instead of failing.
    if not 1 <= selection <= len(search_res_data):
        print("Invalid selection")
        return
    program_url = search_res_data[selection - 1]['search_res_url']

    # Crawl program data
    program_tree = parse_page(program_url, headers)
    if program_tree is None:
        print("Access Denied")
        return
    program_data = imdb_program_crawl(program_tree)
    if program_data is None:
        print("No program data found")
        return

    series_episodes_data = None
    if program_data.get('total_episodes'):
        # Crawl Series episodes data
        series_tree = parse_page(program_url + 'episodes/', headers)
        if series_tree is not None:
            series_episodes_data = imdb_series_episodes_crawl(series_tree)

    # Display program metadata (plus episodes data when it was crawled)
    display_series_metadata(program_data, series_episodes_data)


if __name__ == '__main__':
    main()

***** IMDB Data crawl crom search *****
Enter Movie/Series Name:wednesday

*** Search Result Titles ***
1. Wednesday
2. Big Wednesday
3. A Wednesday
4. aTypical Wednesday
5. Fireworks Wednesday
-----------------------------
Select the number from above list:1


Unnamed: 0,Metadata
titleText,Wednesday
titleType,TV Series
genres,"Comedy,Crime,Fantasy,Mystery"
Languages,English
runtime,2700
YearRange,2022 - None
originalTitleText,Wednesday
total_episodes,9


# Testing

In [2]:
import requests
from lxml import html
import pandas as pd
import re
import json

#import logging
#logging.basicConfig(level = logging.DEBUG)

In [3]:
# Request headers used by the requests.get calls in the cells below; the
# user-agent string is that of a desktop Chrome browser.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/89.0.4389.90 Safari/537.36'
    ),
    'accept-language': 'en-US,en;q=0.9',
}

In [4]:
home_url = 'https://www.imdb.com/'

In [5]:
# Prompt for a search term and build the IMDb "find" URL from it.
print("***** IMDB Data crawl crom search *****")
# Spaces become '+' so the term can sit in the query string; no other
# characters are escaped here.
search_keyword = input("Enter Movie/Series Name:").replace(' ', '+')
url = f"https://www.imdb.com/find?q={search_keyword}"
# Trailing expression: the notebook echoes the URL as the cell output.
url

***** IMDB Data crawl crom search *****
Enter Movie/Series Name:wednesday


'https://www.imdb.com/find?q=wednesday'

In [6]:
# Download the search page and parse the response body into an lxml tree.
res = requests.get(url, headers=headers)
tree = html.fromstring(res.content)
# Trailing expression: the notebook echoes the HTTP status code.
res.status_code

200

In [7]:
# Select the text of the <script id="__NEXT_DATA__"> element and decode the
# first match as JSON.
search_res_script_json_data_xp =  tree.xpath("//script[@id='__NEXT_DATA__']/text()")
search_res_json_data = json.loads(search_res_script_json_data_xp[0])

In [8]:
search_res_json_data['props']['pageProps']['titleResults']['results'][0]

{'id': 'tt13443470',
 'titleNameText': 'Wednesday',
 'titleReleaseText': '2022– ',
 'titleTypeText': 'TV Series',
 'titlePosterImageModel': {'url': 'https://m.media-amazon.com/images/M/MV5BM2ZmMjEyZmYtOGM4YS00YTNhLWE3ZDMtNzQxM2RhNjBlODIyXkEyXkFqcGdeQXVyMTUzMTg2ODkz._V1_.jpg',
  'maxHeight': 2222,
  'maxWidth': 1500,
  'caption': 'Jenna Ortega in Wednesday (2022)'},
 'topCredits': ['Jenna Ortega', 'Hunter Doohan'],
 'imageType': 'tvSeries'}

In [9]:
# Build one summary dict per title hit in the decoded search JSON. The
# result list is iterated directly rather than re-resolving the full path
# by index for every field.
search_res_data = [
    {
        'search_res_title': result['titleNameText'],
        'search_res_content_type': result['titleTypeText'],
        'search_res_release_year': result['titleReleaseText'],
        'search_res_url': f"https://www.imdb.com/title/{result['id']}/",
    }
    for result in search_res_json_data['props']['pageProps']['titleResults']['results']
]

# Trailing expression: the notebook echoes the list as the cell output.
search_res_data

[{'search_res_title': 'Wednesday',
  'search_res_content_type': 'TV Series',
  'search_res_release_year': '2022– ',
  'search_res_url': 'https://www.imdb.com/title/tt13443470/'},
 {'search_res_title': 'Big Wednesday',
  'search_res_content_type': '',
  'search_res_release_year': '1978',
  'search_res_url': 'https://www.imdb.com/title/tt0077235/'},
 {'search_res_title': 'A Wednesday',
  'search_res_content_type': '',
  'search_res_release_year': '2008',
  'search_res_url': 'https://www.imdb.com/title/tt1280558/'},
 {'search_res_title': 'aTypical Wednesday',
  'search_res_content_type': '',
  'search_res_release_year': '2020',
  'search_res_url': 'https://www.imdb.com/title/tt9324808/'},
 {'search_res_title': 'Fireworks Wednesday',
  'search_res_content_type': '',
  'search_res_release_year': '2006',
  'search_res_url': 'https://www.imdb.com/title/tt0845439/'}]

### Program crawl - Script JSON Data 

In [10]:
# Download a fixed IMDb title page and parse it into an lxml tree.
url = 'https://www.imdb.com/title/tt6468322/?ref_=fn_al_tt_1'
# url = 'https://www.netflix.com/search?q=wednesday&jbv=81231974'
res = requests.get(url, headers=headers)
tree = html.fromstring(res.content)
# Trailing expression: the notebook echoes the HTTP status code.
res.status_code

200

In [11]:
# Select the text of the <script id="__NEXT_DATA__"> element and decode the
# first match as JSON.
program_script_json_data_xp =  tree.xpath("//script[@id='__NEXT_DATA__']/text()")
program_json_data = json.loads(program_script_json_data_xp[0])

In [12]:
# Resolve the two sections every field below is read from once, instead of
# repeating the full path on each line.
above_fold = program_json_data['props']['pageProps']['aboveTheFoldData']
main_column = program_json_data['props']['pageProps']['mainColumnData']

releaseYear = above_fold['releaseYear']['year']
endYear = above_fold['releaseYear']['endYear']
YearRange = f"{releaseYear} - {endYear}"

# Each key is listed once; the repeated 'meterRanking' and 'runtime' entries
# only overwrote themselves with the same value.
program_data_dict = {
    'titleText': above_fold['titleText']['text'],
    'titleType': above_fold['titleType']['text'],
    'genres': ','.join(genre['text'] for genre in above_fold['genres']['genres']),
    'Languages': ','.join(language['text'] for language in main_column['spokenLanguages']['spokenLanguages']),
    'originalTitleText': above_fold['originalTitleText']['text'],
    'YearRange': YearRange,
    'runtime': above_fold['runtime']['seconds'],
    'aggregateRating': above_fold['ratingsSummary']['aggregateRating'],
    'voteCount': above_fold['ratingsSummary']['voteCount'],
    'meterRanking': above_fold['meterRanking']['currentRank'],
}

# Trailing expression: the notebook renders the frame as the cell output.
pd.DataFrame.from_dict(program_data_dict, orient='index', columns=['Metadata'])

Unnamed: 0,Metadata
titleText,Money Heist
titleType,TV Series
genres,"Action,Crime,Drama,Mystery,Thriller"
Languages,"Spanish,Russian,Serbian,English"
originalTitleText,La casa de papel
YearRange,2017 - 2021
runtime,4200
aggregateRating,8.2
voteCount,491631
meterRanking,251


In [13]:
program_json_data['props']['pageProps']['mainColumnData']['spokenLanguages']['spokenLanguages']

[{'id': 'es', 'text': 'Spanish', '__typename': 'SpokenLanguage'},
 {'id': 'ru', 'text': 'Russian', '__typename': 'SpokenLanguage'},
 {'id': 'sr', 'text': 'Serbian', '__typename': 'SpokenLanguage'},
 {'id': 'en', 'text': 'English', '__typename': 'SpokenLanguage'}]

## IMDB Datasets

In [None]:
from pathlib import Path

# Locate the Datasets folder that sits next to the current working directory.
CURR_DIR = Path().absolute()
PARENT_DIR = CURR_DIR.parent.absolute()

# Tab-separated file; the literal \N marker is read as a missing value.
df = pd.read_csv(
    PARENT_DIR.joinpath('Datasets/data.tsv'),
    sep='\t',
    encoding='UTF-8',
    na_values='\\N',
)
# Trailing expression: the notebook renders the frame as the cell output.
df

In [None]:
df['titleType'].value_counts()

In [None]:
from pathlib import Path
import pandas as pd
import re

from tqdm.auto import tqdm
tqdm.pandas()

# Current exec dir
CURR_DIR = Path().absolute()
# Parent dir
parent_dir = CURR_DIR.parent.absolute()
# Dataset DIR
dataset_dir = parent_dir / 'Datasets'


def _read_imdb_tsv(file_name):
    """Load one tab-separated IMDb dump from ``dataset_dir``."""
    # The \N marker is declared as a missing value, matching the data.tsv
    # load earlier in this notebook. The previous ``na_filter='\\n'`` passed
    # a string to a boolean switch, so \N was never treated as missing.
    return pd.read_csv(dataset_dir / file_name, sep='\t', na_values='\\N')


imdb_name_basics_df = _read_imdb_tsv("imdb_name_basics.tsv")
imdb_title_akas_df = _read_imdb_tsv("imdb_title_akas.tsv")
imdb_title_basics_df = _read_imdb_tsv("imdb_title_basics.tsv")
imdb_title_crew_df = _read_imdb_tsv("imdb_title_crew.tsv")
imdb_title_episode_df = _read_imdb_tsv("imdb_title_episode.tsv")
imdb_title_principals_df = _read_imdb_tsv("imdb_title_principals.tsv")
imdb_title_ratings_df = _read_imdb_tsv("imdb_title_ratings.tsv")

# with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
#     display(df)

In [None]:
import ipywidgets as widgets
from ipywidgets import interactive

tsv_paths = list(dataset_dir.glob("*.tsv"))
files_path_list = [str(path) for path in tsv_paths]
# Take the file name from the Path object: splitting the string on '\\' only
# works with Windows separators and returns the whole path elsewhere.
all_df_names = [path.name.split('.')[0] for path in tsv_paths]


def on_value_change(change):
    """Show the first 10 rows of the DataFrame selected in the widget.

    Args:
        change: Change notification from the widget; ``change['new']`` is
            used as an index into ``all_df_names``.
    """
    # Set global df if want to check selected dataframe
    global df

    dynamic_df_file_name = all_df_names[change['new']] + '_df'
    # Look the frame up by name rather than eval(): the result is the same
    # for a plain variable name, and no text derived from file names is ever
    # executed as code.
    df = globals()[dynamic_df_file_name].head(10).copy()

    with report_output:
        report_output.clear_output()
        display(f"File Name: {dynamic_df_file_name}")
        display(df)


w_int_box = widgets.BoundedIntText(value=0, min=0, max=len(all_df_names)-1, step=1, description='Data Frame: ')

report_output = widgets.Output()

w_int_box.observe(on_value_change, names='value')
display(w_int_box)
display(report_output)

In [None]:
# Title and region info
imdb_title_akas_df = ['titleId', 'ordering', 'title', 'region', 'language', 'types', 'attributes', 'isOriginalTitle']
# content type, primary title, genres
imdb_title_basics_df = ['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres']
# Episode info
imdb_title_episode_df = ['tconst', 'parentTconst', 'seasonNumber', 'episodeNumber']

In [None]:
df2 = imdb_title_akas_df[imdb_title_akas_df['region'] == 'IN'].copy()

In [None]:
# Show up to 10000 rows of df2 ordered by titleId with display limits lifted.
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    # sort_values returns a new frame, so its result has to be the one that
    # is displayed; calling it as a bare statement left the output unsorted.
    display(df2.sort_values('titleId').head(10000))

In [None]:
df2.fillna('', inplace=True)

In [None]:
df2[df2['title'].str.contains(r'^Billi')]