## Extract Netflix series info from a program URL

This project can be useful for extracting and analyzing metadata from Netflix series for various purposes such as content analysis, recommendation systems, and data-driven decision making in the entertainment industry.

In [3]:
import requests
from lxml import html
import pandas as pd
import re
import json
#import logging
#logging.basicConfig(level = logging.DEBUG)


# Request headers sent with every page fetch: a desktop Chrome user-agent.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/89.0.4389.90 Safari/537.36'
    ),
}


def parse_page(url, headers, timeout=30):
    """Fetch ``url`` and return its parsed HTML tree.

    Args:
        url: Address of the page to download.
        headers: HTTP request headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The root element built by ``lxml.html.fromstring`` from the response
        body when the status code is 200, otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``
            on network failures, including timeouts.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    if res.status_code != 200:
        return None
    return html.fromstring(res.content)


# def generate_netflix_series_link(series_url):
#     ''' Search and generate Netflix series link '''
#     url = f'https://www.netflix.com/search?q={search_keyword}'
#     # Request url and get program_id - Login Needed to get id
#     res = requests.get(url, headers=headers)
#     if res.status_code == 200:
#         tree = html.fromstring(res.content)
#         # First block program id
#         program_id = tree.xpath("//div[@id='title-card-0-0']//a[@class='slider-refocus']/@href")
#         # Program Card popup
#         series_url = f'https://www.netflix.com/search?q={search_keyword}&jbv={program_id}'
#     else:
#         logging.warning(f"Warning: access denied {res.status_code}")
#     pass
    

def netflix_series_crawl(tree):
    """Collect per-episode metadata from a parsed Netflix series page.

    Args:
        tree: Parsed page exposing an lxml-style ``xpath`` method; the
            elements it returns must expose ``xpath`` as well.

    Returns:
        A list with one dict per episode, in page order, with the keys
        ``program_name``, ``season_no``, ``episode_no``, ``episode_name``,
        ``episode_runtime`` and ``content_name``. Fields that cannot be found
        are empty strings. The list is empty when no season block matches.
    """
    program_data_lst = []

    # The series title is page-level, so look it up once, not once per episode.
    program_name_xp = tree.xpath("//div[@class='title-info']//h1[@class='title-title']/text()")
    program_name = program_name_xp[0] if program_name_xp else ''

    seasons = tree.xpath("//section[@id='section-seasons-and-episodes']//div[@class='season season-active' or @class='season']")

    for season in seasons:
        # Query relative to the matched element. A positional predicate such as
        # ``//div[...][n]`` counts position among the siblings of one parent,
        # so re-querying from the root by index can miss or duplicate rows
        # when the blocks are not all direct siblings.
        for episode in season.xpath(".//li[@class='episode']"):
            season_no = episode_no = episode_name = episode_runtime = content_name = ''

            season_no_xp = episode.xpath(".//img[@class='episode-thumbnail-image']/@alt")
            episode_name_xp = episode.xpath(".//h3[@class='episode-title']/text()")
            episode_runtime_xp = episode.xpath(".//span[@class='episode-runtime']/text()")

            if season_no_xp:
                # Keep only the digits that follow the last "season" in the alt text.
                season_no = re.sub(r'[^0-9]+', '', season_no_xp[0].lower().split('season')[-1])

            if episode_name_xp:
                # The title is split at its first dot into number and name.
                # A title without a dot is kept whole as the name instead of
                # raising IndexError.
                number, dot, name = episode_name_xp[0].partition('.')
                if dot:
                    episode_no = number.strip()
                    episode_name = name.strip()
                else:
                    episode_name = number.strip()

            if episode_runtime_xp:
                episode_runtime = episode_runtime_xp[0].strip()

            # The combined label is only built when every part is known.
            if program_name and season_no and episode_no and episode_name:
                content_name = f'{program_name} S{season_no}:E{episode_no} "{episode_name}"'

            program_data_lst.append({
                'program_name': program_name,
                'season_no': season_no,
                'episode_no': episode_no,
                'episode_name': episode_name,
                'episode_runtime': episode_runtime,
                'content_name': content_name
            })

    return program_data_lst
        

def main():
    """Prompt for a series URL, crawl its episodes and show them as a table.

    Prints "Access Denied" when the page could not be fetched and parsed.
    """
    print("***** Netflix series data crawl from link *****")
    print("Note series link e.g: https://www.netflix.com/search?q=Manifest&jbv=80241318")
    # The series url is pasted in directly so that no login is needed, e.g.
    #   https://www.netflix.com/search?q=Manifest&jbv=80241318
    #   https://www.netflix.com/search?q=wednesday&jbv=81231974
    series_url = input("Paste the series url:")

    page_tree = parse_page(series_url, headers)
    if page_tree is None:
        print("Access Denied")
        return

    episode_rows = netflix_series_crawl(page_tree)

    # Lift the row and column-width limits so the whole table is rendered.
    with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
        display(pd.DataFrame(episode_rows))


if __name__ == '__main__':
    main()

***** Netflix series data crawl from link *****
Note series link e.g: https://www.netflix.com/search?q=Manifest&jbv=80241318
Paste the series url:https://www.netflix.com/search?q=vik&jbv=80189685


Unnamed: 0,program_name,season_no,episode_no,episode_name,episode_runtime,content_name
0,The Witcher,1,1,The End’s Beginning,61m,"The Witcher S1:E1 ""The End’s Beginning"""
1,The Witcher,1,2,Four Marks,61m,"The Witcher S1:E2 ""Four Marks"""
2,The Witcher,1,3,Betrayer Moon,67m,"The Witcher S1:E3 ""Betrayer Moon"""
3,The Witcher,1,4,"Of Banquets, Bastards and Burials",63m,"The Witcher S1:E4 ""Of Banquets, Bastards and Burials"""
4,The Witcher,1,5,Bottled Appetites,60m,"The Witcher S1:E5 ""Bottled Appetites"""
5,The Witcher,1,6,Rare Species,60m,"The Witcher S1:E6 ""Rare Species"""
6,The Witcher,1,7,Before a Fall,48m,"The Witcher S1:E7 ""Before a Fall"""
7,The Witcher,1,8,Much More,60m,"The Witcher S1:E8 ""Much More"""
8,The Witcher,2,1,A Grain of Truth,63m,"The Witcher S2:E1 ""A Grain of Truth"""
9,The Witcher,2,2,Kaer Morhen,59m,"The Witcher S2:E2 ""Kaer Morhen"""


# Block Testing

In [None]:
import requests
from lxml import html
import pandas as pd
import re
import json

#import logging
#logging.basicConfig(level = logging.DEBUG)

In [None]:
# Request headers for the test fetch below: a desktop Chrome user-agent.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/89.0.4389.90 Safari/537.36'
    ),
}

In [None]:
# Download one series page and parse the response body into an lxml tree
# that the following cells query. The status code is not checked here.
url = 'https://www.netflix.com/search?q=Manifest&jbv=80241318'
# url = 'https://www.netflix.com/search?q=wednesday&jbv=81231974'
res = requests.get(url, headers=headers)
tree = html.fromstring(res.content)
# Last expression: the HTTP status code is shown as the cell output.
res.status_code

In [None]:
# Count the season <div> blocks (class 'season' or 'season season-active')
# found inside the seasons-and-episodes section.
seasons_block_xp = tree.xpath("//section[@id='section-seasons-and-episodes']//div[@class='season season-active' or @class='season']")
len(seasons_block_xp)

In [None]:
# Prototype of the season -> episode extraction: builds one dict per episode
# in program_data_lst and shows the result as a DataFrame.
program_data_lst = []
# //section[@id='section-seasons-and-episodes']//div[@class='season season-active' or @class='season']
seasons = tree.xpath("//section[@id='section-seasons-and-episodes']//div[@class='season season-active' or @class='season']")

if seasons:
    # XPath positions are 1-based, hence range(1, n+1).
    for season_no_row in range(1, len(seasons)+1):
        # Episode <li> items under the season block at this position.
        episodes = tree.xpath(f"//section[@id='section-seasons-and-episodes']//div[@class='season season-active' or @class='season'][{season_no_row}]//li[@class='episode']")
        
        for episode_no_row in range(1, len(episodes)+1):            
            # Every field defaults to '' so a missing node yields an empty cell.
            program_name = season_no = episode_no = episode_name = episode_runtime = content_name = ''

            program_name_xp = tree.xpath("//div[@class='title-info']//h1[@class='title-title']/text()")
            season_no_xp = tree.xpath(f"//section[@id='section-seasons-and-episodes']//div[@class='season season-active' or @class='season'][{season_no_row}]//li[@class='episode'][{episode_no_row}]//img[@class='episode-thumbnail-image']/@alt")
            episode_name_xp = tree.xpath(f"//section[@id='section-seasons-and-episodes']//div[@class='season season-active' or @class='season'][{season_no_row}]//li[@class='episode'][{episode_no_row}]//h3[@class='episode-title']/text()")
            episode_runtime_xp = tree.xpath(f"//section[@id='section-seasons-and-episodes']//div[@class='season season-active' or @class='season'][{season_no_row}]//li[@class='episode'][{episode_no_row}]//span[@class='episode-runtime']/text()")

            if program_name_xp:
                program_name = program_name_xp[0]

            if season_no_xp:
                # Keep only the digits after the last "season" in the thumbnail alt text.
                season_no = re.sub('[^0-9]+' ,'' , season_no_xp[0].lower().split('season')[-1])

            if episode_name_xp:
                # The title is split at its first '.' into number and name.
                # NOTE: a title without any '.' makes the [1] index raise IndexError.
                episode_no = episode_name_xp[0].split('.')[0]
                episode_name = episode_name_xp[0].split('.', maxsplit=1)[1].strip()

            if episode_runtime_xp:
                episode_runtime = episode_runtime_xp[0].strip()

            # The combined label is only built when all four parts are non-empty.
            if program_name and season_no and episode_no and episode_name:
                content_name = f'{program_name} S{season_no}:E{episode_no} "{episode_name}"'

            program_data_lst.append({
                'program_name': program_name,
                'season_no': season_no,
                'episode_no': episode_no,
                'episode_name': episode_name,
                'episode_runtime': episode_runtime,
                'content_name': content_name
            })

# Last expression: the collected rows are shown as the cell output.
pd.DataFrame(program_data_lst)

In [None]:
# Every episode-title text under the seasons-and-episodes section.
episode_name_xp = tree.xpath(f"//section[@id='section-seasons-and-episodes']//li[@class='episode']//h3[@class='episode-title']/text()")
episode_name_xp

In [None]:
# id attribute(s) of the <select data-uia='season-selector'> element in the section.
season_xp = tree.xpath("//section[@id='section-seasons-and-episodes']//select[@data-uia='season-selector']/@id")
season_xp

In [None]:
# Text of the <script> tags that mention 'netflix.reactContext'.
script_xp = tree.xpath("//script[contains(text(), 'netflix.reactContext')]/text()")
# Shows the first match; raises IndexError when no script matched.
script_xp[0]

In [None]:
# Episode-title texts, restricted to <li> items inside <ol class='episodes-container'>.
episode_name_xp = tree.xpath("//section[@id='section-seasons-and-episodes']//ol[@class='episodes-container']//li[@class='episode']//h3[@class='episode-title']/text()")
episode_name_xp

In [None]:
# Count the episode <li> items inside <ol class='episodes-container'>.
episode_block_xp = tree.xpath("//section[@id='section-seasons-and-episodes']//ol[@class='episodes-container']//li[@class='episode']")
len(episode_block_xp)

## Backup: old version (had an issue accessing episodes with a plain for loop)

In [None]:
import requests
from lxml import html
import pandas as pd
import re
import json

#import logging
#logging.basicConfig(level = logging.DEBUG)

In [None]:
# Request headers for the backup fetch below: a desktop Chrome user-agent.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/89.0.4389.90 Safari/537.36'
    ),
}

In [None]:
# Download one series page and parse the response body into an lxml tree
# that the following cells query. The status code is not checked here.
url = 'https://www.netflix.com/search?q=Manifest&jbv=80241318'
# url = 'https://www.netflix.com/search?q=wednesday&jbv=81231974'
res = requests.get(url, headers=headers)
tree = html.fromstring(res.content)
# Last expression: the HTTP status code is shown as the cell output.
res.status_code

In [None]:
# Count every episode <li> item under the seasons-and-episodes section.
episode_block_xp = tree.xpath("//section[@id='section-seasons-and-episodes']//li[@class='episode']")
len(episode_block_xp)

In [None]:
# Old extraction approach: one flat loop over all episode <li> items, with
# each field re-queried from the root by position.
program_data_lst = []

# //section[@id='section-seasons-and-episodes']//div[contains(@class, 'season') or contains(@class, 'season season-active')]
# //section[@id='section-seasons-and-episodes']//div[@class='season season-active' or @class='season']
episode_block_xp = tree.xpath("//section[@id='section-seasons-and-episodes']//li[@class='episode']")

if episode_block_xp:
    # NOTE(review): `li[@class='episode'][{block_no}]` is a positional predicate.
    # In XPath it presumably counts position among the siblings under each
    # parent, not across the whole match list, so block_no values beyond one
    # season's episode count would match nothing. That may be the issue this
    # section's heading refers to - confirm.
    for block_no in range(1, len(episode_block_xp)+1):
        # Every field defaults to '' so a missing node yields an empty cell.
        program_name = season_no = episode_no = episode_name = episode_runtime = content_name = ''

        program_name_xp = tree.xpath("//div[@class='title-info']//h1[@class='title-title']/text()")
        season_no_xp = tree.xpath(f"//section[@id='section-seasons-and-episodes']//li[@class='episode'][{block_no}]//img[@class='episode-thumbnail-image']/@alt")
        episode_name_xp = tree.xpath(f"//section[@id='section-seasons-and-episodes']//li[@class='episode'][{block_no}]//h3[@class='episode-title']/text()")
        episode_runtime_xp = tree.xpath(f"//section[@id='section-seasons-and-episodes']//li[@class='episode'][{block_no}]//span[@class='episode-runtime']/text()")

        if program_name_xp:
            program_name = program_name_xp[0]

        if season_no_xp:
            # Keep only the digits after the last "season" in the thumbnail alt text.
            season_no = re.sub('[^0-9]+' ,'' , season_no_xp[0].lower().split('season')[-1])

        if episode_name_xp:
            # The title is split at its first '.' into number and name.
            # NOTE: a title without any '.' makes the [1] index raise IndexError.
            episode_no = episode_name_xp[0].split('.')[0]
            episode_name = episode_name_xp[0].split('.', maxsplit=1)[1].strip()

        if episode_runtime_xp:
            episode_runtime = episode_runtime_xp[0].strip()

        # The combined label is only built when all four parts are non-empty.
        if program_name and season_no and episode_no and episode_name:
            content_name = f'{program_name} S{season_no}:E{episode_no} "{episode_name}"'

        program_data_lst.append({
            'program_name': program_name,
            'season_no': season_no,
            'episode_no': episode_no,
            'episode_name': episode_name,
            'episode_runtime': episode_runtime,
            'content_name': content_name
        })
    
# Last expression: the collected rows are shown as the cell output.
pd.DataFrame(program_data_lst)

In [None]:
# Every episode-title text under the seasons-and-episodes section.
episode_name_xp = tree.xpath(f"//section[@id='section-seasons-and-episodes']//li[@class='episode']//h3[@class='episode-title']/text()")
episode_name_xp

In [None]:
# id attribute(s) of the <select data-uia='season-selector'> element in the section.
season_xp = tree.xpath("//section[@id='section-seasons-and-episodes']//select[@data-uia='season-selector']/@id")
season_xp

In [None]:
# Text of the <script> tags that mention 'netflix.reactContext'.
script_xp = tree.xpath("//script[contains(text(), 'netflix.reactContext')]/text()")
# Shows the first match; raises IndexError when no script matched.
script_xp[0]

In [None]:
# Episode-title texts, restricted to <li> items inside <ol class='episodes-container'>.
episode_name_xp = tree.xpath("//section[@id='section-seasons-and-episodes']//ol[@class='episodes-container']//li[@class='episode']//h3[@class='episode-title']/text()")
episode_name_xp

In [None]:
# Count the episode <li> items inside <ol class='episodes-container'>.
episode_block_xp = tree.xpath("//section[@id='section-seasons-and-episodes']//ol[@class='episodes-container']//li[@class='episode']")
len(episode_block_xp)