## Netflix Program ID Scraper using Selenium, with MongoDB storage
- Uses Selenium to search Netflix and read the popup cards to get each program's unique ID, then stores it in MongoDB

In [None]:
import os
import random
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import quote

import pandas as pd
from lxml import html
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

## MongoDB Connect
from pymongo import MongoClient

# Wall-clock start, used at the end of the script to report total run time.
start_time = time.time()
print("***** Netflix program id Crawl *****")

# Connect to the local MongoDB server and select the database/collection
# that crawled program ids are written to.
client = MongoClient('mongodb://localhost:27017/')
db = client['netflix']
collection = db['program_name']

# Paths: chromedriver.exe and the input CSV are looked up in the current
# working directory.
CURR_DIR = Path().absolute()
# Selenium documents Service's executable path as a string, so convert the
# Path explicitly instead of relying on implicit path-like support.
service_chrome_driver_path = Service(str(CURR_DIR / "chromedriver.exe"))

# Define Browser Options
chrome_opt = webdriver.ChromeOptions()
# chrome_opt.add_argument("--headless")  # Hides the browser window
chrome_opt.add_argument("--disable-notifications")
# Disable image loading in Blink; only hrefs and text are read from the page.
chrome_opt.add_argument('--blink-settings=imagesEnabled=false')

# Create a new Chrome browser instance
driver = webdriver.Chrome(service=service_chrome_driver_path, options=chrome_opt)
driver.maximize_window()  # For maximizing window
driver.implicitly_wait(10)  # Element lookups wait up to 10 seconds before failing

# Read the program names (search keywords) from the csv file
crawling_program_names_lst = pd.read_csv(
    CURR_DIR / "Netflix_Program_Names_all.csv",
    usecols=['program_name'],
)['program_name'].tolist()

# Credentials are taken from the environment when set so real secrets do not
# have to be committed with the script; the placeholders remain the defaults.
netflix_login_id = os.environ.get("NETFLIX_LOGIN_ID", "user@google.com")
netflix_pass = os.environ.get("NETFLIX_PASS", "pass1234")

# Navigate to the login page
driver.get('https://www.netflix.com/in/login')

time.sleep(3)  # Fixed pause so the login form can finish loading

# Locate the email and password fields and enter the login credentials
email_field = driver.find_element("xpath", "//form//input[@id='id_userLoginId']")
password_field = driver.find_element("xpath", "//form//input[@id='id_password']")
email_field.send_keys(netflix_login_id)
password_field.send_keys(netflix_pass)

# Locate the login button and click it
login_button = driver.find_element("xpath", "//form//button[contains(@class, 'login-button')]")
login_button.click()

time.sleep(5)  # Fixed pause while the login is submitted and the profile list loads
# Select the last profile in the profile list
profile_select = driver.find_element("xpath", "//div[@class='list-profiles']//li[@class='profile'][last()]//a")
profile_select.click()

time.sleep(4)  # Fixed pause while the selected profile's home page loads

def crawl_netflix_id(driver, keyword, collection, row_count):
    """Search Netflix for ``keyword`` and store newly found program ids.

    The search results page is opened directly by URL (no typing into the
    search box). Every result card that carries a ``/watch/<id>`` link yields
    one document, which is inserted into ``collection`` unless a document with
    the same ``netflix_id`` is already stored.

    Args:
        driver: Logged-in Selenium WebDriver used to load the search page.
        keyword: Program name to search for. Non-text or blank values (for
            example NaN from an empty CSV cell) are skipped.
        collection: MongoDB collection queried for existing ids and receiving
            the new documents.
        row_count: 1-based position of ``keyword`` in the input list; only
            used in the progress messages.

    Returns:
        None. Results are written to ``collection`` and a one-line summary is
        printed.
    """
    # pandas yields NaN (a float) for blank CSV cells; calling .lower() on it
    # would crash the whole crawl, so skip such rows explicitly.
    if not isinstance(keyword, str) or not keyword.strip():
        print(f"{row_count}. {keyword} - Skipped (empty or non-text keyword).")
        return

    # Percent-encode the whole keyword so characters such as '&', '#', '+'
    # and '?' stay part of the search term instead of altering the URL.
    query_keyword = quote(keyword.lower(), safe="")
    driver.get(f'https://www.netflix.com/search?q={query_keyword}')

    # Collect all new Netflix ids along with their names
    searched_programs_lst = []
    # Ids already handled on this results page; the same title can be listed
    # more than once and the database check alone would not catch that.
    seen_ids = set()

    program_blocks_xp = driver.find_elements("xpath", "//div[contains(@class, 'slider-item')]")
    for program_block in program_blocks_xp:
        # Placeholder slider items have no title card inside; skip them.
        if not program_block.find_elements("xpath", ".//div[contains(@class, 'title-card-container')]"):
            continue

        raw_url = program_block.find_element("xpath", ".//a").get_attribute('href')
        # The id is the path segment right after '/watch/'. Links that are
        # missing or point elsewhere carry no usable id, so skip them instead
        # of raising IndexError/AttributeError.
        url_parts = (raw_url or "").split('netflix.com/watch/')
        if len(url_parts) < 2:
            continue
        netflix_id = url_parts[1].split('?')[0]
        if not netflix_id or netflix_id in seen_ids:
            continue
        seen_ids.add(netflix_id)

        # If netflix_id exists inside the collection, do not add it again
        if collection.find_one({'netflix_id': netflix_id}):
            continue

        netflix_program_name = program_block.find_element("xpath", ".//p").text
        searched_programs_lst.append({
            'netflix_id': netflix_id,
            'netflix_program_name': netflix_program_name,
            'netflix_program_url': f'https://www.netflix.com/in/title/{netflix_id}',
            'raw_url': raw_url,
            'scrape_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

    # Insert the new documents into the Mongo collection
    if searched_programs_lst:
        collection.insert_many(searched_programs_lst)
        print(f"{row_count}. {keyword} - {len(searched_programs_lst)} records inserted.")
    else:
        print(f"{row_count}. {keyword} - No new data found !!!")


# Program names keyword loop.
# The try/finally guarantees the browser and the MongoDB connection are
# released even when a keyword fails part-way through the crawl; otherwise an
# exception would leave the Chrome process running.
try:
    for row_count, keyword in enumerate(crawling_program_names_lst, start=1):
        crawl_netflix_id(driver, keyword, collection, row_count)
finally:
    # Report total elapsed time since the script started, then clean up.
    end_time = time.time()
    print(f"--- {round(end_time - start_time, 4)} seconds ---")
    driver.quit()
    client.close()