In [7]:
# logger, i'll use one logger for this script

import logging
import sys

logger = logging.getLogger()
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s')

stdout_handler = logging.StreamHandler(sys.stdout)
stdout_handler.setLevel(logging.DEBUG)
stdout_handler.setFormatter(formatter)

file_handler = logging.FileHandler('logs.log')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)


logger.addHandler(file_handler)
logger.addHandler(stdout_handler)

In [8]:
# the goal is to scrape all movie ids. but all movies page has 660k entries
# clicking a button for 13000 times is not reliable so we need to scrape movies
# in batches. we can use various filtering options.
# filtering options:
# - year
# - imdb rating

imdb_link = "https://www.imdb.com/search/title/?title_type=feature"

increment = 0.1
imdb_links = []
for i in range(1, 10):
    for j in range(5):
        jj = j / 5
        imdb_links.append(
            "https://www.imdb.com/search/title/?title_type=feature&user_rating=" +
            str(round(i + jj, 1)) + "," + str(round(i + jj + increment, 1))
        )
# there actually are movies with 10 ratings
imdb_links.append("https://www.imdb.com/search/title/?title_type=feature&user_rating=10,")

#print(imdb_links)
#for link in imdb_links:
#    print(link)

# i shouldnt be doing imdb ratings because ratings can change while the scraping is happening
# i need to do this with dates. which would be a lot better because that way updating the
# database is just a breeze



In [9]:
from datetime import date


current_date = date.today()
y, m, d = str(current_date).split("-")
y = int(y)
m = int(m)
d = int(d)

# do the thing in one month periods
# 02-01 to 01-02
# 02-02 to 01-03
# 02-03 to 01-04
# ...
# 02-11 to 01-12
# 02-12 to 01-01-nextyear

imdb_links_2 = []

# spaghetti code

def generate_imdb_links_by_release_dates(date_range):
    links = []
    for year in range(date_range[0], date_range[1]):
        for i in range(12):
            from_day = 2
            from_month = i + 1
            from_year = year

            to_day = 1
            to_month = i + 2
            to_year = year

            if to_month == m and to_year == y:
                to_day = d

            if to_month == 13:
                to_month = 1
                to_year += 1

            #print(f"{from_day:02d}-{from_month:02d}-{from_year} to {to_day:02d}-{to_month:02d}-{to_year}")
            links.append(f"https://www.imdb.com/search/title/?title_type=feature&release_date={from_year}-{from_month:02d}-{from_day:02d},{to_year}-{to_month:02d}-{to_day:02d}")

            if to_month == m and to_year == y:
                break
    return links

imdb_links_2 = generate_imdb_links_by_release_dates([2020, 2025])

#for link in imdb_links_2:
#    print(link)


In [10]:
# init browser

from math import ceil
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

import time 
  
from selenium.webdriver.common.keys import Keys 

browser = webdriver.Chrome()
browser.maximize_window()

In [11]:
def scrape_ids_from_link(browser, link):
    browser.get(link)

    browser.implicitly_wait(10)

    # get the total number of movies from the page
    try:
        element = browser.find_element(By.CLASS_NAME, "ipc-page-grid").find_elements(By.CLASS_NAME, "ipc-page-grid__item")[1]
    except Exception as e:
        logger.info("there are no movies. maybe??")
        #logger.exception(e)
        #logger.info("EXITING")
        #exit(1)
        return -1


    number_of_movies_str = element.text.split("\n")[0].split(" ")[2].replace(",", "")
    #print(number_of_movies_str)
    number_of_movies = int(number_of_movies_str)
    #print(number_of_movies)

    # on one click, the page loads 50 more movies
    number_of_max_clicks = ceil(number_of_movies / 50) - 1

    print("there are " + number_of_movies_str + " movies. we need to click " + str(number_of_max_clicks) + " in total. haha")


    # click until we load all the movies
    # TODO: implement falllbacks on possible errors.
    # TODO: implement sanity check while clicking buttons.

    # b is the "see more" button at the bottom of the page
    b = browser.find_elements(By.CLASS_NAME, "ipc-see-more")
    #print(b)
    for i in range(number_of_max_clicks):
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(2)
        while True:
            try:
                b[0].click()
                break
            except:
                logger.error("something bad happened while clicking. trying again")
                browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                time.sleep(2)

        time.sleep(2)
        #print("[LOG] " + str(i) + " out of " + str(number_of_max_clicks) + " completed. %" + str(i / number_of_max_clicks))
        logger.info(str(i + 1) + " out of " + str(number_of_max_clicks) + " completed. %" + str((i + 1) / number_of_max_clicks * 100))

    logger.info("done. maybe??")
    return 0

In [12]:
i = 0
final_links = imdb_links_2

for link in final_links:
    i += 1
    link_date = link.split("release_date=")[1]
    links_file = open(f"./imdb_ids/{link_date}.txt", "w")
    
    logger.info(f"START[{i}/{len(final_links)}]! attemting to scrape links for date [{link_date}]")

    ret = 0
    while True:
        try:
            ret = scrape_ids_from_link(browser, link)
            if ret == 0:
                logger.info("clicking completed.")
            break
        except:
            logger.error(f"failed to scrape links for date [{link_date}]. trying again")
            pass
        

    # get all ids from the webpage
    # TODO: remember implementing sanity checks

    # im just assuming this works here
    if ret == 0:
        logger.info("getting all the links and saving them to a file")
        for elem in browser.find_elements(By.CLASS_NAME, "ipc-title-link-wrapper"):
            links_file.write(elem.get_attribute("href"))
            links_file.write("\n")
        logger.info("saved links to file.")
    else:
        logger.info("no links found.")


2024-02-09 16:35:52,474 | INFO | START[1/49]! attemting to scrape links for date [2020-01-02,2020-02-01]
2024-02-09 16:35:52,474 | INFO | START[1/49]! attemting to scrape links for date [2020-01-02,2020-02-01]


there are 1399 movies. we need to click 27 in total. haha
2024-02-09 16:36:02,591 | INFO | 1 out of 27 completed. %3.7037037037037033
2024-02-09 16:36:02,591 | INFO | 1 out of 27 completed. %3.7037037037037033
2024-02-09 16:36:06,843 | INFO | 2 out of 27 completed. %7.4074074074074066
2024-02-09 16:36:06,843 | INFO | 2 out of 27 completed. %7.4074074074074066
2024-02-09 16:36:10,964 | INFO | 3 out of 27 completed. %11.11111111111111
2024-02-09 16:36:10,964 | INFO | 3 out of 27 completed. %11.11111111111111
2024-02-09 16:36:15,148 | INFO | 4 out of 27 completed. %14.814814814814813
2024-02-09 16:36:15,148 | INFO | 4 out of 27 completed. %14.814814814814813
2024-02-09 16:36:19,471 | INFO | 5 out of 27 completed. %18.51851851851852
2024-02-09 16:36:19,471 | INFO | 5 out of 27 completed. %18.51851851851852
2024-02-09 16:36:23,937 | INFO | 6 out of 27 completed. %22.22222222222222
2024-02-09 16:36:23,937 | INFO | 6 out of 27 completed. %22.22222222222222
2024-02-09 16:36:28,283 | INFO | 7 o