In [1]:
# we got the links from year 2000 to 2024, enough for now
# how do we scrape those links one by one?

# we can use mechanicalsoup for the actual data scraping,
# since we don't need to interact with any buttons or such


import mechanicalsoup
from requests.adapters import HTTPAdapter
from urllib3 import Retry
import os
import time

# Retry policy for connection-level failures, with exponential backoff.
# `total` is bounded on purpose: with total=None and no other limit set,
# urllib3 never considers the retries exhausted.
retry_adapter = HTTPAdapter(
    max_retries=Retry(total=5, backoff_factor=0.5),
)

# The adapter has to be mounted on the full scheme prefixes. requests picks the
# adapter with the longest matching prefix, so one mounted on the bare "http"
# prefix always loses to the session's built-in "http://" and "https://"
# adapters and its retry policy is never applied.
mbrowser = mechanicalsoup.StatefulBrowser(
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    requests_adapters={
        "https://": retry_adapter,
        "http://": retry_adapter,
    }
)

#url = "https://www.imdb.com/title/tt0099685"
#urls = [
#    "https://www.imdb.com/title/tt0099685/",
#    "https://www.imdb.com/title/tt1130884",
#    "https://www.imdb.com/title/tt1375666",
#    "https://www.imdb.com/title/tt0468569",
#]


In [2]:
# get imdb links (ids) from all files inside the imdb_ids directory
# sorted, so the files are processed in the same order on every run
with os.scandir("imdb_ids") as entries:
    read_links_files = sorted(
        os.path.join("imdb_ids", entry.name)
        for entry in entries
        if entry.is_file()
    )

# count the non-empty lines (one link per line) across all files
num_total_movies = 0
for links_path in read_links_files:
    # utf-8 matches how scrape_movie_data_to_csv reads these same files;
    # the with-block closes every handle instead of leaking one per file
    with open(links_path, "r", encoding="utf-8") as links_file:
        links = [link for link in links_file.read().split("\n") if link != ""]
    num_total_movies += len(links)

print(str(num_total_movies) + " movies in total")

print(str(len(read_links_files)) + " files")


310610 movies in total
289 files


In [3]:
import threading

# thread safe counter class
class ThreadSafeCounter:
    """Integer counter starting at 0 whose updates and reads share one lock."""

    def __init__(self) -> None:
        self._counter = 0
        self._lock = threading.Lock()

    def increment(self) -> int:
        """Add one to the counter and return the new value.

        The new value is read while the lock is still held, so every caller
        gets a distinct number. Calling increment() and then value() instead
        may also observe increments made by other threads in between.
        """
        with self._lock:
            self._counter += 1
            return self._counter

    def value(self) -> int:
        """Return the current count."""
        with self._lock:
            return self._counter

# TODO: make a chronometer class as well
# we can have a separate thread that keeps track of the time and estimated time maybe?

# TODO: make a time tracker class so we can see the estimated time while scraping

class ThreadSafeTimer:
    """Stopwatch that can be shared between threads.

    The timer starts running as soon as it is constructed. Readings come from
    time.monotonic() rather than time.time(), so an adjustment of the system
    clock while the timer runs cannot distort (or make negative) the elapsed
    time.
    """

    def __init__(self) -> None:
        self._start_time: float = 0.0
        self._stop_time:  float = 0.0

        self._is_stopped = False
        self._lock = threading.Lock()

        self.start_time()

    def start_time(self):
        """(Re)start the timer; elapsed time is measured from this call."""
        with self._lock:
            self._start_time = time.monotonic()
            self._is_stopped = False

    def stop_time(self):
        """Freeze the timer; time() keeps returning the same value afterwards."""
        with self._lock:
            self._stop_time = time.monotonic()
            self._is_stopped = True

    def time(self) -> float:
        """Return the elapsed seconds since start_time().

        While running this is measured up to now; once stopped it is the span
        between the last start_time() and stop_time() calls.
        """
        with self._lock:
            end = self._stop_time if self._is_stopped else time.monotonic()
            return end - self._start_time

In [4]:
import csv


# Column order shared by every CSV file written by scrape_movie_data_to_csv.
_CSV_HEADER = [
    "imdb_id",
    "title",
    "year",
    "rating",
    "time",
    "imdb_rating",
    "metascore",
    "directors",
    "writers",
    "stars",
]

# Label text of a credit row -> the credit list that row fills.
_CREDIT_LABELS = {
    "Director": "directors",
    "Directors": "directors",
    "Writer": "writers",
    "Writers": "writers",
    "Star": "stars",
    "Stars": "stars",
}


def _imdb_id_from_url(url):
    """Return the id segment that follows "/title/" in an IMDb title url."""
    _, marker, tail = url.partition("/title/")
    if not marker:
        # no "/title/" segment: fall back to the fixed offset, which is the
        # length of "https://www.imdb.com/title/" (27 characters)
        tail = url[27:]
    return tail.split("/")[0]


def _fetch_page(url, max_attempts, retry_delay):
    """Open `url` with the shared browser, retrying on any exception.

    Waits between attempts (the delay doubles each time and is capped at 30
    seconds) so a failing server is not hammered in a tight loop.
    Returns the response, or None when `max_attempts` attempts all failed.
    `max_attempts=None` retries until the request succeeds.
    """
    attempt = 0
    while max_attempts is None or attempt < max_attempts:
        attempt += 1
        try:
            return mbrowser.open(
                url,
                timeout=10
            )
        except Exception as e:
            print()
            print(e)

        if max_attempts is None or attempt < max_attempts:
            # exponent is capped so the delay computation cannot overflow
            time.sleep(min(retry_delay * 2 ** min(attempt - 1, 10), 30.0))

    return None


def _parse_movie_page(soup, url):
    """Build one CSV row (list of strings, in _CSV_HEADER order) from a title page.

    Raises (AttributeError, IndexError, ...) when the elements the mandatory
    fields are read from are missing; the caller decides what to do with such
    a page. The imdb rating, the metascore and the credits are optional and
    fall back to "null", -1 and an empty string respectively.
    """
    # the og:title content looks like "<title> (<year>) ..."; keep the part before " ("
    title = soup.find("meta", {"property": "og:title"}).get("content").split(" (")[0]

    hero_section = soup.find_all("section", {"class": "ipc-page-section"})[0]
    container_element = hero_section.find_all("div", recursive=False)[1]

    info_container_element = container_element.find_all("div")[0]
    information_elements_list = info_container_element.find_all("li")

    year = information_elements_list[0].text

    # age rating
    if len(information_elements_list) < 3: # this means the movie has no age rating
        rating = str(-1)
    else:
        rating = information_elements_list[1].find("a").text

    # named `runtime` (not `time`) so the `time` module is not shadowed
    runtime = information_elements_list[-1].text

    try:
        # there are four with the same class name, first one contains imdb score
        rating_button = container_element.find("div", {"class": "rating-bar__base-button"})
        imdb_rating = rating_button.find("a").find("span").text.split("/")[0]
    except Exception:
        imdb_rating = "null"

    # there are movies that have no metascore
    try:
        metascore = soup.find_all("span", {"class": "metacritic-score-box"})[0].text
    except Exception:
        metascore = -1 # metascore not found

    credits = {"directors": [], "writers": [], "stars": []}
    for li in hero_section.find_all("li", {"class": "ipc-metadata-list__item"}):
        try:
            label_element = li.find("span")
            if label_element is None:
                label_element = li.find("a")

            field = _CREDIT_LABELS.get(label_element.text)

            # only the first non-empty row of each kind is used
            if field is not None and credits[field] == []:
                items = li.find("ul", {"class": "ipc-inline-list"}).find_all("a")
                credits[field] = [item.text for item in items]

        except Exception as e:
            print(e)

    return [
        _imdb_id_from_url(url),
        title,
        str(year),
        str(rating),
        runtime,
        str(imdb_rating),
        str(metascore),
        ", ".join(credits["directors"]),
        ", ".join(credits["writers"]),
        ", ".join(credits["stars"]),
    ]


# this function takes the filename that includes imdb links and scrapes movie data from those links one by one
def scrape_movie_data_to_csv(read_links_file_filename, counter_scraped_movies, timer, max_attempts=5, retry_delay=1.0):
    """Scrape every link listed in one links file into ./csv_files/<name>.csv.

    Parameters:
        read_links_file_filename: path of a text file with one url per line
            (empty lines are ignored). The CSV file is named after the part of
            its base name before the first ".".
        counter_scraped_movies: shared counter; increment() is called once per
            written row and value() is read for the progress line.
        timer: shared timer; time() is read as the elapsed seconds.
        max_attempts: how many times a url is requested before it is skipped
            (None retries until the request succeeds).
        retry_delay: seconds to wait after the first failed attempt; the wait
            doubles on every further failure, up to 30 seconds.

    Reads the module-level `mbrowser` and `num_total_movies`.
    A url that cannot be fetched or parsed is reported and skipped, so one bad
    page does not abort the rest of the file. The CSV file is flushed after
    every row and is always closed, also when an error escapes.
    """
    with open(read_links_file_filename, "r", encoding="utf-8") as read_links_file:
        # links inside the read_links_file file (includes imdb links)
        links = [link for link in read_links_file.read().split("\n") if link != ""]

    # os.path.basename understands the separator of the running platform;
    # splitting on "\\" only works for Windows paths
    csv_file_name_date = os.path.basename(read_links_file_filename).split(".")[0]

    # trailing blanks that wipe leftovers of a longer, previous progress line
    padding = " " * 10

    with open(f"./csv_files/{csv_file_name_date}.csv", "w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(_CSV_HEADER)

        for url in links:
            page = _fetch_page(url, max_attempts, retry_delay)
            if page is None:
                print(f"\nskipping {url}: no response after {max_attempts} attempts")
                continue

            try:
                movie_info = _parse_movie_page(page.soup, url)
            except Exception as e:
                print(f"\nskipping {url}: could not parse the page ({e!r})")
                continue

            csv_writer.writerow(movie_info)
            csv_file.flush()

            counter_scraped_movies.increment()
            num_scraped_movies = counter_scraped_movies.value()

            # projected duration of the whole run, extrapolated from the pace so far
            estimated_time = (num_total_movies / num_scraped_movies) * timer.time()
            h = int(estimated_time // 3600)
            m = int((estimated_time % 3600) // 60)
            s = int(estimated_time % 60)
            print(f"{num_scraped_movies}/{num_total_movies} ({(num_scraped_movies/num_total_movies) * 100} %) estimated time: {h}:{m}:{s} {padding}", end="\r", flush=True)

#for read_links_file_filename in read_links_files:
#    scrape_movie_data_to_csv()


In [5]:
from concurrent.futures import ThreadPoolExecutor

num_cores = os.cpu_count()
# os.cpu_count() may return None; use a single worker in that case
max_workers = num_cores or 1

counter_scraped_movies = ThreadSafeCounter()
timer = ThreadSafeTimer()

print(f"machine with {num_cores} cores")
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    print(f"starting with {max_workers} concurrent threads")
    # one task per links file; the futures are kept so that errors raised
    # inside a worker can be reported instead of being silently dropped
    futures = {
        executor.submit(scrape_movie_data_to_csv, filename, counter_scraped_movies, timer): filename
        for filename in read_links_files
    }

# leaving the with-block waits for every task, so all futures are done here
timer.stop_time()

for future, filename in futures.items():
    error = future.exception()
    if error is not None:
        print(f"\n{filename} failed: {error!r}")

print("bruh")

machine with 12 cores
starting with 12 concurrent threads


27/310610 (0.008692572679566015 %) estimated time: 32.0:29.0:46.804292908433126            