In [17]:
# We collected the links for the years 2000 through 2024, which is enough for now.
# Next step: scrape those links one by one.

# MechanicalSoup is sufficient for the actual data scraping,
# since we don't need to interact with any buttons or other page controls.


import mechanicalsoup
from requests.adapters import HTTPAdapter
from urllib3 import Retry
import os

# Shared browser used by every scraping cell below.
#
# requests selects the transport adapter whose mount prefix is the LONGEST match
# for the URL. Mounting on the bare prefix "http" therefore never takes effect:
# the built-in "https://" and "http://" adapters are longer and always win.
# Mount the retrying adapter on the full scheme prefixes so it replaces them.
#
# The retry count is bounded: Retry(total=None) with no other counters set never
# gives up, which would hide a permanently failing request from the caller.
retry_policy = Retry(total=3, backoff_factor=0.5)

mbrowser = mechanicalsoup.StatefulBrowser(
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    requests_adapters={
        "https://": HTTPAdapter(max_retries=retry_policy),
        "http://": HTTPAdapter(max_retries=retry_policy),
    }
)

#url = "https://www.imdb.com/title/tt0099685"
#urls = [
#    "https://www.imdb.com/title/tt0099685/",
#    "https://www.imdb.com/title/tt1130884",
#    "https://www.imdb.com/title/tt1375666",
#    "https://www.imdb.com/title/tt0468569",
#]


In [18]:
# get the links from a file
#read_links_file = open("./imdb_ids/2000-01-02,2000-02-01.txt", "r")

# get imdb links (ids) from all files
# Collect every regular file in imdb_ids/. os.scandir yields entries in arbitrary
# order, so sort the paths to make the scrape order reproducible between runs
# (the date-range filenames sort chronologically). entry.path is equivalent to
# os.path.join("imdb_ids", entry.name).
with os.scandir("imdb_ids") as dir_entries:
    read_links_files = sorted(entry.path for entry in dir_entries if entry.is_file())
print(len(read_links_files))
#print(read_links_files)


289


In [21]:
import csv
import time

# TODO: threads brah

# Column order of every generated CSV file.
CSV_COLUMNS = [
    "imdb_id",
    "title",
    "year",
    "rating",
    "time",
    "imdb_rating",
    "metascore",
    "directors",
    "writers",
    "stars",
]
CSV_OUTPUT_DIR = "csv_files"
MAX_FETCH_ATTEMPTS = 5        # give up on a link after this many failed requests
RETRY_DELAY_SECONDS = 2       # base pause between attempts; grows linearly per attempt
MISSING = str(-1)             # sentinel written when a field is absent from the page
PROGRESS_PADDING = " " * 20   # blanks out leftovers of a longer previous progress line


def _fetch_page(url):
    """Open `url` with the shared browser, retrying a bounded number of times.

    Returns the response (with its parsed `.soup`) on success, or None when
    every attempt failed. Each failure is printed so it stays visible in the
    cell output; a pause between attempts avoids hammering the server.
    """
    for attempt in range(1, MAX_FETCH_ATTEMPTS + 1):
        try:
            page = mbrowser.open(
                url,
                timeout=10
            )
            # Treat HTTP error statuses (4xx/5xx) as failures instead of parsing an error page.
            page.raise_for_status()
            return page
        except Exception as e:
            print()
            print(e)
            if attempt < MAX_FETCH_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS * attempt)
    return None


def _credit_names(li):
    """Return the link texts of the inline name list inside one credits row."""
    return [item.text for item in li.find("ul", {"class": "ipc-inline-list"}).findAll("a")]


def _parse_credits(hero_section):
    """Extract (directors, writers, stars) name lists from the hero section.

    Only the first non-empty row of each kind is used. A row that cannot be
    parsed is reported and skipped so one odd row does not lose the others.
    """
    directors = []
    writers = []
    stars = []

    for li in hero_section.findAll("li", {"class": "ipc-metadata-list__item"}):
        try:
            label = li.find("span")
            if label is None:
                label = li.find("a")
            label = label.text

            if directors == [] and label in ("Director", "Directors"):
                directors = _credit_names(li)
            elif writers == [] and label in ("Writer", "Writers"):
                writers = _credit_names(li)
            elif stars == [] and label in ("Star", "Stars"):
                stars = _credit_names(li)
        except Exception as e:
            print(e)

    return directors, writers, stars


def _parse_movie(soup):
    """Parse one title page into the CSV fields that follow `imdb_id`.

    Returns [title, year, rating, time, imdb_rating, metascore, directors,
    writers, stars], all as strings. Raises AttributeError or IndexError when
    the page lacks the expected title / hero-section structure.
    """
    # og:title content looks like "Title (year) ..."; keep only the title part.
    title = soup.find("meta", {"property": "og:title"}).get("content").split(" (")[0]

    hero_section = soup.findAll("section", {"class": "ipc-page-section"})[0]
    container = hero_section.findAll("div", recursive=False)[1]
    info_items = container.findAll("div")[0].findAll("li")

    year = info_items[0].text

    # age rating
    if len(info_items) < 3:  # this means the movie has no age rating
        rating = MISSING
    else:
        rating = info_items[1].find("a").text

    runtime = info_items[-1].text

    # there are four elements with the same class name, the first one contains the imdb score
    # NOTE(review): the recorded run output shows this text as e.g. "7.6/10704K"
    # (score, "/10" and vote count fused together) - confirm whether consumers of
    # the CSV expect that or only the score.
    try:
        imdb_rating = (
            container.find("div", {"class": "rating-bar__base-button"})
            .find("a")
            .find("span")
            .text
        )
    except AttributeError:
        imdb_rating = MISSING  # imdb score not found

    # there are movies that have no metascore
    metascore_box = soup.find("span", {"class": "metacritic-score-box"})
    metascore = metascore_box.text if metascore_box is not None else MISSING

    directors, writers, stars = _parse_credits(hero_section)

    return [
        title,
        str(year),
        str(rating),
        runtime,
        str(imdb_rating),
        str(metascore),
        ", ".join(directors),
        ", ".join(writers),
        ", ".join(stars),
    ]


os.makedirs(CSV_OUTPUT_DIR, exist_ok=True)

for read_links_file_filename in read_links_files:
    # links inside the read_links_file file (one imdb title url per line)
    with open(read_links_file_filename, "r", encoding="utf-8") as read_links_file:
        links = [line.strip() for line in read_links_file if line.strip()]

    # os.path.basename works with both "/" and "\\" separators, unlike split("\\").
    csv_file_name_date = os.path.basename(read_links_file_filename).split(".")[0]
    csv_path = os.path.join(CSV_OUTPUT_DIR, f"{csv_file_name_date}.csv")

    # Explicit utf-8 so non-ASCII titles do not depend on the platform default encoding;
    # the with-block closes the file even when the run is interrupted.
    with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(CSV_COLUMNS)

        for i, url in enumerate(links):
            page = _fetch_page(url)
            if page is None:
                print(f"\nskipping {url}: no response after {MAX_FETCH_ATTEMPTS} attempts")
                continue

            try:
                fields = _parse_movie(page.soup)
            except (AttributeError, IndexError) as e:
                print(f"\nskipping {url}: unexpected page layout ({e})")
                continue

            title = fields[0]
            print(f"{csv_file_name_date}: {i} / {len(links)} | [{title}] {PROGRESS_PADDING}", end="\r")

            # The id is the path segment right after "/title/", independent of the url prefix length.
            imdb_id = url.split("/title/", 1)[-1].split("/")[0].split("?")[0]

            csv_writer.writerow([imdb_id, *fields])
            # Flush per row so an interrupted run still leaves the rows scraped so far on disk.
            csv_file.flush()

7.6/10704K,2000-02-01: 0 / 302 | [American Psycho]                     
7.5/1031K2,2000-02-01: 1 / 302 | [You Can Count on Me]                     
5.4/106.4K,2000-02-01: 2 / 302 | [Tra(sgre)dire]                     
6.1/1044K2,2000-02-01: 3 / 302 | [Next Friday]                     

HTTPSConnectionPool(host='www.imdb.com', port=443): Read timed out. (read timeout=10)
7.2/1024K2,2000-02-01: 4 / 302 | [Love & Basketball]                     
5.2/1014K2,2000-02-01: 5 / 302 | [Snow Day]                     
7.0/1056K2,2000-02-01: 6 / 302 | [Boiler Room]                     
4.8/1019K2,2000-02-01: 7 / 302 | [Supernova]                     
7.4/106.2K,2000-02-01: 8 / 302 | [Suzhou he]                     


KeyboardInterrupt: 