In [3]:
# https://www.imdb.com/search/name/?gender=male&sort=birth_date,asc&count=250

In [4]:
### Google colab specific

# import requests
# from bs4 import BeautifulSoup
# import pandas as pd
# import re
# from tqdm import tqdm
# from concurrent.futures import ThreadPoolExecutor
# from google.colab import drive

# def all_imdb_pages(start_url, test_mode=False, max_pages=10):
#     all_urls = []  # list to get all urls
#     url = start_url  # begin page
#     page_count = 0  # initialize page counter

#     # Get total number of names (for non-test mode)
#     if not test_mode:
#         soup = BeautifulSoup(requests.get(url).text, "html.parser")  # parser
#         desc = soup.find('div', class_='desc').get_text()
#         total_names = int(re.search(r"of (\d+)", desc).group(1))  # Extract total number of names
#         max_pages = (total_names // 250) + 1  # Calculate total pages, assuming 250 names per page

#         print(f"total_names: {total_names}")
#         print(f"max_pages: {max_pages}")

#     for _ in tqdm(range(max_pages), desc="Scraping pages"):  # Loop through all the webpages
#         all_urls.append(url)  # add to the list
#         soup = BeautifulSoup(requests.get(url).text, "html.parser")  # parser
        
#         next_links = soup.find_all(class_='lister-page-next next-page')  # Extracts the next page link
#         if len(next_links) == 0:  # If there is no next page, it returns 0
#             url = None
#         else:
#             next_page = "https://www.imdb.com" + next_links[0].get('href')
#             url = next_page
        
#         if url is None or page_count == max_pages:
#             break
#         else:
#             page_count += 1  # increment page counter

#     return all_urls

# def scrape_page(url):
#     names = []
#     links = []
#     ids = []

#     soup = BeautifulSoup(requests.get(url).text, 'html.parser')  # Extracts out the main HTML code
#     lister_items = soup.find_all('div', class_='lister-item-content')  # Get all the containers

#     for item in lister_items:  # loop through all the people in the container to get the attributes
#         name_tag = item.find('h3', class_='lister-item-header').find('a')
#         name = name_tag.text.strip()
#         link = name_tag['href']
#         id = link.split('/')[2]
#         names.append(name)
#         links.append(link)
#         ids.append(id)

#     return names, links, ids

# def get_imdb_info(url, test_mode=False, max_pages=10):
#     all_names = []
#     all_links = []
#     all_ids = []

#     page_urls = all_imdb_pages(url, test_mode, max_pages)  # get all page urls

#     with ThreadPoolExecutor(max_workers=20) as executor:  # Change max_workers to a higher number
#         for names, links, ids in executor.map(scrape_page, page_urls):
#             all_names.extend(names)
#             all_links.extend(links)
#             all_ids.extend(ids)

#     df = pd.DataFrame({
#         'Name': all_names,
#         'Link': all_links,
#         'ID': all_ids
#     })

#     return df


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def all_imdb_pages(start_url, test_mode=False, max_pages=10):
    all_urls = []  # list to get all urls
    url = start_url  # begin page
    page_count = 0  # initialize page counter

    # Get total number of names (for non-test mode)
    if not test_mode:
        soup = BeautifulSoup(requests.get(url).text, "html.parser")  # parser
        desc = soup.find('div', class_='desc').get_text()
        total_names_str = re.search(r"of ([\d,]+)", desc).group(1)  # Extract total number of names
        total_names = int(total_names_str.replace(',', ''))  # Remove comma and convert to int
        max_pages = (total_names // 250) + 1  # Calculate total pages, assuming 250 names per page
        print(f"total_names: {total_names}")
        print(f"max_pages: {max_pages}")

    for _ in tqdm(range(max_pages), desc="Scraping pages"):  # Loop through all the webpages
        all_urls.append(url)  # add to the list
        soup = BeautifulSoup(requests.get(url).text, "html.parser")  # parser
        
        next_links = soup.find_all(class_='lister-page-next next-page')  # Extracts the next page link
        if len(next_links) == 0:  # If there is no next page, it returns 0
            url = None
        else:
            next_page = "https://www.imdb.com" + next_links[0].get('href')
            url = next_page
        
        if url is None or page_count == max_pages:
            break
        else:
            page_count += 1  # increment page counter

    return all_urls

def scrape_page(url):
    names = []
    links = []
    ids = []

    soup = BeautifulSoup(requests.get(url).text, 'html.parser')  # Extracts out the main HTML code
    lister_items = soup.find_all('div', class_='lister-item-content')  # Get all the containers

    for item in lister_items:  # loop through all the people in the container to get the attributes
        name_tag = item.find('h3', class_='lister-item-header').find('a')
        name = name_tag.text.strip()
        link = name_tag['href']
        id = link.split('/')[2]
        names.append(name)
        links.append(link)
        ids.append(id)

    return names, links, ids

def get_imdb_info(url, test_mode=False, max_pages=10):
    all_names = []
    all_links = []
    all_ids = []

    page_urls = all_imdb_pages(url, test_mode, max_pages)  # get all page urls

    with ThreadPoolExecutor(max_workers=20) as executor:  # Change max_workers to a higher number
        for names, links, ids in executor.map(scrape_page, page_urls):
            all_names.extend(names)
            all_links.extend(links)
            all_ids.extend(ids)

    df = pd.DataFrame({
        'Name': all_names,
        'Link': all_links,
        'ID': all_ids
    })

    return df


In [8]:
import os

# Specify the gender
award_type = 'oscar'

# Specify the start URL based on the gender
start_url = f'https://www.imdb.com/search/title/?groups={award_type}_nominee&sort=release_date,asc&count=250'

# Scrape the IMDb info
df = get_imdb_info(start_url, test_mode=False)

# Specify the output path for saving the data
output_folder = 'data'
os.makedirs(output_folder, exist_ok=True)  # Create the directory if it doesn't exist
output_path = os.path.join(output_folder, f'{award_type}_movies.csv')

# Save the data to a CSV file
df.to_csv(output_path, index=False)

print(f"Scraped and saved data for {award_type} movies in the '{output_folder}' folder.")


total_names: 5073
max_pages: 21


Scraping pages:  95%|█████████▌| 20/21 [02:00<00:06,  6.04s/it]


Scraped and saved data for oscar movies in the 'data' folder.
