In [2]:
# https://www.imdb.com/search/name/?gender=male&sort=birth_date,asc&count=250

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def all_imdb_pages(start_url, test_mode=False, max_pages=10):
    all_urls = []  # list to get all urls
    url = start_url  # begin page
    page_count = 0  # initialize page counter

    # Get total number of names (for non-test mode)
    if not test_mode:
        soup = BeautifulSoup(requests.get(url).text, "html.parser")  # parser
        desc = soup.find('div', class_='desc').get_text()
        total_names_str = re.search(r"of ([\d,]+)", desc).group(1)  # Extract total number of names
        total_names = int(total_names_str.replace(',', ''))  # Remove comma and convert to int
        max_pages = (total_names // 250) + 1  # Calculate total pages, assuming 250 names per page
        print(f"total_names: {total_names}")
        print(f"max_pages: {max_pages}")

    for _ in tqdm(range(max_pages), desc="Scraping pages"):  # Loop through all the webpages
        all_urls.append(url)  # add to the list
        soup = BeautifulSoup(requests.get(url).text, "html.parser")  # parser
        
        next_links = soup.find_all(class_='lister-page-next next-page')  # Extracts the next page link
        if len(next_links) == 0:  # If there is no next page, it returns 0
            url = None
        else:
            next_page = "https://www.imdb.com" + next_links[0].get('href')
            url = next_page
        
        if url is None or page_count == max_pages:
            break
        else:
            page_count += 1  # increment page counter

    return all_urls

def scrape_page(url):
    names = []
    links = []
    ids = []

    soup = BeautifulSoup(requests.get(url).text, 'html.parser')  # Extracts out the main HTML code
    lister_items = soup.find_all('div', class_='lister-item-content')  # Get all the containers

    for item in lister_items:  # loop through all the people in the container to get the attributes
        name_tag = item.find('h3', class_='lister-item-header').find('a')
        name = name_tag.text.strip()
        link = name_tag['href']
        id = link.split('/')[2]
        names.append(name)
        links.append(link)
        ids.append(id)

    return names, links, ids

def get_imdb_info(url, test_mode=False, max_pages=10):
    all_names = []
    all_links = []
    all_ids = []

    page_urls = all_imdb_pages(url, test_mode, max_pages)  # get all page urls

    with ThreadPoolExecutor(max_workers=20) as executor:  # Change max_workers to a higher number
        for names, links, ids in executor.map(scrape_page, page_urls):
            all_names.extend(names)
            all_links.extend(links)
            all_ids.extend(ids)

    df = pd.DataFrame({
        'Name': all_names,
        'Link': all_links,
        'ID': all_ids
    })

    return df

# Specify the gender
gender = 'male'
# gender = 'female'
# gender = 'non-binary'
# gender = 'other'

# Specify the start URL based on the gender
start_url = f'https://www.imdb.com/search/name/?gender={gender}&sort=birth_date,asc&count=250'

# Scrape the IMDb info
df = get_imdb_info(start_url, test_mode=False)

# Specify the output file path for saving the data
output_file = f'{gender}_actors.csv'

# Save the data to a CSV file
df.to_csv(output_file, index=False)

print(f"Scraped and saved data for {gender} actors")


total_names: 4619034
max_pages: 18477


Scraping pages:   0%|          | 3/18477 [00:04<7:06:55,  1.39s/it]


KeyboardInterrupt: 

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def all_imdb_pages(start_url, test_mode=False, max_pages=10):
    all_urls = []  # list to get all urls
    url = start_url  # begin page
    page_count = 0  # initialize page counter

    # Get total number of names (for non-test mode)
    if not test_mode:
        soup = BeautifulSoup(requests.get(url).text, "html.parser")  # parser
        desc = soup.find('div', class_='desc').get_text()
        total_names_str = re.search(r"of ([\d,]+)", desc).group(1)  # Extract total number of names
        total_names = int(total_names_str.replace(',', ''))  # Remove comma and convert to int
        max_pages = (total_names // 250) + 1  # Calculate total pages, assuming 250 names per page
        print(f"total_names: {total_names}")
        print(f"max_pages: {max_pages}")

    for _ in tqdm(range(max_pages), desc="Scraping pages"):  # Loop through all the webpages
        all_urls.append(url)  # add to the list
        soup = BeautifulSoup(requests.get(url).text, "html.parser")  # parser
        
        next_links = soup.find_all(class_='lister-page-next next-page')  # Extracts the next page link
        if len(next_links) == 0:  # If there is no next page, it returns 0
            url = None
        else:
            next_page = "https://www.imdb.com" + next_links[0].get('href')
            url = next_page
        
        if url is None or page_count == max_pages:
            break
        else:
            page_count += 1  # increment page counter

    return all_urls

def scrape_page(url):
    names = []
    links = []
    ids = []

    soup = BeautifulSoup(requests.get(url).text, 'html.parser')  # Extracts out the main HTML code
    lister_items = soup.find_all('div', class_='lister-item-content')  # Get all the containers

    for item in lister_items:  # loop through all the people in the container to get the attributes
        name_tag = item.find('h3', class_='lister-item-header').find('a')
        name = name_tag.text.strip()
        link = name_tag['href']
        id = link.split('/')[2]
        names.append(name)
        links.append(link)
        ids.append(id)

    return names, links, ids

def get_imdb_info(url, test_mode=False, max_pages=10):
    all_names = []
    all_links = []
    all_ids = []

    page_urls = all_imdb_pages(url, test_mode, max_pages)  # get all page urls

    with ThreadPoolExecutor(max_workers=20) as executor:  # Change max_workers to a higher number
        results = list(tqdm(executor.map(scrape_page, page_urls), total=len(page_urls), desc="Scraping actors"))
        for names, links, ids in results:
            all_names.extend(names)
            all_links.extend(links)
            all_ids.extend(ids)

    df = pd.DataFrame({
        'Name': all_names,
        'Link': all_links,
        'ID': all_ids
    })

    return df

# Specify the gender
gender = 'male'
# gender = 'female'
# gender = 'non-binary'
# gender = 'other'

# Specify the start URL based on the gender
start_url = f'https://www.imdb.com/search/name/?gender={gender}&sort=birth_date,asc&count=250'

# Scrape the IMDb info
df = get_imdb_info(start_url, test_mode=False)

# Specify the output file path for saving the data
output_file = f'{gender}_actors.csv'

# Save the data to a CSV file
df.to_csv(output_file, index=False)

print(f"Scraped and saved data for {gender} actors")


total_names: 4619034
max_pages: 18477


Scraping pages:   0%|          | 7/18477 [00:28<20:36:28,  4.02s/it]


KeyboardInterrupt: 

In [12]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import pandas as pd

async def scrape_page(url):
    names = []
    links = []
    ids = []

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            soup = BeautifulSoup(await response.text(), 'html.parser')
            lister_items = soup.find_all('div', class_='lister-item-content')

            for item in lister_items:
                name_tag = item.find('h3', class_='lister-item-header').find('a')
                name = name_tag.text.strip()
                link = name_tag['href']
                id = link.split('/')[2]
                names.append(name)
                links.append(link)
                ids.append(id)

    return names, links, ids

async def get_imdb_info(url, max_pages=10):
    all_names = []
    all_links = []
    all_ids = []

    tasks = []
    for page in range(1, max_pages + 1):
        page_url = f"{url}&page={page}"
        task = asyncio.ensure_future(scrape_page(page_url))
        tasks.append(task)

    results = await asyncio.gather(*tasks)

    for names, links, ids in results:
        all_names.extend(names)
        all_links.extend(links)
        all_ids.extend(ids)

    df = pd.DataFrame({
        'Name': all_names,
        'Link': all_links,
        'ID': all_ids
    })

    return df


# Specify the gender
gender = 'male'
# gender = 'female'
# gender = 'non-binary'
# gender = 'other'

# Specify the start URL based on the gender
start_url = f'https://www.imdb.com/search/name/?gender={gender}&sort=birth_date,asc&count=250'

# Run the function and get the IMDb info asynchronously
df = asyncio.run(get_imdb_info(start_url, max_pages=10))

# Specify the output file path for saving the data
output_file = f'{gender}_actors.csv'

# Save the data to a CSV file
df.to_csv(output_file, index=False)

print(f"Scraped and saved data for {gender} actors")



RuntimeError: asyncio.run() cannot be called from a running event loop

In [4]:

# Specify the gender
# gender = 'male'
gender = 'female'
# gender = 'non-binary'
# gender = 'other'

# Specify the start URL based on the gender
start_url = f'https://www.imdb.com/search/name/?gender={gender}&sort=birth_date,asc&count=250'

# Scrape the IMDb info
df = get_imdb_info(start_url, test_mode=False)

# Specify the output file path for saving the data
output_file = f'{gender}_actors.csv'

# Save the data to a CSV file
df.to_csv(output_file, index=False)

print(f"Scraped and saved data for {gender} actors")


total_names: 2513828
max_pages: 10056


Scraping pages:   0%|          | 5/10056 [01:19<44:13:36, 15.84s/it]


KeyboardInterrupt: 