In [5]:
import requests
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
import pandas as pd
from typing import Optional, List, Dict, Any

In [8]:
def scrape_imdb_info(url: str) -> Optional[pd.DataFrame]:
    headers: Dict[str, str] = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            titles: List[str] = []
            images: List[Optional[str]] = []
            ratings: List[Optional[str]] = []
            years: List[Optional[str]] = []
            genres: List[Optional[str]] = []
            stars_list: List[Optional[List[str]]] = []
            imdb_ids: List[Optional[str]] = []
            episode_titles: List[Optional[str]] = []
            imdb_episode_ids: List[Optional[str]] = []
            tv_ratings: List[Optional[str]] = []

            items = soup.find_all('div', class_='lister-item-content')
            for item in items:
                header = item.find('h3', class_='lister-item-header')
                title_element = header.find('a')
                try:
                    title = title_element.text.strip()
                    imdb_id = title_element['href'].split('/')[-2]
                except:
                    title = imdb_id = None

                # Extracting episode title and IMDb episode ID from the specific structure
                try:
                    episode_tag = item.find('small', class_='text-primary unbold')
                    if episode_tag and episode_tag.text.strip() == 'Episode:':
                        episode_title = episode_tag.find_next_sibling('a').text.strip()
                        episode_href = episode_tag.find_next_sibling('a')['href']
                        imdb_episode_id = episode_href.split('/')[-2] if episode_href else None
                    else:
                        episode_title = imdb_episode_id = None
                except:
                    episode_title = imdb_episode_id = None

                try:
                    image_tag = item.find('div', class_='lister-item-image').find('img')
                    image_url = image_tag['loadlate'] if 'loadlate' in image_tag.attrs else None
                except Exception as e:
                    #print("An error occurred while extracting image URL:", e)
                    image_url = None

                try:
                    rating = item.find('div', class_='ratings-imdb-rating').strong.text.strip()
                except:
                    rating = None

                try:
                    year = item.find('span', class_='lister-item-year').text.strip()
                except:
                    year = None

                try:
                    genre = item.find('span', class_='genre').text.strip()
                except:
                    genre = None

                try:
                    stars = [star.text for star in item.select('p.text-muted.text-small a[href^="/name/"]')]
                except:
                    stars = None

                try:
                    tv_rating_element = item.find('span', class_='certificate')
                    tv_rating = tv_rating_element.text.strip() if tv_rating_element else None
                except:
                    tv_rating = None

                titles.append(title)
                images.append(image_url)
                ratings.append(rating)
                years.append(year)
                genres.append(genre)
                stars_list.append(stars)
                imdb_ids.append(imdb_id)
                episode_titles.append(episode_title)
                imdb_episode_ids.append(imdb_episode_id)
                tv_ratings.append(tv_rating)

            # Create a DataFrame
            df = pd.DataFrame({
                'Title': titles,
                'Image_URL': images,
                'Rating': ratings,
                'Year': years,
                'Genre': genres,
                'Stars': stars_list,
                'IMDb_ID': imdb_ids,
                'Episode_Title': episode_titles,
                'IMDb_Episode_ID': imdb_episode_ids,
                'TV_Rating': tv_ratings
            })

            return df
        else:
            print("Failed to retrieve the page.")
    except Exception as e:
        print("An error occurred:", e)

    return None

In [9]:
url = "https://www.imdb.com/search/keyword/?ref_=kw_ref_rt_usr&mode=detail&page=1&genres=Animation&sort=release_date,desc&user_rating=9.8%2C9.9"
scrape_imdb_info(url)

Unnamed: 0,Title,Image_URL,Rating,Year,Genre,Stars,IMDb_ID,Episode_Title,IMDb_Episode_ID,TV_Rating
0,One Piece,,9.9,(1999– ),"Animation, Action, Adventure","[Tasuku Shimaya, Hiroaki Hirata, Katsuhisa Hôk...",tt0388629,A Forbidden Piece of History! A Theory Concern...,tt31495236,TV-14
1,Cosmic Dawn - Elena the shard of innocence,,9.9,(2022– ),"Animation, Action, Adventure","[Damon Alums, Ben Chaverin, Curtis Combrink, D...",tt12740628,Cosmic Light,tt12788750,
2,You Have Got This,,9.8,(2024– ),"Animation, Drama","[Barbara Jones, Amelia Winston, Celina Jackson]",tt30633107,,,
3,The Adventures of Ping and Roar,,9.8,(2023– ),Animation,"[Darren Marlar, Oliver Lucas, McKenna Sawrey, ...",tt30135648,,,
4,Jujutsu Kaisen,,9.8,(2020– ),"Animation, Action, Adventure","[Hakuyu Go, Yamazaki Harumi, Itsuki Tsuchigami...",tt12343534,Thunderclap - Part 2,tt29621867,TV-MA
5,Jujutsu Kaisen,,9.8,(2020– ),"Animation, Action, Adventure","[Itsuki Tsuchigami, Shouta Goshozono, Ryan Bar...",tt12343534,Thunderclap,tt29621863,TV-MA
6,Hero Inside,,9.9,(2023– ),"Animation, Action, Comedy","[James Brown Jr., Samantha Cooper, Barrett Led...",tt24165980,Villains vs. Heroes,tt30743722,
7,Underverse,,9.9,(2016– ),"Animation, Action, Fantasy",[Jael Peñaloza],tt15759044,Underverse 0.7 Part 2,tt18547968,
8,Hello Neighbor: Welcome to Raven Brooks,,9.9,(2020– ),"Animation, Horror, Mystery","[Kimberly Woods, Armen Taylor, Kieran Walton, ...",tt23901812,Search & Rescue,tt29576465,TV-PG
9,L'il Stompers,,9.9,(2023– ),Animation,"[Chance Orion Wood, Thea Richardson, Beaux Roz...",tt21227790,,,TV-Y


In [None]:
# List to store DataFrames
all_dataframes_2000_2024 = []

# Starting user ratings
user_rating_high, user_rating_low = 9.9, 9.8

# Iterate through different user ratings
while user_rating_high >= 0.1 and user_rating_low >= 0.0:
    page_number = 1
    while True:
        # URL with the specified user rating range and page number
        url = f"https://www.imdb.com/search/keyword/?ref_=kw_ref_rt_usr&mode=detail&page={page_number}&genres=Animation&sort=release_date,desc&user_rating={user_rating_high}%2C{user_rating_low}&sort=release_date,desc&release_date=2000%2C2024"
        
        try:
            df = scrape_imdb_info(url)
            if not df.empty:
                all_dataframes_2000_2024.append(df)
                page_number += 1
            else:
                print(f"Done with user_rating_high {user_rating_high} user_rating_low {user_rating_low}. Reached page {page_number-1}")
                break
        except Exception as e:
            print(f"Error for user ratings {user_rating_high}, {user_rating_low}, page {page_number}: {e}")
            break
    
    # Print the number of pages reached
    print(f"Finished all pages for user_rating_high {user_rating_high} user_rating_low {user_rating_low}")

    # Decrease both user ratings by 0.1
    user_rating_high -= 0.1
    user_rating_low -= 0.1

In [None]:
# List to store DataFrames
all_dataframes_1900_2000 = []

# Starting user ratings
user_rating_high, user_rating_low = 9.9, 9.8

# Iterate through different user ratings
while user_rating_high >= 0.1 and user_rating_low >= 0.0:
    page_number = 1
    while True:
        # URL with the specified user rating range and page number
        url = f"https://www.imdb.com/search/keyword/?ref_=kw_ref_rt_usr&mode=detail&page={page_number}&genres=Animation&sort=release_date,desc&user_rating={user_rating_high}%2C{user_rating_low}&sort=release_date,desc&release_date=1900%2C2000"
        
        try:
            df = scrape_imdb_info(url)
            if not df.empty:
                all_dataframes_1900_2000.append(df)
                page_number += 1
            else:
                print(f"Done with user_rating_high {user_rating_high} user_rating_low {user_rating_low}. Reached page {page_number-1}")
                break
        except Exception as e:
            print(f"Error for user ratings {user_rating_high}, {user_rating_low}, page {page_number}: {e}")
            break
    
    # Print the number of pages reached
    print(f"Finished all pages for user_rating_high {user_rating_high} user_rating_low {user_rating_low}")

    # Decrease both user ratings by 0.1
    user_rating_high -= 0.1
    user_rating_low -= 0.1

In [None]:
merged_df = pd.concat([all_dataframes_1900_2000,all_dataframes_2000_2024], ignore_index=True)