**IMDB Web Scraping Project**

Collect the movie tags that contain the details of each movie. IMDb's website is scraped for movies in the genre the user chooses until the user-defined number of movies has been collected.

In [26]:
from selenium import webdriver
from selenium.common import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

def get_movie_tags(genre, movie_no):
    """Scrape IMDb's genre search page and return the HTML of each movie entry.

    Opens Chrome through Selenium, loads the feature-film search results for
    ``genre`` and keeps clicking the "see more" button until ``movie_no``
    entries have been collected or no further results can be loaded.

    Args:
        genre: Genre value inserted into the search URL (e.g. ``'comedy'``).
        movie_no: Maximum number of movie entries to collect.

    Returns:
        A list of ``outerHTML`` strings, one per ``<li>`` result item, in page
        order. It holds at most ``movie_no`` entries, and fewer when the
        result list runs out first.

    Raises:
        TimeoutException: If the result items do not become visible within
            the 10-second wait.
    """
    collected_movie_html = []
    if movie_no <= 0:
        # Nothing was requested, so do not launch a browser at all.
        return collected_movie_html

    item_xpath = "//li[contains(@class, 'ipc-metadata-list-summary-item')]"

    driver = webdriver.Chrome()
    try:
        # Inside the try block so the browser is closed even if this fails.
        driver.maximize_window()
        wait = WebDriverWait(driver, 10)

        start = "https://www.imdb.com/search/title/?genres="
        end = "&explore=genres&title_type=feature"
        driver.get(start + genre + end)

        while len(collected_movie_html) < movie_no:
            displayed_movies = wait.until(
                EC.visibility_of_all_elements_located((By.XPATH, item_xpath))
            )
            displayed_count = len(displayed_movies)

            # Only visit the entries that were not collected on earlier passes.
            for i in range(len(collected_movie_html), displayed_count):
                el = wait.until(
                    EC.visibility_of_element_located(
                        (By.XPATH, f"({item_xpath})[{i + 1}]")
                    )
                )
                collected_movie_html.append(el.get_attribute('outerHTML'))

                if len(collected_movie_html) >= movie_no:
                    break

            if len(collected_movie_html) >= movie_no:
                break

            # Load the next batch of results.
            try:
                show_more_btn = wait.until(EC.element_to_be_clickable(
                    (By.XPATH, "//button[contains(@class, 'ipc-see-more__button')]")
                ))
                # Click via JavaScript rather than WebElement.click().
                driver.execute_script("arguments[0].click();", show_more_btn)
                wait.until(EC.invisibility_of_element_located(
                    (By.XPATH, "//button[contains(@class, 'ipc-see-more__button') and @disabled]")
                ))
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                # Make sure the click really produced new entries before looping.
                wait.until(
                    lambda d, seen=displayed_count:
                        len(d.find_elements(By.XPATH, item_xpath)) > seen
                )
            except TimeoutException:
                # No button or no new entries: the results are exhausted.
                # Stop here instead of retrying forever.
                break

    finally:
        driver.quit()

    return collected_movie_html


These tags are parsed, and the details of each movie (its name, year of release, duration, certificate, rating and Metascore) are stored in a dictionary; the dictionaries are collected in a list.

In [27]:
from bs4 import BeautifulSoup

def convert_duration_to_minutes(duration_str):
    """Convert a duration such as ``'1h 55m'`` into a total number of minutes.

    The string is split on whitespace; a token made of digits followed by
    ``h`` counts as hours and one followed by ``m`` counts as minutes. Either
    component may be missing (``'2h'``, ``'45m'``).

    Args:
        duration_str: Duration text, or ``None`` / empty when unknown.

    Returns:
        The duration in minutes as an ``int``, or ``None`` when the input is
        empty or contains no recognisable hour/minute component.
    """
    if not duration_str:
        return None

    hours = minutes = 0
    found = False
    for part in duration_str.split():
        number, unit = part[:-1], part[-1]
        # Skip tokens that are not "<digits><unit>" (e.g. a certificate).
        if not number.isdecimal():
            continue
        if unit == 'h':
            hours = int(number)
            found = True
        elif unit == 'm':
            minutes = int(number)
            found = True

    if not found:
        return None
    return hours * 60 + minutes

def parse_movie_details(movie_html_list):
    """Extract the details of each movie from its scraped HTML fragment.

    Args:
        movie_html_list: Iterable of HTML strings, one per movie entry.

    Returns:
        A list of dictionaries with the keys ``name``, ``year``, ``duration``
        (minutes, via ``convert_duration_to_minutes``), ``certificate``,
        ``rating`` and ``metascore``. A field that is absent from the HTML is
        ``None``. An entry that cannot be parsed is reported on stdout and
        skipped.
    """
    movie_details = []
    for movie_html in movie_html_list:
        soup = BeautifulSoup(movie_html, 'html.parser')
        try:
            name = soup.find('h3').text
            # Strip a leading "<rank>. " prefix, but only when it is numeric,
            # so a title such as "Mr. Bean" is not cut at its own ". ".
            rank, separator, title = name.partition('. ')
            if separator and rank.strip().isdigit():
                name = title

            # Match on the single descriptive class. The other tokens in the
            # class attribute ("sc-...") look build-generated -- TODO confirm --
            # and an exact multi-class match breaks whenever they change.
            metadata_items = soup.find_all('span', class_='dli-title-metadata-item')
            # The metadata spans are read positionally: year, duration,
            # certificate.
            year = metadata_items[0].text if len(metadata_items) > 0 else None
            duration = metadata_items[1].text if len(metadata_items) > 1 else None
            duration = convert_duration_to_minutes(duration)
            certificate = metadata_items[2].text if len(metadata_items) > 2 else None

            rating_span = soup.find('span', class_='ratingGroup--imdb-rating')
            # Keep only the first whitespace-separated token of the rating text.
            rating = rating_span.text.strip().split()[0] if rating_span else None

            metascore_span = soup.find('span', class_='metacritic-score-box')
            metascore = metascore_span.text.strip() if metascore_span else None

            movie_details.append({
                "name": name,
                "year": year,
                "duration": duration,
                "certificate": certificate,
                "rating": rating,
                "metascore": metascore
            })
        except Exception as e:
            # Best effort: one malformed entry must not abort the whole run.
            print(f"Error parsing movie details: {e}")
    return movie_details


The list of dictionaries is written to a pipe-delimited CSV file, which we can read with pandas to explore the data.

In [28]:
def write_csv(items, path, encoding=None):
    """Write a list of dictionaries to a pipe-delimited CSV file.

    The header row comes from the keys of the first item, and every row is
    written in that column order. Values are converted with ``str`` (so
    ``None`` is written as ``None``). A value containing the delimiter, a
    quote or a newline is quoted by the ``csv`` module instead of corrupting
    the row.

    Args:
        items: List of dictionaries to write. When empty, ``path`` is still
            created or truncated, but nothing is written to it.
        path: Destination file path.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.
    """
    import csv  # local import: the file's import cells are left untouched

    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='', encoding=encoding) as f:
        if not items:
            return

        headers = list(items[0].keys())
        writer = csv.writer(f, delimiter='|', lineterminator='\n')
        writer.writerow(headers)
        # Look values up by header so every row lines up with the header row.
        writer.writerows(
            [str(item.get(key)) for key in headers] for item in items
        )

Let's use these functions to collect data on 200 comedy movies from IMDb.

In [29]:
# Scrape the HTML of up to 200 comedy entries from IMDb.
movie_tags_html = get_movie_tags(genre='comedy', movie_no=200)

In [30]:
# Parse a shallow copy of the scraped HTML list into per-movie dictionaries.
movie_details = parse_movie_details(list(movie_tags_html))

In [31]:
# Persist the parsed movie details to disk.
write_csv(items=movie_details, path='comedy-movies.csv')

In [33]:
import pandas as pd

# Load the pipe-delimited file back and show its first rows.
comedy_movies = pd.read_csv('comedy-movies.csv', sep='|', encoding='latin1')
comedy_movies.head()

Unnamed: 0,name,year,duration,certificate,rating,metascore
0,Hit Man,2023,115.0,R,7.0,82.0
1,Bad Boys: Ride or Die,2024,115.0,R,7.0,54.0
2,Inside Out 2,2024,96.0,PG,8.0,74.0
3,The Fall Guy,2024,126.0,PG-13,7.0,73.0
4,Inside Out,2015,95.0,PG,8.1,94.0
