# Web scrapping of new data

To enhance the dataset, I chose to scrape additional data from the official IMDb website. The original dataset covered movies up to December 15, 2023, while the new dataset extends from December 16, 2023, to the present day (February 18, 2024).

I extracted the same set of information from the new dataset to maintain consistency and simplify the data cleaning and modification process. Subsequently, I will create a new Jupyter notebook to combine the two datasets and repeat all subsequent steps.

The data contains the following columns:
Moive Name, Rating, Votes, Meta Score, Genre, PG Rating, Year, Duration, Cast, Director

The link for getting data:
Moive Name, Rating, Votes, Meta Score, PG Rating, Year, Duration
https://www.imdb.com/search/title/?title_type=feature&release_date=2023-12-16,&primary_language=en

Other links are created in code to get movies' information of their:
Genre, Cast, Director

*IMDb allows users to use its content for non-personal use.

In [101]:
from bs4 import BeautifulSoup 
import sys
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import re
import pandas as pd

In [102]:
# Set up the Selenium webdriver
# we open Google Chrome
driver = webdriver.Chrome()

# give a variable to the link
# here movies after the 16.12.2023 to 18.02.2023 (today)
url = "https://www.imdb.com/search/title/?title_type=feature&release_date=2023-12-16,&primary_language=en"
driver.get(url)

In [103]:
def click_show_more():
    # IMDB uses a Dynaic Show More button at the bottom of the page
    # for loading more data, this button needs to be
    # clicked repeatedly to load more HTML data on the page

    try:
        # add a delay before attempting to find the 'Show more' button
        time.sleep(2)
        
        # find the 'Show more' button using its XPath
        # the Xpath is avaible to copy from the Inspect element
        show_more_button = driver.find_element(By.XPATH , "/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/div[2]/div/span/button")
        
        # scroll to the 'Show more' button
        driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
        
        # add a delay after scrolling to allow the screen to load
        time.sleep(2)

        # click the 'Show more' button
        show_more_button.click()
        
        # return True if the button click was successful
        return True
    except Exception as e:
        # print an error message if clicking the 'Show more' button fails
        print(f"Error clicking 'show more': {e}")
        # return False to indicate the failure
        return False

# define the maximum number of times to press the 'Show more' button
num_clicks = 1

# click the "show more" button multiple times
# in our case is 35
for _ in range(num_clicks):
    # call the click_show_more function and break the loop if it returns False
    if not click_show_more():
        break


In [104]:
# the function to extract rating and votes using regular expressions 
# it handles cases where vote counts contain 'K' or 'M' 
# (for thousands and millions), 
# it and returns the extracted values

def get_ratings_votes(exp):
    # Compile a regular expression pattern to match rating and votes
    pattern = re.compile(r'(\d+\.\d+?)\s*\(([^)]+)\)')
    
    # search for the pattern in the input string
    # exp' is a string containing rating and votes in a combined expression
    match = pattern.search(exp)
    
    # define multipliers for handling 'K' and 'M' in vote counts
    multipliers = {"K": 1000, "M": 1000000}
    
    if match:
        # if a match is found:
        if match.group(2)[-1] in multipliers:
            # if the last character of the second group is a multiplier (K or M):
            numeric_part, multiplier = match.group(2)[:-1], match.group(2)[-1] 
            # extract numeric part and multiplier from the second group
            # convert numeric part to float and multiply it with corresponding multiplier
            votes = float(numeric_part) * multipliers[multiplier]
        else:
            # if there's no multiplier, simply convert the second group to float
            votes = float(match.group(2))
        
        # return extracted rating (converted to float) and votes
        return float(match.group(1)), votes
    
    # if no match is found, return "NA" for both rating and votes
    return "NA", "NA"

In [105]:
# importing necessary libraries
from bs4 import BeautifulSoup

# loading HTML data from the current page source
soup = BeautifulSoup(driver.page_source, 'html.parser')

# finding all the movie data elements in the HTML
# classes may change, which could cause issues with the code
# if it doesn't work, check for updates in class names
movie_data = soup.findAll('div', attrs={'class': "ipc-metadata-list-summary-item__c"})

# extracting titles of the movies
titles = soup.findAll('h3', attrs={'class': "ipc-title__text"})
# correcting the extraction of titles
titles = [x.text.split(".")[1:][0].strip() if len(x.text.split(".")) > 1 else 'NA' for x in titles][:-1]

# initializing empty lists to store movie information
ratings = []
votes = []
metascores = []
years = []
durations = [] 
rated = []
links = []

# looping through each movie data element
for id, movie in enumerate(movie_data):
    
    # extracting metadata if available
    metadata_element = movie.find('div', attrs={"class": "sc-43986a27-7 dBkaPT dli-title-metadata"})
    if metadata_element:
        metadata = [x.text for x in metadata_element.find_all('span', class_='fcCUPU dli-title-metadata-item')]
    else:
        metadata = []

    # appending metadata to respective lists
    years.append(metadata[0] if metadata else 'NA')
    durations.append(metadata[1] if len(metadata) > 1 else 'NA')
    rated.append(metadata[2] if len(metadata) > 2 else 'NA')

    # extracting movie link if available, otherwise storing "NA"
    # this is crucial for getting more data about each movie
    link_element = movie.find('a', attrs={"class": "ipc-lockup-overlay ipc-focusable"})
    if link_element:
        link = link_element.get('href')
    else:
        link = 'NA'
    links.append(link)  # storing extracted link


In [106]:
#getting the casts, genres and director_list from the other pages
casts = []
genres = []
director_list = []

# open a new weindow
newdriver = webdriver.Chrome()

#loop through each link to gather additional information
for link in links:
    #check if link is available
    if link != 'NA':
        #navigate to the IMDb page for the movie
        newdriver.get("https://www.imdb.com" + link)
        soup = BeautifulSoup(newdriver.page_source, 'html.parser')
        
        #extract genre information
        genre = soup.find('div', attrs={'class':"ipc-chip-list__scroller"})
        try:
            genres.append(", ".join([x.text for x in genre]))
        except:
            genres.append("NA")
            
        #extract cast information
        cast = soup.findAll('div', attrs={'class': 'sc-bfec09a1-5 hNfYaW' })  
        try:
            casts.append(", ".join([x for x in [x.find('a', attrs={'class': 'sc-bfec09a1-1 gCQkeh'}).text for x in cast][:4]]))
        except:
            casts.append("NA")

        #extract director information
        director = soup.find('div', attrs={'class': 'ipc-metadata-list-item__content-container'}) 
        try:
            director_list.append(director.text) 
        except:
            director_list.append("NA")

    else:
        genres.append("NA")
        casts.append("NA")
        director_list.append("NA")


In [107]:
# create a DataFrame using the extracted movie information
movies_data = pd.DataFrame({
    "Movie Name": titles,        # Column for movie titles
    "Rating": ratings,           # Column for movie ratings
    "Votes": votes,              # Column for number of votes
    "Meta Score": metascores,    # Column for metascore
    "Genre": genres,             # Column for movie genres
    "PG Rating": rated,          # Column for movie ratings (PG)
    "Year": years,               # Column for release year
    "Duration": durations,       # Column for movie duration
    "Cast": casts,               # Column for movie cast
    "Director": director_list   # Column for movie directors
})

# save the DataFrame to a CSV file named "webscrapping_imdb_movie_data_2024.csv"
# saving to the directory
movies_data.to_csv("metadata/imdb_movie_data_2023.csv", index = True)

ValueError: All arrays must be of the same length