In [8]:
import requests
from bs4 import BeautifulSoup
import random
import time
import json

## First, get genre data from Box Office Mojo's top 200 grossing movies (2025)

### Get the main top 200 page

In [2]:
# Fetch the Box Office Mojo lifetime-gross chart (first page of 200 movies).
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
page = requests.get(
    "https://www.boxofficemojo.com/chart/top_lifetime_gross/?ref_=bo_cso_ac",
    timeout=30,
)
# Fail fast on an HTTP error status instead of silently parsing an error page
page.raise_for_status()
# Parse the raw HTML into a searchable BeautifulSoup tree
soup = BeautifulSoup(page.content, "html.parser")
# Uncomment to inspect the parsed HTML as text
# print(soup.prettify())

### Retrieve the URLs for all of the top 200 movies on Box Office Mojo

In [3]:
# Collect the detail-page URL of every movie linked from the chart page.
# No trailing slash on the base URL: the scraped hrefs already begin with "/",
# so joining with exactly one slash avoids URLs like ".com//title/...".
BOX_OFFICE_MOJO_URL = "https://www.boxofficemojo.com"
websites = []

# NOTE(review): the [15:415] slice presumably skips "a-link-normal" anchors that sit
# outside the chart table -- re-check these bounds if the page layout changes.
for link in soup.find_all('a', class_="a-link-normal")[15:415]:
    href = link.get('href')
    # Anchors without an href yield None, which would make the `in` test raise TypeError
    if href and "title" in href:
        websites.append(BOX_OFFICE_MOJO_URL + "/" + href.lstrip("/"))

### Extracting the raw genre data from the retrieved movie websites

In [None]:
# Scrape the title and the raw (unprocessed) genre text from every movie page.
# `titles` and `genres` are parallel lists: genres[i] belongs to titles[i], so a
# movie is only stored when BOTH values were found.
titles = []
genres = []

for website in websites:
    print(f"Scraping site {website}")
    try:
        # Dedicated names so the chart page held in `page`/`soup` is not overwritten
        movie_page = requests.get(website, timeout=30)
        movie_page.raise_for_status()
    except requests.RequestException as error:
        # One unreachable page should not abort the whole scrape
        print(f"  Request failed, skipping: {error}")
    else:
        movie_soup = BeautifulSoup(movie_page.content, "html.parser")

        # Extract the title heading (None when the page has no such heading)
        title_tag = movie_soup.find("h1", class_="a-size-extra-large")

        # Reset for every movie so a page without a "Genres" row can never
        # inherit the genres scraped from the previous movie
        genres_text = None
        for section in movie_soup.find_all('div', class_='a-section a-spacing-none'):
            spans = section.find_all('span')
            # The genres row is a label span ("Genres") followed by a value span
            if len(spans) >= 2 and spans[0].get_text(strip=True) == "Genres":
                genres_text = spans[1].text
                break

        if title_tag is None or genres_text is None:
            print(f"  Missing title or genres, skipping {website}")
        else:
            # Store the extracted title and genres together to keep the lists aligned
            titles.append(title_tag.text)
            genres.append(genres_text)

    # Add a randomized 1-3 second delay to avoid getting blocked by the site
    time.sleep(random.uniform(1, 3))

print("All done!")

Scraping site https://www.boxofficemojo.com//title/tt2488496/?ref_=bo_cso_table_1
Scraping site https://www.boxofficemojo.com//title/tt4154796/?ref_=bo_cso_table_2
Scraping site https://www.boxofficemojo.com//title/tt10872600/?ref_=bo_cso_table_3
Scraping site https://www.boxofficemojo.com//title/tt0499549/?ref_=bo_cso_table_4
Scraping site https://www.boxofficemojo.com//title/tt1745960/?ref_=bo_cso_table_5
Scraping site https://www.boxofficemojo.com//title/tt1825683/?ref_=bo_cso_table_6
Scraping site https://www.boxofficemojo.com//title/tt1630029/?ref_=bo_cso_table_7
Scraping site https://www.boxofficemojo.com//title/tt4154756/?ref_=bo_cso_table_8
Scraping site https://www.boxofficemojo.com//title/tt0120338/?ref_=bo_cso_table_9
Scraping site https://www.boxofficemojo.com//title/tt0369610/?ref_=bo_cso_table_10
Scraping site https://www.boxofficemojo.com//title/tt22022452/?ref_=bo_cso_table_11
Scraping site https://www.boxofficemojo.com//title/tt6263850/?ref_=bo_cso_table_12
Scraping si

### Processing the genre data and creating a data dictionary

In [None]:
# Map each movie title to its cleaned list of genre names
movies_and_genres = {}

for position, raw_genres in enumerate(genres):
    # Drop every space first, then turn line breaks into single spaces so that
    # split() separates the individual genre names
    genre_names = raw_genres.replace(" ", "").replace("\n", " ").split()
    # titles and genres are parallel lists, so the same index gives the matching title
    movies_and_genres[titles[position]] = genre_names

{'Star Wars: Episode VII - The Force Awakens (2015)': ['Action', 'Adventure', 'Sci-Fi'], 'Avengers: Endgame (2019)': ['Action', 'Adventure', 'Sci-Fi'], 'Spider-Man: No Way Home (2021)': ['Action', 'Adventure', 'Fantasy', 'Sci-Fi'], 'Avatar (2009)': ['Action', 'Adventure', 'Fantasy', 'Sci-Fi'], 'Top Gun: Maverick (2022)': ['Action', 'Drama'], 'Black Panther (2018)': ['Action', 'Adventure', 'Sci-Fi'], 'Avatar: The Way of Water (2022)': ['Action', 'Adventure', 'Fantasy', 'Sci-Fi'], 'Avengers: Infinity War (2018)': ['Action', 'Adventure', 'Sci-Fi'], 'Titanic (1997)': ['Drama', 'Romance'], 'Jurassic World (2015)': ['Action', 'Adventure', 'Sci-Fi'], 'Inside Out 2 (2024)': ['Adventure', 'Animation', 'Comedy', 'Drama', 'Family', 'Fantasy'], 'Deadpool & Wolverine (2024)': ['Action', 'Adventure', 'Comedy', 'Sci-Fi'], 'Barbie (2023)': ['Adventure', 'Comedy', 'Fantasy'], 'The Avengers (2012)': ['Action', 'Sci-Fi'], 'Star Wars: Episode VIII - The Last Jedi (2017)': ['Action', 'Adventure', 'Fantasy'

### Save the genres dictionary to a .txt file for usage later

In [None]:
# Persist the title -> genres mapping to disk as JSON text
with open("title_genres.txt", "w") as genres_file:
    json.dump(movies_and_genres, genres_file)

## Scraping from Rotten Tomatoes for ratings of top 200 movies

### Retrieve the main page first

In [None]:
# Fetch the Rotten Tomatoes "best movies of all time" editorial guide.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
page = requests.get(
    "https://editorial.rottentomatoes.com/guide/best-movies-of-all-time/",
    timeout=30,
)
# Fail fast on an HTTP error status instead of silently parsing an error page
page.raise_for_status()
# Parse the raw HTML into a searchable BeautifulSoup tree
soup = BeautifulSoup(page.content, "html.parser")
# Uncomment to inspect the parsed HTML as text
# print(soup.prettify())

### Read the ratings data and store it into a dictionary

In [1]:
# Map "<title> <year>" to {"rating": <score as an integer percentage>}
titles_ratings = {}

# Loop through each table row; only rows holding a movie entry are used
for row in soup.find_all('tr'):
    movie_data = row.find('p', class_='apple-news-link-wrap movie')
    if movie_data is None:
        continue

    title_tag = movie_data.find('a', class_='title')
    score_tag = movie_data.find('span', class_='score')
    year_tag = movie_data.find('span', class_='year')
    # Skip incomplete rows instead of crashing with AttributeError on a missing element
    if title_tag is None or year_tag is None or score_tag is None or score_tag.strong is None:
        continue

    score_text = score_tag.strong.get_text(strip=True).strip("%")
    # Skip scores that are not plain numbers, which int() could not parse
    if not score_text.isdigit():
        continue

    # The year is appended to the title to form the dictionary key
    title = title_tag.get_text(strip=True) + " " + year_tag.get_text(strip=True)
    titles_ratings[title] = {"rating": int(score_text)}

# Print results
print(titles_ratings)

NameError: name 'soup' is not defined

### Save the ratings dictionary into a .txt file for later use

In [None]:
# Persist the title -> rating mapping to disk as JSON text
with open('title_ratings.txt', 'w') as ratings_file:
    json.dump(titles_ratings, ratings_file)

## Combine the genre data and ratings data into one dictionary for recommendations

### Pull in the titles, genres, and ratings

In [None]:
# Load the saved ratings (title -> {"rating": score}) and display them
with open("title_ratings.txt", "r") as ratings_file:
    movie_ratings = json.loads(ratings_file.read())
    print(movie_ratings)

# Load the saved genres (title -> list of genre names)
with open("title_genres.txt", "r") as genres_file:
    movie_genres = json.loads(genres_file.read())

### Merge all of the data into one dictionary

In [None]:
# Merge the two datasets: keep only titles present in BOTH dictionaries and
# pair each one with its rating and its genre list
ratings_and_genres = {
    name: {"rating": movie_ratings[name]['rating'], "genres": genre_names}
    for name, genre_names in movie_genres.items()
    if name in movie_ratings
}

# Persist the merged title -> {rating, genres} mapping to disk as JSON text
with open('ratings_genres.txt', 'w') as merged_file:
    json.dump(ratings_and_genres, merged_file)

# TEST print the ratings and genres
print(ratings_and_genres)