In [1]:
import os
import datetime
import re

import pandas as pd
import requests
import time
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import random

import warnings

1. Grabbing reviews from Rotten Tomatoes

In [10]:
# Oscar-awarded movies: display title -> Rotten Tomatoes critic-reviews URL.
# Every URL has the same shape, so only the per-movie slug is listed and the
# full address is assembled in the comprehension.
movies = {
    title: f'https://www.rottentomatoes.com/m/{slug}/reviews'
    for title, slug in [
        ('Everything Everywhere All at Once', 'everything_everywhere_all_at_once'),
        ('CODA', 'coda_2021'),
        ('Nomadland', 'nomadland'),
        ('Parasite', 'parasite_2019'),
        ('Greenbook', 'green_book'),
        ('The Shape of Water', 'the_shape_of_water_2017'),
        ('Moonlight', 'moonlight_2016'),
        ('Spotlight', 'spotlight_2015'),
        ('Birdman', 'birdman_2014'),
        ('12 Years a Slave', '12_years_a_slave'),
        ('Argo', 'argo_2012'),
        ('The Artist', 'the_artist'),
        ('The Kings Speech', 'the_kings_speech'),
        ('The Hurt Locker', 'the_hurt_locker'),
        ('Slumdog Millionaire', 'slumdog_millionaire'),
    ]
}

In [19]:
# Popular Rotten Tomatoes movies: display title -> critic-reviews URL.
# Only the slug differs between URLs, so the address is built from a
# (title, slug) listing instead of being spelled out for every entry.
movies = {
    title: f'https://www.rottentomatoes.com/m/{slug}/reviews'
    for title, slug in [
        ('Reptile', 'reptile_2023'),
        ('The Wonderful Story of Henry Sugar', 'the_wonderful_story_of_henry_sugar'),
        ('No One Will Save You', 'no_one_will_save_you'),
        ('The Beasts', 'the_beasts'),
        ('Talk to Me', 'talk_to_me_2023'),
        ('Gran Turismo', 'gran_turismo_based_on_a_true_story'),
        ('Blue Beetle', 'blue_beetle'),
        ('Bottoms', 'bottoms'),
        ('A Million Miles Away', 'a_million_miles_away_2023'),
        ('Flora and Son', 'flora_and_son'),
        ('Guy Ritchies the Convenant', 'guy_ritchies_the_covenant'),
        ('Nowhere', 'nowhere_2023_2'),
        ('Barbie', 'barbie'),
        ('The Machine', 'the_machine_2023'),
        ('Past Lives', 'past_lives'),
        ('Stop Making Sense', 'stop_making_sense'),
        ('Saw', 'saw'),
        ('Megalomaniac', 'megalomaniac'),
        ('Elemental', 'elemental_2023'),
        ('Spider Man: Across the Spider-Verse', 'spider_man_across_the_spider_verse'),
    ]
}

In [20]:
def _tag_text(parent, *find_args, **find_kwargs):
    """Return the stripped text of the first matching descendant of `parent`.

    Arguments are forwarded unchanged to `parent.find`. Returns '' when no tag
    matches, so one incomplete review row cannot abort the whole scrape.
    """
    tag = parent.find(*find_args, **find_kwargs)
    return tag.text.strip() if tag is not None else ''


def _parse_original_score(score_line):
    """Return the first whitespace-delimited token after 'Original Score:'.

    Returns '' when the marker is absent or nothing follows it.
    """
    parts = score_line.split('Original Score:')
    tokens = parts[1].split() if len(parts) > 1 else []
    return tokens[0] if tokens else ''


def _parse_review(movie, review):
    """Build the record dictionary for one `review-row` element of `movie`."""
    score_icon = review.find('score-icon-critic-deprecated')
    return {
        'Movie': movie,
        'Critic Name': _tag_text(review, 'a', class_='display-name'),
        # The rating is read from the icon's `state` attribute; None if the icon is missing.
        'Critic Rating': score_icon.get('state') if score_icon is not None else None,
        'Critic Original Score': _parse_original_score(
            _tag_text(review, 'p', class_='original-score-and-url')
        ),
        'Critic Text': _tag_text(review, 'p', class_='review-text'),
        'Critic Date': _tag_text(review, 'span', attrs={'data-qa': 'review-date'}),
    }


# One record per critic review, accumulated across every movie in `movies`.
movie_data = []

for movie, url in movies.items():

    # A single GET is issued per movie; the timeout keeps a stalled connection
    # from hanging the notebook indefinitely.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data for {movie}. Error: {exc}\n")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve data for {movie}. Status code: {response.status_code}\n")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')
    critic_reviews_section = soup.find('div', class_='layout reviews-page-container')

    if critic_reviews_section is None:
        print(f"No critic reviews found for {movie}.\n")
        continue

    critic_reviews = critic_reviews_section.find_all('div', class_="review-row")
    movie_data.extend(_parse_review(movie, review) for review in critic_reviews)

# create dataframe with all rotten tomato reviews
top_df = pd.DataFrame(movie_data)

1.1 Run the scraping cell twice (once per `movies` dictionary) to build 2 separate dataframes

In [26]:
# Output folder for every CSV this notebook writes (relative to the working directory).
folder_path = 'datasets'

# exist_ok=True creates the folder when missing and is a no-op otherwise,
# replacing the separate exists-then-create check.
os.makedirs(folder_path, exist_ok=True)

# NOTE(review): `merge_rt` is not defined in any cell shown above this one;
# presumably it combines the dataframes from the two scraping runs. Confirm it
# is created before this cell when running on a fresh kernel.
csv_file = os.path.join(folder_path, 'rt_data.csv')
merge_rt.to_csv(csv_file, index=False)

print(f'DataFrame saved as {csv_file}')

DataFrame saved as datasets/rt_data.csv


2. Combine TMDB movie reviews into one dataframe

In [30]:
import numpy as np

# Load the TMDB exports from the relative `datasets` folder rather than a
# user-specific absolute path, so the notebook runs on other machines.
# Assumes the kernel's working directory is the project root, as the relative
# 'datasets' writes elsewhere in this notebook already do.
tmdb_dir = 'datasets'
reviews_df = pd.read_csv(os.path.join(tmdb_dir, 'tmdb_df_reviews.csv'))
movies_df = pd.read_csv(os.path.join(tmdb_dir, 'tmdb_df_movies.csv'))

# Attach each review to its movie title (inner join: reviews without a matching
# movie id are dropped), rename to the shared column names, and add an empty
# `sentiment` column to be filled in later.
merged_df = (
    reviews_df[['movie_id', 'rating', 'content']]
    .merge(movies_df[['id', 'title']], left_on='movie_id', right_on='id', how='inner')
    .rename(columns={'rating': 'review_rating', 'content': 'review_content', 'title': 'movie_title'})
    .assign(sentiment=np.nan)
)
merged_df.head()

Unnamed: 0,movie_id,review_rating,review_content,id,movie_title,sentiment
0,565770,7.0,MORE SPOILER-FREE MINI-REVIEWS @ https://www.m...,565770,Blue Beetle,
1,565770,,"⁃ Some months before release: 'Nah, it looks t...",565770,Blue Beetle,
2,565770,6.0,"Maybe this should just have been called the ""B...",565770,Blue Beetle,
3,565770,7.0,Blue beetle is very awesome!! WoW!👏🏻👏🏻👍🏻👍🏻,565770,Blue Beetle,
4,565770,6.0,The Good: Light-hearted and family centric. Lo...,565770,Blue Beetle,


In [31]:
# Output folder for the merged TMDB reviews (relative to the working directory).
folder_path = 'datasets'

# exist_ok=True creates the folder when missing and is a no-op otherwise,
# replacing the separate exists-then-create check.
os.makedirs(folder_path, exist_ok=True)

csv_file = os.path.join(folder_path, 'tmbd_data.csv')
merged_df.to_csv(csv_file, index=False)

print(f'DataFrame saved as {csv_file}')

DataFrame saved as datasets/tmbd_data.csv


3. Combine TMDB and Rotten Tomatoes reviews

In [51]:
# Reload the two CSVs written earlier in this notebook. They were saved under
# the relative `datasets` folder, so they are read back from the same relative
# location instead of a user-specific absolute path.
tmbd = pd.read_csv(os.path.join('datasets', 'tmbd_data.csv'))
rt = pd.read_csv(os.path.join('datasets', 'rt_data.csv'))

In [52]:
# Columns both sources must expose before they can be stacked.
shared_columns = ['movie_title', 'review_rating', 'review_content', 'sentiment']

# Rotten Tomatoes column names -> the shared names used by the TMDB frame.
rt_column_map = {
    'Movie': 'movie_title',
    'Critic Original Score': 'review_rating',
    'Critic Text': 'review_content',
    'Critic Rating': 'sentiment',
}

rt = rt.rename(columns=rt_column_map)[shared_columns]
tmbd = tmbd[shared_columns]

# TMDB rows first, then Rotten Tomatoes rows, with a fresh 0..n-1 index.
all_df = pd.concat([tmbd, rt], ignore_index=True)

4. Pre-processing

In [56]:
# Coerce ratings to numbers *before* dropping rows: a non-numeric score string
# becomes NaN, so a row with neither a usable rating nor a sentiment label is
# removed here instead of surviving and later being labelled from a NaN rating.
all_df = (
    all_df
    .assign(review_rating=lambda df: pd.to_numeric(df['review_rating'], errors='coerce'))
    .dropna(subset=['review_rating', 'sentiment'], how='all')
)

# Rows that still need a sentiment label. Taken as an explicit copy so later
# writes to `subset` do not target a view of `all_df`.
subset = all_df[all_df['sentiment'].isna()].copy()

# Map TMDB numeric ratings onto Rotten Tomatoes-style labels.
def assign_sentiment(rating, threshold=5):
    """Label a numeric rating as 'fresh' or 'rotten'.

    Parameters
    ----------
    rating : number or missing value
        The review rating to classify.
    threshold : number, default 5
        Ratings greater than or equal to this value are 'fresh'.

    Returns
    -------
    str or float
        'fresh' when ``rating >= threshold``, 'rotten' when it is lower, and
        NaN when ``rating`` is missing (None/NaN), so an absent rating is left
        unlabelled instead of being counted as 'rotten'.
    """
    # NaN compares False against everything, which previously sent missing
    # ratings down the 'rotten' branch; handle them explicitly.
    if pd.isna(rating):
        return float('nan')
    return 'fresh' if rating >= threshold else 'rotten'

# Derive a label from the rating for every row that has no sentiment yet.
derived_sentiment = subset['review_rating'].apply(assign_sentiment)

# Rebuild `subset` with the derived labels via `assign` (a new frame) rather
# than writing into a filtered slice, then write the same labels into `all_df`
# by index.
subset = subset.assign(sentiment=derived_sentiment)
all_df.loc[subset.index, 'sentiment'] = derived_sentiment

# Translate the Rotten Tomatoes vocabulary into the final labels; values that
# are not keys of the mapping are left unchanged by `replace`.
sentiment_mapping = {'fresh': 'good', 'rotten': 'bad'}
all_df['sentiment'] = all_df['sentiment'].replace(sentiment_mapping)

In [57]:
# Write the combined, labelled reviews next to the other CSVs in `folder_path`.
all_csv_file = os.path.join(folder_path, 'all_tmbd_rt_data.csv')
all_df.to_csv(all_csv_file, index=False)
print(f'DataFrame saved as {all_csv_file}')

DataFrame saved as datasets/all_tmbd_rt_data.csv
