In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

# Name of the HTML parser backend used when building BeautifulSoup objects.
parser = 'html.parser'

# HTTP headers sent with every request; the User-Agent imitates desktop Chrome.
# NOTE(review): a Content-Type header on body-less GET requests is unusual — confirm it is needed.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0.0.0 Safari/537.36'
    ),
    'Content-Type': 'text/html; charset=UTF-8',
}

In [34]:
import Levenshtein

def normalized_levenshtein(s1, s2):
    """Return a length-normalised similarity between two strings.

    The edit distance reported by ``Levenshtein.distance`` is divided by the
    length of the longer input and subtracted from 1.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        ``1 - distance / max(len(s1), len(s2))``, or ``None`` when both
        inputs are empty (there is no length to normalise by).
    """
    longest = len(s1) if len(s1) >= len(s2) else len(s2)
    if not longest:
        # Both inputs are empty: dividing by the length would be 0/0.
        return None

    edit_distance = Levenshtein.distance(s1, s2)
    return 1 - edit_distance / longest

def get_best_match(soup, target_name, target_year, similarity_threshold=0.8, max_year_distance=3):
    """Pick the search-result row that best matches a title and release year.

    Every ``search-page-media-row`` element in ``soup`` is considered. A row is
    a candidate only if its ``releaseyear`` attribute is within
    ``max_year_distance`` years of ``target_year`` and its title similarity
    (see ``normalized_levenshtein``) is strictly above ``similarity_threshold``.
    Among the candidates the one with the highest similarity wins; on ties the
    first one encountered is kept.

    Args:
        soup: Parsed search page exposing ``find_all``.
        target_name: Title to look for.
        target_year: Expected release year (anything ``int()`` accepts).
        similarity_threshold: Minimum (exclusive) title similarity.
        max_year_distance: Maximum (inclusive) allowed year difference.

    Returns:
        The ``href`` of the winning row's ``thumbnail-link`` anchor, or ``None``
        when no row qualifies, the anchor/attribute is missing, or
        ``target_year`` cannot be interpreted as an integer.
    """
    try:
        wanted_year = int(target_year)
    except (TypeError, ValueError, OverflowError):
        # Without a usable target year no row can pass the year filter.
        return None

    closest_match = None
    best_similarity = -float('inf')

    for row in soup.find_all('search-page-media-row'):
        # Rows without a title anchor cannot be compared; skip instead of crashing.
        name_anchor = row.find('a', {'data-qa': 'info-name'})
        if name_anchor is None:
            continue
        movie_name = name_anchor.get_text().strip()

        # A missing or non-numeric release year is treated as "too far away".
        release_year = row.get('releaseyear')
        try:
            year_distance = abs(wanted_year - int(release_year))
        except (TypeError, ValueError):
            continue
        if year_distance > max_year_distance:
            continue

        name_similarity = normalized_levenshtein(target_name, movie_name)
        if name_similarity is None:
            # Both names were empty, so there is nothing to compare.
            continue

        # Keep the row only if it clears the threshold and beats the current best.
        if name_similarity > similarity_threshold and name_similarity > best_similarity:
            best_similarity = name_similarity
            closest_match = row

    if closest_match is None:
        return None

    match_anchor = closest_match.find('a', {'data-qa': 'thumbnail-link'})
    if match_anchor is None:
        return None
    # .get avoids a KeyError when the anchor carries no href attribute.
    return match_anchor.get('href')

In [37]:
import json

def get_scores(href,soup):
    """Extract audience and tomatometer figures from a parsed movie page.

    The page is expected to contain a ``<script id="scoreDetails">`` element
    whose text is JSON with a ``scoreboard`` object holding ``audienceScore``
    and ``tomatometerScore`` sub-objects.

    Args:
        href: URL of the page; used only in the diagnostic message.
        soup: Parsed page exposing ``find`` (``None`` is tolerated).

    Returns:
        A dict with fourteen keys (``audience_*`` followed by ``tomato_*``).
        Fields absent from the JSON are ``None``. Returns ``None`` when the
        script tag is missing, its content is not a JSON object, or the
        scoreboard is missing/empty.
    """
    try:
        # Find the script tag and parse its JSON payload.
        script_tag = soup.find('script', {'id': 'scoreDetails'})
        json_data = json.loads(script_tag.text)
    except (AttributeError, TypeError, ValueError):
        # AttributeError: soup or tag is None; TypeError: text is not a string;
        # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
        print(f'Could not extract JSON-data for href: {href}')
        return None

    if not isinstance(json_data, dict):
        # Valid JSON but not an object, so there is no scoreboard to read.
        print(f'Could not extract JSON-data for href: {href}')
        return None

    scoreboard = json_data.get('scoreboard')
    if not scoreboard or not isinstance(scoreboard, dict):
        return None

    # (output-key suffix, JSON field) pairs, in the order the columns should appear.
    field_map = (
        ('banded_rating_count', 'bandedRatingCount'),
        ('rating_count', 'ratingCount'),
        ('liked_count', 'likedCount'),
        ('not_liked_count', 'notLikedCount'),
        ('review_count', 'reviewCount'),
        ('avg_rating', 'averageRating'),
        ('score', 'value'),
    )
    # (output-key prefix, scoreboard section) pairs.
    sections = (
        ('audience', 'audienceScore'),
        ('tomato', 'tomatometerScore'),
    )

    scores = {}
    for prefix, section_key in sections:
        section = scoreboard.get(section_key)
        if not isinstance(section, dict):
            # Missing or malformed section: every field of it becomes None.
            section = {}
        for suffix, json_key in field_map:
            scores[f'{prefix}_{suffix}'] = section.get(json_key)

    return scores


In [61]:
def scrape_page_rt(imdb_id, movie_name, release_year):
    """Look a movie up on Rotten Tomatoes and collect its score details.

    The function searches the site for ``movie_name``, selects the best
    matching result with ``get_best_match``, downloads that result's page and
    reads its scores with ``get_scores``.

    Args:
        imdb_id: Identifier copied unchanged into the returned dict.
        movie_name: Title used both as the search term and for matching.
        release_year: Release year used to filter search results.

    Returns:
        A dict with ``imdb_id``, ``movie_name``, ``release_year`` and the
        score fields, or ``None`` if any step fails (request error, no
        suitable match, or no score data on the page).
    """
    search_term = quote(movie_name)
    url = f'https://www.rottentomatoes.com/search?search={search_term}'
    print(url)

    def request_soup(url):
        """Download ``url`` and parse it; return ``None`` on any request failure."""
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException:
            # Covers connection errors, timeouts, bad URLs and non-2xx statuses
            # without hiding unrelated errors the way a bare except would.
            print(f'Error occured while scraping {url}')
            return None

        return BeautifulSoup(response.text, parser)

    search_soup = request_soup(url)
    if search_soup is None:
        return None

    href = get_best_match(search_soup, movie_name, release_year)
    if not href:
        return None

    page_soup = request_soup(href)
    if page_soup is None:
        # The download error was already reported; don't pass None on to
        # get_scores, which would log a misleading JSON extraction failure.
        return None

    json_data = get_scores(href, page_soup)
    if not json_data:
        return None

    movie_details = {'imdb_id': imdb_id,
                     'movie_name': movie_name,
                     'release_year': release_year,
                     }
    movie_details.update(json_data)

    return movie_details
    
# Single-title smoke test of the scraper before running it on the full table.
imdb_id, movie_name, release_year = 'tt696969', 'John and Mary', 1969

movie_details = scrape_page_rt(
    imdb_id,
    movie_name,
    release_year,
)

movie_details

https://www.rottentomatoes.com/search?search=John%20and%20Mary


{'imdb_id': 'tt696969',
 'movie_name': 'John and Mary',
 'release_year': 1969,
 'audience_banded_rating_count': '500+',
 'audience_rating_count': 506,
 'audience_liked_count': 58,
 'audience_not_liked_count': 53,
 'audience_review_count': 33,
 'audience_avg_rating': '3.4',
 'audience_score': 52,
 'tomato_banded_rating_count': '',
 'tomato_rating_count': 13,
 'tomato_liked_count': 5,
 'tomato_not_liked_count': 8,
 'tomato_review_count': 13,
 'tomato_avg_rating': '3.4',
 'tomato_score': 38}

In [46]:
import pandas as pd

# Read the semicolon-separated awards table, then replace its index with a fresh 0..n-1 range.
awards_df = pd.read_csv('Output/awards.csv', sep=';')
awards_df = awards_df.reset_index(drop=True)

awards_df

Unnamed: 0.1,Unnamed: 0,year_film,year_award,ceremony,category,win,movie_name,people,country,award,imdb_id,imdb_name,year
0,0,1965,1966,23,Best Screenplay - Motion Picture,True,Doctor Zhivago,['Robert Bolt'],,Golden Globe,tt0059113,Doctor Zhivago,1965
1,1,1965,1966,23,Best Screenplay - Motion Picture,False,The Agony and The Ecstasy,['Philip Dunne'],,Golden Globe,tt0058886,The Agony and the Ecstasy,1965
2,2,1965,1966,23,Best Screenplay - Motion Picture,False,The Collector,['John Kohn'],,Golden Globe,tt0844479,The Collector,1965
3,3,1965,1966,23,Best Screenplay - Motion Picture,False,A Patch of Blue,['Guy Green'],,Golden Globe,tt0059573,A Patch of Blue,1965
4,4,1965,1966,23,Best Screenplay - Motion Picture,False,The Slender Thread,['Stirling Silliphant'],,Golden Globe,tt0059729,The Slender Thread,1965
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1686,1686,2000,2001,52,Film | Original Screenplay,True,Almost Famous,['Cameron Crowe'],,BAFTA,tt0181875,Almost Famous,2000
1687,1687,2000,2001,52,Film | Original Screenplay,False,Billy Elliot,['Lee Hall'],,BAFTA,tt0249462,Billy Elliot,2000
1688,1688,2000,2001,52,Film | Original Screenplay,False,Erin Brockovich,['Susannah Grant'],,BAFTA,tt0195685,Erin Brockovich,2000
1689,1689,2000,2001,52,Film | Original Screenplay,False,Gladiator,"['David Franzoni', ' John Logan', ' William Ni...",,BAFTA,tt0172495,Gladiator,2000


In [57]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Scrape every awards row concurrently and collect the successful results.
results = []
failed = 0

# One lookup per awards row: (IMDb id, IMDb title, release year).
records = awards_df[['imdb_id','imdb_name','year']].to_dict(orient='records')

with ThreadPoolExecutor(max_workers=14) as executor:
    # Map each future back to its input record so failures can be attributed.
    future_to_record = {
        executor.submit(scrape_page_rt, record['imdb_id'], record['imdb_name'], record['year']): record
        for record in records
    }

    for future in as_completed(future_to_record):
        try:
            result = future.result()
        except Exception as exc:
            # future.result() re-raises whatever the worker raised. Without this
            # guard a single bad page would abort the loop and throw away every
            # result gathered so far.
            source = future_to_record[future]
            print(f"Scraping failed for {source['imdb_id']}: {exc!r}")
            failed += 1
            continue

        if result is not None:
            results.append(result)
        else:
            failed += 1

# Creating a new DataFrame from the processed data
rottentomatoes_df = pd.DataFrame(results)

https://www.rottentomatoes.com/search?search=Doctor%20Zhivago
https://www.rottentomatoes.com/search?search=The%20Agony%20and%20the%20Ecstasy
https://www.rottentomatoes.com/search?search=The%20Collector
https://www.rottentomatoes.com/search?search=A%20Patch%20of%20Blue
https://www.rottentomatoes.com/search?search=The%20Slender%20Thread
https://www.rottentomatoes.com/search?search=A%20Man%20for%20All%20Seasons
https://www.rottentomatoes.com/search?search=The%20Sand%20Pebbles
https://www.rottentomatoes.com/search?search=Who%27s%20Afraid%20of%20Virginia%20Woolf%3F
https://www.rottentomatoes.com/search?search=Alfie
https://www.rottentomatoes.com/search?search=The%20Russians%20Are%20Coming%20the%20Russians%20Are%20Coming
https://www.rottentomatoes.com/search?search=In%20the%20Heat%20of%20the%20Night
https://www.rottentomatoes.com/search?search=Bonnie%20and%20Clyde
https://www.rottentomatoes.com/search?search=The%20Fox%20and%20the%20Hound
https://www.rottentomatoes.com/search?search=The%20Gra

In [58]:
# Display the scraped scores: one row per record for which scrape_page_rt returned a result.
rottentomatoes_df

Unnamed: 0,imdb_id,movie_name,release_year,audience_banded_rating_count,audience_rating_count,audience_liked_count,audience_not_liked_count,audience_review_count,audience_avg_rating,audience_score,tomato_banded_rating_count,tomato_rating_count,tomato_liked_count,tomato_not_liked_count,tomato_review_count,tomato_avg_rating,tomato_score
0,tt0059729,The Slender Thread,1965,250+,367,46,16,21,3.8,74.0,,7,5,2,7,3.8,71.0
1,tt0094484,In the Heat of the Night,1967,"10,000+",23329,3645,306,1196,4.2,92.0,,90,86,4,90,4.2,96.0
2,tt0058886,The Agony and the Ecstasy,1965,"2,500+",3573,360,100,129,3.8,78.0,,7,6,1,7,3.8,86.0
3,tt0059573,A Patch of Blue,1965,"5,000+",5805,715,72,286,4.3,91.0,,9,8,1,9,4.3,89.0
4,tt0060934,The Sand Pebbles,1966,"5,000+",5669,668,94,264,4.1,88.0,,19,16,3,19,4.1,84.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,tt0203119,Sexy Beast,2000,"25,000+",38732,5207,950,2194,4,85.0,,140,121,19,140,4,86.0
1268,tt0213847,Malèna,2000,"25,000+",29700,4557,838,1318,4,84.0,,78,42,36,78,4,54.0
1269,tt0190590,"O Brother, Where Art Thou?",2000,"250,000+",257029,28570,3642,11525,4.1,89.0,,156,122,34,156,4.1,78.0
1270,tt0172495,Gladiator,2000,"250,000+",34128168,170538,24401,1652336,3.8,87.0,,257,205,52,257,3.8,80.0


In [59]:
from pathlib import Path

# Create the target folder from Python rather than via `!mkdir`: this works on
# any OS and does not error when the cell is re-run and the folder already exists.
output_dir = Path('Output/tomato_scores')
output_dir.mkdir(parents=True, exist_ok=True)

rottentomatoes_df.to_csv(output_dir / 'tomato1.csv')