In [22]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import os
import json
import re

## Movies

In [None]:
def scrap_movies():
    url = "https://www.imdb.com/search/title/?groups=top_1000&start={start}"

    data = {
        "id": [], 
        "title": [],
        "year": [],
        "duration": [],
        "genres": [],
        "number_of_ratings": [],
        "rating": [],
        "director": [],
        "description": [],
    }

    while True:
        start = len(data["title"]) + 1

        response = requests.get(url.format(start=start))

        soup = BeautifulSoup(response.content, "html.parser")

        movies = soup.find_all(class_="lister-item-content")

        if not movies or len(movies) == 0:
            break

        for movie in movies:
            data["id"].append(movie.find("a")["href"].split("/")[2])
            data["title"].append(movie.find("a").get_text())
            data["year"].append(movie.find(class_="lister-item-year").get_text().replace("(", "").replace(")", ""))
            data["duration"].append(movie.find(class_="runtime").get_text())
            data["genres"].append(movie.find(class_="genre").get_text().strip())
            data["number_of_ratings"].append(
                movie.find(class_="sort-num_votes-visible").find_all("span")[1].get_text()
            )
            data["rating"].append(movie.find(class_="ratings-imdb-rating").get_text().strip())
            data["director"].append(movie.find_all("p")[2].find("a").get_text())
            data["description"].append(movie.find_all("p", class_="text-muted")[-1].get_text().strip())

    df = pd.DataFrame(data)

    df["rating"] = df["rating"].astype(float)
    df['number_of_ratings'] = df['number_of_ratings'].str.replace(',', '').astype(int)

    return df

In [None]:
movies = scrap_movies()

In [None]:
movies.to_csv("data/movies.csv", index=False)

In [None]:
movies = pd.read_csv("data/movies.csv")

## Reviews

In [None]:
def scrap_movie_reviews(movieID, max_pages=None):
    url = (
        "https://www.imdb.com/title/{movieID}/reviews/_ajax?paginationKey={}"
    )
    key = ""
    data = {"movie": [], "review": [], "rating": [], "date": [], "user": []}

    response = requests.get("https://www.imdb.com/title/{movieID}/reviews".format(movieID = movieID))
    soup = BeautifulSoup(response.content, "html.parser")

    total_reviews = int(soup.find(class_="lister").find(class_="header").find("span").get_text().split()[0].replace(',', ''))
    
    pbar2 = tqdm(total=total_reviews, position=1, leave=True)

    i = 0
    while True:
        i += 1
        if max_pages and i > max_pages:
            break
        response = requests.get(url.format(key, movieID = movieID))
        soup = BeautifulSoup(response.content, "html.parser")
        
        for review, rating, date, user in zip(
            soup.find_all(class_="text show-more__control"), soup.find_all(class_="rating-other-user-rating"), soup.find_all(class_="review-date"), soup.find_all(class_="display-name-link")
        ):
            data["movie"].append(movieID)
            data["review"].append(review.get_text())
            data["rating"].append(rating.find("span").get_text(strip=True))
            data["date"].append(date.get_text(strip=True))
            data["user"].append(user.find("a")["href"].split("/")[2])

        # Find the pagination key
        pagination_key = soup.find("div", class_="load-more-data")
        if not pagination_key:
            break
        
        # Update the `key` variable in-order to scrape more reviews
        key = pagination_key["data-key"]
        
        pbar2.update(25)

    df = pd.DataFrame(data)
    df['rating'] = df['rating'].astype(int)
    
    return df

In [None]:
def scrap_reviews(movies, max_pages=None, reviews_threshold=10000):
    reviews = pd.DataFrame()
    
    files = os.listdir("data/reviews")

    already_scraped_movies = pd.Series()

    if len(files) > 0:
        for file in files:
            file_reviews = pd.read_csv("data/reviews/{}".format(file))
            already_scraped_movies = pd.concat([already_scraped_movies, file_reviews['movie']])

    already_scraped_movies = already_scraped_movies.unique()

    print("Already scraped {}/{} movies".format(len(already_scraped_movies), len(movies)))

    movies = movies[~movies["id"].isin(already_scraped_movies)]

    if len(movies) == 0:
        print("All movies have already been scraped")
        return None

    pbar1 = tqdm(total=len(movies), position=0, leave=True)
    

    for movieID in movies["id"]:

        print('Scraping reviews for movie {}'.format(movies[movies['id'] == movieID]['title'].values[0]))

        reviews = pd.concat([reviews, scrap_movie_reviews(movieID, max_pages=max_pages)])

        if len(reviews) > reviews_threshold:
            # create a file "review" + i + ".csv" with i the number of files in the reviews folder
            files = os.listdir("data/reviews")
            file_number = len(files) + 1
            reviews.to_csv("data/reviews/reviews{}.csv".format(file_number), index=False)
            
            print("Scraped {} reviews".format(len(reviews)))
            print("Saved to data/reviews/reviews{}.csv".format(file_number))
            
            # reset the reviews dataframe
            reviews = pd.DataFrame()
        pbar1.update(1)

    # append the reviews to the csv file
    if len(reviews) > 0:
        files = os.listdir("data/reviews")
        file_number = len(files) + 1
        reviews.to_csv("data/reviews/reviews{}.csv".format(file_number), index=False)

    return None

In [None]:
scrap_reviews(movies, max_pages=None, reviews_threshold=20000)

## Casts

In [None]:
def scrap_movie_cast(movieID):
    url = f"https://www.imdb.com/title/{movieID}/fullcredits"

    data = {"movie": [], "actor": [], "character": []}

    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    castlist = soup.find_all("table", class_="cast_list")[0].find_all("tr")

    castlist = castlist[1:]

    for cast in castlist:
        if not("odd" in cast.attrs.get("class", []) or "even" in cast.attrs.get("class", [])):
            break

        data["movie"].append(movieID)
        data["actor"].append(cast.find_all("td")[1].find("a").get_text(strip=True))
        data["character"].append(cast.find("td", class_="character").get_text(strip=True))

    df = pd.DataFrame(data)

    return df



In [None]:
def scrap_casts(movies):
    casts = pd.DataFrame()

    for movieID in tqdm(movies["id"]):
        casts = pd.concat([casts, scrap_movie_cast(movieID)])
    return casts

In [None]:
casts = scrap_casts(movies)

In [None]:
casts.to_csv("data/casts.csv", index=False)

## Summaries

In [None]:
def scrap_movie_summary(movieID):
    url = f"https://www.imdb.com/title/{movieID}/plotsummary"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    synopsis = soup.find_all('section', class_="ipc-page-section ipc-page-section--base")[1].find(class_="ipc-metadata-list-item__content-container")
    if synopsis:
        synopsis = synopsis.get_text(strip=True)

    return synopsis

In [3]:
def scrap_summaries(movies):
    summaries = {
        "movie": [],
        "summary": []
    }

    for movieID in tqdm(movies["id"]):
        summary = scrap_movie_summary(movieID)

        if summary:
            summaries["movie"].append(movieID)
            summaries["summary"].append(summary)

    df = pd.DataFrame(summaries)

    return df

In [None]:
summaries = scrap_summaries(movies)

In [None]:
summaries.to_csv("data/summaries.csv", index=False)

## Critics Reviews

In [67]:
def scrap_metacritics(url):

    user_agent = {'User-agent': 'Mozilla/5.0'}
    response  = requests.get(url, headers = user_agent)
    
    soup = BeautifulSoup(response.content, "html.parser")

    global_score = soup.find(class_="c-ScoreCardLeft_scoreContent_number").get_text(strip=True)
    
    reviews = soup.find_all(class_="c-siteReview_main")

    script_tag = soup.find('script', text=lambda t: t and 'window.__NUXT__' in t)

    # Step 1: Extract the relevant JavaScript object
    match = re.search(r'j\.components\s*=\s*(\[.*?\])\s*;', script_tag.text, re.DOTALL)

    if match:
        components_str = match.group(1)

        # Function to preprocess and fix the JSON-like string
        def preprocess_json_like_string(components_str):
            # Replace JavaScript-specific notations with JSON notations
            print(components_str)
            components_str = components_str.replace("\u002F", "/")  # Replace unicode characters
            print(components_str)
            components_str = re.sub(r'(?<!https:)\"?(\w+)\"?:', r'"\1":', components_str)  # Add quotes to keys
            
            # Add any additional substitutions needed here
            return components_str

        # Preprocess the string
        preprocessed_str = preprocess_json_like_string(components_str)

        print(preprocessed_str)

        # Parse the string as JSON
        try:
            components = json.loads(preprocessed_str)
            print("Components:", components)
            # Extract the reviews
            for component in components:
                if component.get("meta", {}).get("componentDisplayName") == "critic Reviews":
                    reviews = component.get("reviews", [])
                    print("Reviews:", reviews)
                    break
            else:
                print("Component 'critic Reviews' not found")
        except json.JSONDecodeError as e:
            print("Error parsing JSON:", e)
    
    else:
        print("j.components not found in the string")


    reviews_data = {"publisher": [], "author": [], "rating": []}

    for review in reviews:
        score = review.find(class_="c-siteReviewHeader_reviewScore").get_text(strip=True)
        publisher = review.find(class_="c-siteReviewHeader_publisherLogo").get_text(strip=True)
        author = review.find(class_="c-siteReview_criticName").get_text(strip=True).split('By')[1]

        reviews_data["publisher"].append(publisher)
        reviews_data["author"].append(author)
        reviews_data["rating"].append(score)    

    reviews_df = pd.DataFrame(reviews_data)

    return global_score, reviews_df

In [68]:
scrap_metacritics("https://www.metacritic.com/movie/the-dark-knight/critic-reviews/")

[{id:b,meta:{componentName:"product",componentDisplayName:"Product",componentType:"Product"},metadata:{},links:{self:{href:"https:\u002F\u002Ffandom-prod.apigee.net\u002Fv1\u002Fxapi\u002Fmovies\u002Fmetacritic\u002Fthe-dark-knight\u002Fweb?apiKey=1MOZgmNFxvmljaQR1X9KAij9Mo4xAY3u&componentName=product&componentDisplayName=Product&componentType=Product"}},item:{awards:[{awardEvent:"Academy Awards, USA",awardYear:c,wins:2,nominations:8},{awardEvent:"Golden Globes, USA",awardYear:c,wins:1,nominations:1},{awardEvent:"Golden Schmoes Awards",awardYear:c,wins:13,nominations:17}],production:{companies:[{id:4000002663,typeId:3,typeName:"Production Company",name:"Warner Bros.",url:"\u002Fcompany\u002Fwarner-bros\u002F",image:c},{id:4000159111,typeId:3,typeName:"Production Company",name:"Legendary Entertainment",url:"\u002Fcompany\u002Flegendary-entertainment\u002F",image:c},{id:4000147954,typeId:3,typeName:"Production Company",name:"Syncopy",url:"\u002Fcompany\u002Fsyncopy\u002F",image:c},{id:40

  script_tag = soup.find('script', text=lambda t: t and 'window.__NUXT__' in t)


('84',
 Empty DataFrame
 Columns: [publisher, author, rating]
 Index: [])