In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import json
import re
import urllib

In [2]:
def get_IDS(imdb_ids=[], freebase_ids=[]):
    '''
        Get the imdb_id, freebase_id and metacritic_id from the wikidata database

        Parameters
        ----------
        imdb_ids : str
            The imdb id of the movie
        freebase_id : str
            The freebase id of the movie

        Returns
        -------
        imdb_id : str
            The imdb id of the movie
        freebase_id : str
            The freebase id of the movie
        metacritic_id : str
            The metacritic id of the movie
    '''
    if len(imdb_ids) > 0:
        imdb_ids_string = " ".join(f'"{id_}"' for id_ in imdb_ids)

        query = f"""
            SELECT ?item ?freebaseId ?metacriticId ?imdbId WHERE {{
            VALUES ?imdbId {{ {imdb_ids_string} }}
            ?item wdt:P345 ?imdbId .
            OPTIONAL {{ ?item wdt:P646 ?freebaseId }}
            OPTIONAL {{ ?item wdt:P1712 ?metacriticId }}
            }}
        """

    elif len(freebase_ids) > 0:
        freebase_ids_string = " ".join(f'"{id_}"' for id_ in freebase_ids)

        query = f"""
            SELECT ?item ?freebaseId ?metacriticId ?imdbId WHERE {{
            VALUES ?freebaseId {{ {freebase_ids_string} }}
            ?item wdt:P646 ?freebaseId .
            OPTIONAL {{ ?item wdt:P1712 ?metacriticId }}
            OPTIONAL {{ ?item wdt:P345 ?imdbId }}
            }}
        """

    else:
        return None, None, None
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    encoded_query = urllib.parse.quote(query)
    query_url = f"https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query={encoded_query}"

    response = requests.get(query_url, headers=headers)
    
    if response.status_code != 200:
        return None, None, None
    
    data = response.json()

    if len(data["results"]["bindings"]) == 0:
        return None, None, None
    
    data = data["results"]["bindings"]
    
    imdb_ids = []
    freebase_ids = []
    metacritic_ids = []

    for item in data:
        if "freebaseId" in item:
            freebase_ids.append(item["freebaseId"]["value"])
        else:
            freebase_ids.append(None)
        
        if "metacriticId" in item:
            metacritic_ids.append(item["metacriticId"]["value"])
        else:
            metacritic_ids.append(None)

        if "imdbId" in item:
            imdb_ids.append(item["imdbId"]["value"])
        else:
            imdb_ids.append(None)

    return imdb_ids, freebase_ids, metacritic_ids


## Movies

In [None]:
def scrap_movies():
    url = "https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&start={start}"

    data = {
        "imdb_id": [], 
        "title": [],
        "year": [],
        "duration": [],
        "genres": [],
        "number_of_ratings": [],
        "rating": [],
        "director": [],
        "description": [],
        "freebase_id": [],
        "metacritic_id": [],
        "metascore": [],
    }

    pbar = tqdm(total=1000)

    while True:
        start = len(data["title"]) + 1

        response = requests.get(url.format(start=start))

        soup = BeautifulSoup(response.content, "html.parser")

        movies = soup.find_all(class_="lister-item-content")

        if not movies or len(movies) == 0:
            break

        for movie in movies:
            pbar.update(1)
            imdb_id = movie.find("a")["href"].split("/")[2]
            data["imdb_id"].append(imdb_id)
            data["title"].append(movie.find("a").get_text())

            year_text = movie.find(class_="lister-item-year").get_text().replace("(", "").replace(")", "")
            year_match = re.search(r'\d{4}', year_text)

            if year_match:
                year = year_match.group()
            else:
                year = None
        
            data["year"].append(year)
            data["duration"].append(movie.find(class_="runtime").get_text())
            data["genres"].append(movie.find(class_="genre").get_text().strip())
            data["number_of_ratings"].append(
                movie.find(class_="sort-num_votes-visible").find_all("span")[1].get_text()
            )
            data["rating"].append(movie.find(class_="ratings-imdb-rating").get_text().strip())
            data["director"].append(movie.find_all("p")[2].find("a").get_text())
            data["description"].append(movie.find_all("p", class_="text-muted")[-1].get_text().strip())
            metascore = movie.find("span", class_="metascore")
            if metascore:
                data["metascore"].append(int(metascore.get_text().strip()))
            else:
                data["metascore"].append(None)

            _, freebase_ids, metacritic_ids = get_IDS(imdb_ids=[imdb_id])

            freebase_id = freebase_ids[0] if freebase_ids else None
            metacritic_id = metacritic_ids[0] if metacritic_ids else None

            data["freebase_id"].append(freebase_id)
            data["metacritic_id"].append(metacritic_id)

    df = pd.DataFrame(data)

    pbar.close()

    df["rating"] = df["rating"].astype(float)
    df['number_of_ratings'] = df['number_of_ratings'].str.replace(',', '').astype(int)

    return df

In [None]:
movies = scrap_movies()

In [None]:
movies.to_csv("data/scrap/top1000_IMDB_movies.csv", index=False)

## Reviews

In [None]:
def scrap_movie_reviews(movieID, max_pages=None):
    url = (
        "https://www.imdb.com/title/{movieID}/reviews/_ajax?paginationKey={}"
    )
    key = ""
    data = {"imdb_id": [], "review": [], "rating": [], "date": [], "user": []}

    response = requests.get("https://www.imdb.com/title/{movieID}/reviews".format(movieID = movieID))
    soup = BeautifulSoup(response.content, "html.parser")

    total_reviews = int(soup.find(class_="lister").find(class_="header").find("span").get_text().split()[0].replace(',', ''))
    
    pbar2 = tqdm(total=total_reviews, position=1, leave=True)

    i = 0
    while True:
        i += 1
        if max_pages and i > max_pages:
            break
        response = requests.get(url.format(key, movieID = movieID))
        soup = BeautifulSoup(response.content, "html.parser")
        
        for review, rating, date, user in zip(
            soup.find_all(class_="text show-more__control"), soup.find_all(class_="rating-other-user-rating"), soup.find_all(class_="review-date"), soup.find_all(class_="display-name-link")
        ):
            data["imdb_id"].append(movieID)
            data["review"].append(review.get_text())
            data["rating"].append(rating.find("span").get_text(strip=True))
            data["date"].append(date.get_text(strip=True))
            data["user"].append(user.find("a")["href"].split("/")[2])

        # Find the pagination key
        pagination_key = soup.find("div", class_="load-more-data")
        if not pagination_key:
            break
        
        # Update the `key` variable in-order to scrape more reviews
        key = pagination_key["data-key"]
        
        pbar2.update(25)

    df = pd.DataFrame(data)
    df['rating'] = df['rating'].astype(int)
    
    return df

In [None]:
def scrap_reviews(movies, max_pages=None, reviews_threshold=10000):
    reviews = pd.DataFrame()
    
    files = os.listdir("data/reviews")

    already_scraped_movies = pd.Series()

    if len(files) > 0:
        for file in files:
            file_reviews = pd.read_csv("data/reviews/{}".format(file))
            already_scraped_movies = pd.concat([already_scraped_movies, file_reviews['movie']])

    already_scraped_movies = already_scraped_movies.unique()

    print("Already scraped {}/{} movies".format(len(already_scraped_movies), len(movies)))

    movies = movies[~movies["id"].isin(already_scraped_movies)]

    if len(movies) == 0:
        print("All movies have already been scraped")
        return None

    pbar1 = tqdm(total=len(movies), position=0, leave=True)
    

    for movieID in movies["imdb_id"]:

        print('Scraping reviews for movie {}'.format(movies[movies['imdb_id'] == movieID]['title'].values[0]))

        reviews = pd.concat([reviews, scrap_movie_reviews(movieID, max_pages=max_pages)])

        if len(reviews) > reviews_threshold:
            # create a file "review" + i + ".csv" with i the number of files in the reviews folder
            files = os.listdir("data/reviews")
            file_number = len(files) + 1
            reviews.to_csv("data/reviews/reviews{}.csv".format(file_number), index=False)
            
            print("Scraped {} reviews".format(len(reviews)))
            print("Saved to data/reviews/reviews{}.csv".format(file_number))
            
            # reset the reviews dataframe
            reviews = pd.DataFrame()
        pbar1.update(1)

    # append the reviews to the csv file
    if len(reviews) > 0:
        files = os.listdir("data/reviews")
        file_number = len(files) + 1
        reviews.to_csv("data/reviews/reviews{}.csv".format(file_number), index=False)

    return None

In [None]:
scrap_reviews(movies, max_pages=None, reviews_threshold=20000)

## Casts

In [None]:
def scrap_movie_cast(movieID):
    url = f"https://www.imdb.com/title/{movieID}/fullcredits"

    data = {"imdb_id": [], "actor": [], "character": []}

    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    castlist = soup.find_all("table", class_="cast_list")[0].find_all("tr")

    castlist = castlist[1:]

    for cast in castlist:
        if not("odd" in cast.attrs.get("class", []) or "even" in cast.attrs.get("class", [])):
            break

        data["imdb_id"].append(movieID)
        data["actor"].append(cast.find_all("td")[1].find("a").get_text(strip=True))
        data["character"].append(cast.find("td", class_="character").get_text(strip=True))

    df = pd.DataFrame(data)

    return df



In [None]:
def scrap_casts(movies):
    casts = pd.DataFrame()

    for movieID in tqdm(movies["imdb_id"]):
        casts = pd.concat([casts, scrap_movie_cast(movieID)])
    return casts

In [None]:
casts = scrap_casts(movies)

In [None]:
casts.to_csv("data/casts.csv", index=False)

## Summaries

In [None]:
def scrap_movie_summary(imdb_id):
    url = f"https://www.imdb.com/title/{imdb_id}/plotsummary"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    synopsis = soup.find_all('section', class_="ipc-page-section ipc-page-section--base")[1].find(class_="ipc-metadata-list-item__content-container")
    if synopsis:
        synopsis = synopsis.get_text(strip=True)

    return synopsis

In [None]:
def scrap_summaries(movies):
    summaries = {
        "imdb_id": [],
        "summary": []
    }

    for imdb_id in tqdm(movies["imdb_id"]):
        summary = scrap_movie_summary(imdb_id)

        if summary:
            summaries["imdb_id"].append(imdb_id)
            summaries["summary"].append(summary)

    df = pd.DataFrame(summaries)

    return df

In [None]:
summaries = scrap_summaries(movies)

In [None]:
summaries.to_csv("data/summaries.csv", index=False)

## Critics Reviews

In [15]:
def scrap_metacritics_movie(metacriticID):

    url = f"https://www.metacritic.com/{metacriticID}/critic-reviews"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    response = requests.get(url, headers=headers)

    soup = BeautifulSoup(response.content, "html.parser")

    script_tag = soup.find('script', string=lambda t: t and 'window.__NUXT__' in t)

    # Define a regular expression pattern to match objects with specific attributes
    pattern = r"\{[^{}]*reviewedProduct:\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}[^{}]*\}"

    if not script_tag:
        return None
    
    # Find all matches of the pattern in the input text
    matches = re.findall(pattern, script_tag.text, re.DOTALL)
    
    def extract_info(data_string):
        
        # Regular expressions for score (next to metaScore), author, and publicationName
        metascore_pattern = r"criticScoreSummary:\{[^\}]*score:(\d+)"
        score_pattern = r'score:([a-zA-Z]|\d+),\s*metaScore'
        author_pattern = r'author:"([^"]+)"'
        publication_name_pattern = r'publicationName:"([^"]+)"'

        # Extracting score
        score_match = re.search(score_pattern, data_string)
        if score_match:
            score = score_match.group(1)
            if score.isalpha():
                score = 0
            else:
                score = int(score)
        else:
            score = None

        # Extracting author
        author_match = re.search(author_pattern, data_string)
        author = author_match.group(1) if author_match else None

        # Extracting publicationName
        publication_name_match = re.search(publication_name_pattern, data_string)
        publication_name = publication_name_match.group(1) if publication_name_match else None

        # Extracting metascore
        metascore_match = re.search(metascore_pattern, data_string)
        metascore = metascore_match.group(1) if metascore_match else None

        return score, author, publication_name, metascore
        
    reviews_data = {"publisher": [], "author": [], "metacritic_rating": [], "metascore": []}
    
    for review in matches:

        score, author, publisher, metascore = extract_info(review)

        reviews_data["publisher"].append(publisher)
        reviews_data["author"].append(author)
        reviews_data["metacritic_rating"].append(score)
        reviews_data["metascore"].append(metascore)

    reviews_df = pd.DataFrame(reviews_data)

    return reviews_df

In [18]:
def scrap_metacritics(movies_df, save_step=250):
    if not os.path.exists("data/scrap/metacritic_reviews.csv"):
        metacritic_reviews = pd.DataFrame(columns=["publisher", "author", "metacritic_rating", "metascore", "metacritic_id"])
    else:
        metacritic_reviews = pd.read_csv("data/scrap/metacritic_reviews.csv")

    already_scraped_movies_ids = metacritic_reviews["metacritic_id"].unique()

    # filter movies that have already been scraped
    
    movies_df = movies_df.loc[~movies_df["metacritic_id"].isin(already_scraped_movies_ids)]

    movies_df = movies_df.loc[movies_df["metacritic_id"].notna()]

    initial_len = len(metacritic_reviews)

    for i, row in tqdm(movies_df.iterrows(), total=len(movies_df)):
        metacritic_id = row["metacritic_id"]

        if not metacritic_id:
            continue

        reviews = scrap_metacritics_movie(metacritic_id)

        if reviews is None:
            continue

        reviews["metacritic_id"] = row["metacritic_id"]

        metacritic_reviews = pd.concat([metacritic_reviews, reviews], ignore_index=True)

        if i % save_step == 0:
            metacritic_reviews.to_csv("data/scrap/metacritic_reviews.csv", index=False)
            print("Saved {} new reviews".format(len(metacritic_reviews) - initial_len))
            initial_len = len(metacritic_reviews)

    metacritic_reviews.to_csv("data/scrap/metacritic_reviews.csv", index=False)


In [None]:
movies = pd.read_csv("data/scrap/top1000_IMDB_movies.csv")

scrap_metacritics(movies)

## Match IMDB, MetaCritic and Freebase IDS

In [None]:
def add_imdb_and_metacritics_ids(movies, batch_size=100):

    new_movies = movies.copy()

    # Initializing imdb_id and metacritic_id columns if they don't exist
    if 'imdb_id' not in new_movies.columns:
        new_movies['imdb_id'] = None
    if 'metacritic_id' not in new_movies.columns:
        new_movies['metacritic_id'] = None

    for i in tqdm(range(0, len(new_movies), batch_size)):
        batch = new_movies.iloc[i:i+batch_size]

        # drop columns imdb_id and metacritic_id
        batch = batch.drop(columns=["imdb_id", "metacritic_id"])

        imdb_ids, freebase_ids, metacritic_ids = get_IDS(freebase_ids=batch["freebase_id"].values)

        ids_mapping = pd.DataFrame({"freebase_id": freebase_ids, "imdb_id": imdb_ids, "metacritic_id": metacritic_ids})

        # if duplicates then set imdb and metacritic ids to None
        duplicates = ids_mapping["freebase_id"].duplicated(keep=False)
        ids_mapping.loc[duplicates, ["imdb_id", "metacritic_id"]] = None

        # removing duplicates
        ids_mapping = ids_mapping.drop_duplicates(subset=["freebase_id"])

        # Ensuring one-to-one correspondence
        if not ids_mapping["freebase_id"].is_unique:
            # print duplicates
            raise ValueError("Duplicate freebase_ids found in ids_mapping.")

        # Merging and updating the DataFrame
        batch_updated = batch.merge(ids_mapping, on="freebase_id", how="left")
        new_movies.iloc[i:i+batch_size] = batch_updated

    return new_movies

In [None]:
cmu_movies = pd.read_csv('data/movie.metadata.tsv', sep='\t', header=None, names=['wikipedia_id', 'freebase_id', 'title', 'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres'])

new_cmu_movies = add_imdb_and_metacritics_ids(cmu_movies, batch_size=250)

In [None]:
new_cmu_movies.to_csv("data/cmu_movies.csv", index=False, sep='\t')

### Scrap Metacritic for the new CMU dataset

In [16]:
new_cmu_movies = pd.read_csv("data/cmu_movies.csv", sep='\t')

In [22]:
scrap_metacritics(new_cmu_movies)

 18%|█▊        | 389/2203 [11:32<43:11,  1.43s/it]  

Saved 26 new reviews


 25%|██▌       | 557/2203 [14:53<35:27,  1.29s/it]

Saved 3238 new reviews


 35%|███▍      | 770/2203 [18:33<25:33,  1.07s/it]

Saved 4141 new reviews


 47%|████▋     | 1032/2203 [22:52<19:24,  1.01it/s]

Saved 4697 new reviews


 53%|█████▎    | 1158/2203 [24:50<20:35,  1.18s/it]

Saved 2151 new reviews


 61%|██████▏   | 1353/2203 [28:05<14:02,  1.01it/s]

Saved 3648 new reviews


 77%|███████▋  | 1703/2203 [33:40<08:56,  1.07s/it]

Saved 6404 new reviews


 87%|████████▋ | 1915/2203 [37:10<04:31,  1.06it/s]

Saved 3958 new reviews


 91%|█████████ | 2003/2203 [38:38<03:23,  1.02s/it]

Saved 1696 new reviews


 92%|█████████▏| 2032/2203 [39:07<03:06,  1.09s/it]

Saved 611 new reviews


100%|██████████| 2203/2203 [42:01<00:00,  1.14s/it]


In [None]:
scrap_metacritics(new_cmu_movies.loc[~(new_cmu_movies["imdb_id"].isna()) & ~(new_cmu_movies["metacritic_id"].isna())], save_step=250)