In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import json
import re
import urllib

## Match IMDB, MetaCritic and Freebase IDS

In [None]:
def get_IDS(imdb_ids=[], freebase_ids=[]):
    '''
        Get the imdb_id, freebase_id and metacritic_id from the wikidata database

        Parameters
        ----------
        imdb_ids : str
            The imdb id of the movie
        freebase_id : str
            The freebase id of the movie

        Returns
        -------
        imdb_id : str
            The imdb id of the movie
        freebase_id : str
            The freebase id of the movie
        metacritic_id : str
            The metacritic id of the movie
    '''
    if len(imdb_ids) > 0:
        imdb_ids_string = " ".join(f'"{id_}"' for id_ in imdb_ids)

        query = f"""
            SELECT ?item ?freebaseId ?metacriticId ?imdbId WHERE {{
            VALUES ?imdbId {{ {imdb_ids_string} }}
            ?item wdt:P345 ?imdbId .
            OPTIONAL {{ ?item wdt:P646 ?freebaseId }}
            OPTIONAL {{ ?item wdt:P1712 ?metacriticId }}
            }}
        """

    elif len(freebase_ids) > 0:
        freebase_ids_string = " ".join(f'"{id_}"' for id_ in freebase_ids)

        query = f"""
            SELECT ?item ?freebaseId ?metacriticId ?imdbId WHERE {{
            VALUES ?freebaseId {{ {freebase_ids_string} }}
            ?item wdt:P646 ?freebaseId .
            OPTIONAL {{ ?item wdt:P1712 ?metacriticId }}
            OPTIONAL {{ ?item wdt:P345 ?imdbId }}
            }}
        """

    else:
        return None, None, None
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    encoded_query = urllib.parse.quote(query)
    query_url = f"https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query={encoded_query}"

    response = requests.get(query_url, headers=headers)
    
    if response.status_code != 200:
        return None, None, None
    
    data = response.json()

    if len(data["results"]["bindings"]) == 0:
        return None, None, None
    
    data = data["results"]["bindings"]
    
    imdb_ids = []
    freebase_ids = []
    metacritic_ids = []

    for item in data:
        if "freebaseId" in item:
            freebase_ids.append(item["freebaseId"]["value"])
        else:
            freebase_ids.append(None)
        
        if "metacriticId" in item:
            metacritic_ids.append(item["metacriticId"]["value"])
        else:
            metacritic_ids.append(None)

        if "imdbId" in item:
            imdb_ids.append(item["imdbId"]["value"])
        else:
            imdb_ids.append(None)

    return imdb_ids, freebase_ids, metacritic_ids


In [None]:
def add_imdb_and_metacritics_ids(movies, batch_size=100):

    new_movies = movies.copy()

    # Initializing imdb_id and metacritic_id columns if they don't exist
    if 'imdb_id' not in new_movies.columns:
        new_movies['imdb_id'] = None
    if 'metacritic_id' not in new_movies.columns:
        new_movies['metacritic_id'] = None

    for i in tqdm(range(0, len(new_movies), batch_size)):
        batch = new_movies.iloc[i:i+batch_size]

        # drop columns imdb_id and metacritic_id
        batch = batch.drop(columns=["imdb_id", "metacritic_id"])

        imdb_ids, freebase_ids, metacritic_ids = get_IDS(freebase_ids=batch["freebase_id"].values)

        ids_mapping = pd.DataFrame({"freebase_id": freebase_ids, "imdb_id": imdb_ids, "metacritic_id": metacritic_ids})

        # if duplicates then set imdb and metacritic ids to None
        duplicates = ids_mapping["freebase_id"].duplicated(keep=False)
        ids_mapping.loc[duplicates, ["imdb_id", "metacritic_id"]] = None

        # removing duplicates
        ids_mapping = ids_mapping.drop_duplicates(subset=["freebase_id"])

        # Ensuring one-to-one correspondence
        if not ids_mapping["freebase_id"].is_unique:
            # print duplicates
            raise ValueError("Duplicate freebase_ids found in ids_mapping.")

        # Merging and updating the DataFrame
        batch_updated = batch.merge(ids_mapping, on="freebase_id", how="left")
        new_movies.iloc[i:i+batch_size] = batch_updated

    return new_movies

In [None]:
movies = pd.read_csv('data/preprocessed/movie.metadata.preprocessed.tsv', sep='\t')

cmu_movies = add_imdb_and_metacritics_ids(movies, batch_size=250)

In [None]:
cmu_movies.to_csv('data/processed/cmu_movies.csv', index=False)

## Metacritic Critics Reviews

In [None]:
def scrap_metacritics_movie(metacriticID):

    url = f"https://www.metacritic.com/{metacriticID}/critic-reviews"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    response = requests.get(url, headers=headers)

    soup = BeautifulSoup(response.content, "html.parser")

    script_tag = soup.find('script', string=lambda t: t and 'window.__NUXT__' in t)

    # Define a regular expression pattern to match objects with specific attributes
    pattern = r"\{[^{}]*reviewedProduct:\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}[^{}]*\}"

    if not script_tag:
        return None
    
    # Find all matches of the pattern in the input text
    matches = re.findall(pattern, script_tag.text, re.DOTALL)
    
    def extract_info(data_string):
        
        # Regular expressions for score (next to metaScore), author, and publicationName
        metascore_pattern = r"criticScoreSummary:\{[^\}]*score:(\d+)"
        score_pattern = r'score:([a-zA-Z]|\d+),\s*metaScore'
        author_pattern = r'author:"([^"]+)"'
        publication_name_pattern = r'publicationName:"([^"]+)"'

        # Extracting score
        score_match = re.search(score_pattern, data_string)
        if score_match:
            score = score_match.group(1)
            if score.isalpha():
                score = 0
            else:
                score = int(score)
        else:
            score = None

        # Extracting author
        author_match = re.search(author_pattern, data_string)
        author = author_match.group(1) if author_match else None

        # Extracting publicationName
        publication_name_match = re.search(publication_name_pattern, data_string)
        publication_name = publication_name_match.group(1) if publication_name_match else None

        # Extracting metascore
        metascore_match = re.search(metascore_pattern, data_string)
        metascore = metascore_match.group(1) if metascore_match else None

        return score, author, publication_name, metascore
        
    reviews_data = {"publisher": [], "author": [], "metacritic_rating": [], "metascore": []}
    
    for review in matches:

        score, author, publisher, metascore = extract_info(review)

        reviews_data["publisher"].append(publisher)
        reviews_data["author"].append(author)
        reviews_data["metacritic_rating"].append(score)
        reviews_data["metascore"].append(metascore)

    reviews_df = pd.DataFrame(reviews_data)

    return reviews_df

In [None]:
def scrap_metacritics(movies_df, save_step=250, filepath="data/external/metacritic_reviews.csv"):
    if not os.path.exists(filepath):
        metacritic_reviews = pd.DataFrame(columns=["publisher", "author", "metacritic_rating", "metascore", "metacritic_id"])
    else:
        metacritic_reviews = pd.read_csv(filepath)

    already_scraped_movies_ids = metacritic_reviews["metacritic_id"].unique()

    # filter movies that have already been scraped
    
    movies_df = movies_df.loc[~movies_df["metacritic_id"].isin(already_scraped_movies_ids)]

    movies_df = movies_df.loc[movies_df["metacritic_id"].notna()]

    initial_len = len(metacritic_reviews)

    for i, row in tqdm(movies_df.iterrows(), total=len(movies_df)):
        metacritic_id = row["metacritic_id"]

        if not metacritic_id:
            continue

        reviews = scrap_metacritics_movie(metacritic_id)

        if reviews is None:
            continue

        reviews["metacritic_id"] = row["metacritic_id"]

        metacritic_reviews = pd.concat([metacritic_reviews, reviews], ignore_index=True)

        if i % save_step == 0:
            metacritic_reviews.to_csv(filepath, index=False)
            print("Saved {} new reviews".format(len(metacritic_reviews) - initial_len))
            initial_len = len(metacritic_reviews)

    metacritic_reviews.to_csv(filepath, index=False)


In [89]:
movies = pd.read_csv("data/processed/cmu_movies.csv")

scrap_metacritics(movies, save_step=250, filepath="data/external/metacritic_reviews.csv")

  0%|          | 1/8529 [00:00<39:18,  3.62it/s]

Saved 26 new reviews


  0%|          | 30/8529 [00:31<2:28:14,  1.05s/it]


KeyboardInterrupt: 

## Awards - Wikidata

In [None]:
def get_awards_nominations_batch(freebase_ids):
    formatted_ids = ' '.join(f'"{id_}"' for id_ in freebase_ids)
    sparql_query_awards = f"""
    SELECT ?item ?movieLabel ?movieFreebaseID ?awardLabel WHERE {{
    VALUES ?movieFreebaseID {{ {formatted_ids} }}
    ?item wdt:P646 ?movieFreebaseID .
    OPTIONAL {{ 
      ?item wdt:P166 ?award .
      ?award rdfs:label ?awardLabel .
      FILTER(LANG(?awardLabel) = "en")
    }}
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
  }}
    """
    encoded_query_awards = urllib.parse.quote(sparql_query_awards)

    sparql_query_nominations = f"""
    SELECT ?item ?movieLabel ?movieFreebaseID ?nominationLabel WHERE {{
    VALUES ?movieFreebaseID {{ {formatted_ids} }}
    ?item wdt:P646 ?movieFreebaseID .
    OPTIONAL {{ 
      ?item wdt:P1411 ?nomination .
      ?nomination rdfs:label ?nominationLabel .
      FILTER(LANG(?nominationLabel) = "en")
    }}
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
  }}
    """
    encoded_query_nominations = urllib.parse.quote(sparql_query_nominations)

    url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query={}"
    url_awards = url.format(encoded_query_awards)
    url_nominations = url.format(encoded_query_nominations)
    
    # Fetch awards
    response_awards = requests.get(url_awards)
    if response_awards.status_code == 200:
      data_awards = response_awards.json()

      results_awards = [{
          'freebase_id': item['movieFreebaseID']['value'],
          'type': 'award',
          'name': item['awardLabel']['value']
      } for item in data_awards['results']['bindings'] if 'awardLabel' in item]
    else:
      results_awards = []

    # Fetch nominations
    response_nominations = requests.get(url_nominations)

    if response_nominations.status_code == 200:
      data_nominations = response_nominations.json()
      results_nominations = [{
          'freebase_id': item['movieFreebaseID']['value'],
          'type': 'nomination',
          'name': item['nominationLabel']['value']
      } for item in data_nominations['results']['bindings'] if 'nominationLabel' in item]
    else:
      results_nominations = []

    # Combine results and create DataFrame
    combined_results = results_awards + results_nominations
    return pd.DataFrame(combined_results)

In [None]:
def get_awards_nominations(movies, batch_size=250):
    results = []
    for i in tqdm(range(0, len(movies), batch_size), total=len(movies)//batch_size):
        batch = movies[i:i+batch_size]
        freebase_ids = batch['freebase_id'].tolist()
        results.append(get_awards_nominations_batch(freebase_ids))
    return pd.concat(results)

## Awards - IMDb


In [None]:
def scrap_awards_movie(metacriticID):
    url = 'https://www.imdb.com/title/{}/awards/'.format(metacriticID)    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    div = soup.find_all('div', attrs={'data-testid':"awards-signpost"})
    if len(div) == 0:
        return 0, 0
    awards = div[0].find_all('div', class_="ipc-signpost__text")[0].get_text(strip=True)
    # extract the number of awards
    # Structure of the text: "N wins & M nominations."
    wins = 0
    nominations = 0
    if len(awards.split('&')) == 2:
        wins = int(awards.split('&')[0].split()[0])
        nominations = int(awards.split('&')[1].split()[0])
    elif "wins" in awards:
        wins = int(awards.split()[0])
    elif "nominations" in awards:
        nominations = int(awards.split()[0])
    return wins, nominations

In [5]:
def scrap_awards(movies_df, save_step=250):
    if not os.path.exists("data/external/imdb_awards.csv"):
        imdb_awards = pd.DataFrame(columns=["freebase_id", "nominations", "wins"])
    else:
        imdb_awards = pd.read_csv("data/external/imdb_awards.csv")

    already_scraped_movies_ids = imdb_awards["freebase_id"].unique()

    # filter movies that have already been scraped
    movies_df = movies_df.loc[~movies_df["freebase_id"].isin(already_scraped_movies_ids)].reset_index(drop=True)
    
    initial_len = len(imdb_awards)

    for i, row in tqdm(movies_df.iterrows(), total=len(movies_df)):
        imdb_id = row["imdb_id"]
        if not imdb_id:
            print(row)
            continue

        wins, nominations = scrap_awards_movie(imdb_id)

        award = pd.DataFrame({"freebase_id": [row["freebase_id"]], "nominations": [nominations], "wins": [wins]})

        imdb_awards = pd.concat([imdb_awards, award], ignore_index=True)
        if i % save_step == 0:
            imdb_awards.to_csv("data/external/imdb_awards.csv", index=False)
            print("Saved {} new awards".format(len(imdb_awards) - initial_len))
            initial_len = len(imdb_awards)

    imdb_awards.to_csv("data/external/imdb_awards.csv", index=False)

In [None]:
movies = pd.read_csv("data/processed/cmu_movies.csv", sep="\t")
movies = movies.loc[(movies['freebase_id'].notnull()) & (movies['imdb_id'].notnull()) & (movies['metacritic_id'].notnull())] 
scrap_awards(movies, save_step=25)