In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import os
import json
import re

In [2]:
def get_IDS(imdb_id=None, freebase_id=None):
    '''
        Get the imdb_id, freebase_id and metacritic_id from the wikidata database

        Parameters
        ----------
        imdb_id : str
            The imdb id of the movie
        freebase_id : str
            The freebase id of the movie

        Returns
        -------
        imdb_id : str
            The imdb id of the movie
        freebase_id : str
            The freebase id of the movie
        metacritic_id : str
            The metacritic id of the movie
    '''
    if imdb_id:
        query_url = f"https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT%20%3Fitem%20%3FfreebaseId%20%3FmetacriticId%20%3FimdbId%20WHERE%20%7B%0A%20%20%20%20%3Fitem%20wdt%3AP345%20%22{imdb_id}%22%20.%0A%20%20%20%20OPTIONAL%20%7B%20%3Fitem%20wdt%3AP646%20%3FfreebaseId%20%7D%0A%20%20%20%20OPTIONAL%20%7B%20%3Fitem%20wdt%3AP1712%20%3FmetacriticId%20%7D%0A%20%20%20%20OPTIONAL%20%7B%20%3Fitem%20wdt%3AP345%20%3FimdbId%20%7D%0A%7D"
    elif freebase_id:
        query_url = f"https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT%20%3Fitem%20%3FfreebaseId%20%3FmetacriticId%20%3FimdbId%20WHERE%20%7B%0A%20%20%20%20%3Fitem%20wdt%3AP646%20%22{freebase_id}%22%20.%0A%20%20%20%20OPTIONAL%20%7B%20%3Fitem%20wdt%3AP1712%20%3FmetacriticId%20%7D%0A%20%20%20%20OPTIONAL%20%7B%20%3Fitem%20wdt%3AP345%20%3FimdbId%20%7D%0A%7D"
    else:
        return None, None, None
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    response = requests.get(query_url, headers=headers)

    if response.status_code != 200:
        return None, None, None
    
    data = response.json()
    

    if len(data["results"]["bindings"]) == 0:
        return None
    
    data = data["results"]["bindings"][0]
    
    if "freebaseId" in data:
        freebase_id = data["freebaseId"]["value"]
    else:
        freebase_id = None
    
    if "metacriticId" in data:
        metacritic_id = data["metacriticId"]["value"]
    else:
        metacritic_id = None

    if "imdbId" in data:
        imdb_id = data["imdbId"]["value"]
    else:
        imdb_id = None

    return imdb_id, freebase_id, metacritic_id


## Movies

In [36]:
def scrap_movies():
    url = "https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&start={start}"

    data = {
        "imdb_id": [], 
        "title": [],
        "year": [],
        "duration": [],
        "genres": [],
        "number_of_ratings": [],
        "rating": [],
        "director": [],
        "description": [],
        "freebase_id": [],
        "metacritic_id": [],
        "metascore": [],
    }

    pbar = tqdm(total=1000)

    while True:
        start = len(data["title"]) + 1

        response = requests.get(url.format(start=start))

        soup = BeautifulSoup(response.content, "html.parser")

        movies = soup.find_all(class_="lister-item-content")

        if not movies or len(movies) == 0:
            break

        for movie in movies:
            pbar.update(1)
            imdb_id = movie.find("a")["href"].split("/")[2]
            data["imdb_id"].append(imdb_id)
            data["title"].append(movie.find("a").get_text())

            year_text = movie.find(class_="lister-item-year").get_text().replace("(", "").replace(")", "")
            year_match = re.search(r'\d{4}', year_text)

            if year_match:
                year = year_match.group()
            else:
                year = None
        
            data["year"].append(year)
            data["duration"].append(movie.find(class_="runtime").get_text())
            data["genres"].append(movie.find(class_="genre").get_text().strip())
            data["number_of_ratings"].append(
                movie.find(class_="sort-num_votes-visible").find_all("span")[1].get_text()
            )
            data["rating"].append(movie.find(class_="ratings-imdb-rating").get_text().strip())
            data["director"].append(movie.find_all("p")[2].find("a").get_text())
            data["description"].append(movie.find_all("p", class_="text-muted")[-1].get_text().strip())
            metascore = movie.find("span", class_="metascore")
            if metascore:
                data["metascore"].append(int(metascore.get_text().strip()))
            else:
                data["metascore"].append(None)

            _, freebase_id, metacritic_id = get_IDS(imdb_id=imdb_id)

            data["freebase_id"].append(freebase_id)
            data["metacritic_id"].append(metacritic_id)

    df = pd.DataFrame(data)

    pbar.close()

    df["rating"] = df["rating"].astype(float)
    df['number_of_ratings'] = df['number_of_ratings'].str.replace(',', '').astype(int)

    return df

In [39]:
movies = scrap_movies()

  0%|          | 0/1000 [00:00<?, ?it/s]

In [49]:
movies.to_csv("data/scrap/top1000_IMDB_movies.csv", index=False)

  values = values.astype(str)


## Reviews

In [None]:
def scrap_movie_reviews(movieID, max_pages=None):
    url = (
        "https://www.imdb.com/title/{movieID}/reviews/_ajax?paginationKey={}"
    )
    key = ""
    data = {"imdb_id": [], "review": [], "rating": [], "date": [], "user": []}

    response = requests.get("https://www.imdb.com/title/{movieID}/reviews".format(movieID = movieID))
    soup = BeautifulSoup(response.content, "html.parser")

    total_reviews = int(soup.find(class_="lister").find(class_="header").find("span").get_text().split()[0].replace(',', ''))
    
    pbar2 = tqdm(total=total_reviews, position=1, leave=True)

    i = 0
    while True:
        i += 1
        if max_pages and i > max_pages:
            break
        response = requests.get(url.format(key, movieID = movieID))
        soup = BeautifulSoup(response.content, "html.parser")
        
        for review, rating, date, user in zip(
            soup.find_all(class_="text show-more__control"), soup.find_all(class_="rating-other-user-rating"), soup.find_all(class_="review-date"), soup.find_all(class_="display-name-link")
        ):
            data["imdb_id"].append(movieID)
            data["review"].append(review.get_text())
            data["rating"].append(rating.find("span").get_text(strip=True))
            data["date"].append(date.get_text(strip=True))
            data["user"].append(user.find("a")["href"].split("/")[2])

        # Find the pagination key
        pagination_key = soup.find("div", class_="load-more-data")
        if not pagination_key:
            break
        
        # Update the `key` variable in-order to scrape more reviews
        key = pagination_key["data-key"]
        
        pbar2.update(25)

    df = pd.DataFrame(data)
    df['rating'] = df['rating'].astype(int)
    
    return df

In [None]:
def scrap_reviews(movies, max_pages=None, reviews_threshold=10000):
    reviews = pd.DataFrame()
    
    files = os.listdir("data/reviews")

    already_scraped_movies = pd.Series()

    if len(files) > 0:
        for file in files:
            file_reviews = pd.read_csv("data/reviews/{}".format(file))
            already_scraped_movies = pd.concat([already_scraped_movies, file_reviews['movie']])

    already_scraped_movies = already_scraped_movies.unique()

    print("Already scraped {}/{} movies".format(len(already_scraped_movies), len(movies)))

    movies = movies[~movies["id"].isin(already_scraped_movies)]

    if len(movies) == 0:
        print("All movies have already been scraped")
        return None

    pbar1 = tqdm(total=len(movies), position=0, leave=True)
    

    for movieID in movies["imdb_id"]:

        print('Scraping reviews for movie {}'.format(movies[movies['imdb_id'] == movieID]['title'].values[0]))

        reviews = pd.concat([reviews, scrap_movie_reviews(movieID, max_pages=max_pages)])

        if len(reviews) > reviews_threshold:
            # create a file "review" + i + ".csv" with i the number of files in the reviews folder
            files = os.listdir("data/reviews")
            file_number = len(files) + 1
            reviews.to_csv("data/reviews/reviews{}.csv".format(file_number), index=False)
            
            print("Scraped {} reviews".format(len(reviews)))
            print("Saved to data/reviews/reviews{}.csv".format(file_number))
            
            # reset the reviews dataframe
            reviews = pd.DataFrame()
        pbar1.update(1)

    # append the reviews to the csv file
    if len(reviews) > 0:
        files = os.listdir("data/reviews")
        file_number = len(files) + 1
        reviews.to_csv("data/reviews/reviews{}.csv".format(file_number), index=False)

    return None

In [None]:
scrap_reviews(movies, max_pages=None, reviews_threshold=20000)

## Casts

In [None]:
def scrap_movie_cast(movieID):
    url = f"https://www.imdb.com/title/{movieID}/fullcredits"

    data = {"imdb_id": [], "actor": [], "character": []}

    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    castlist = soup.find_all("table", class_="cast_list")[0].find_all("tr")

    castlist = castlist[1:]

    for cast in castlist:
        if not("odd" in cast.attrs.get("class", []) or "even" in cast.attrs.get("class", [])):
            break

        data["imdb_id"].append(movieID)
        data["actor"].append(cast.find_all("td")[1].find("a").get_text(strip=True))
        data["character"].append(cast.find("td", class_="character").get_text(strip=True))

    df = pd.DataFrame(data)

    return df



In [None]:
def scrap_casts(movies):
    casts = pd.DataFrame()

    for movieID in tqdm(movies["imdb_id"]):
        casts = pd.concat([casts, scrap_movie_cast(movieID)])
    return casts

In [None]:
casts = scrap_casts(movies)

In [None]:
casts.to_csv("data/casts.csv", index=False)

## Summaries

In [None]:
def scrap_movie_summary(imdb_id):
    url = f"https://www.imdb.com/title/{imdb_id}/plotsummary"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    synopsis = soup.find_all('section', class_="ipc-page-section ipc-page-section--base")[1].find(class_="ipc-metadata-list-item__content-container")
    if synopsis:
        synopsis = synopsis.get_text(strip=True)

    return synopsis

In [3]:
def scrap_summaries(movies):
    summaries = {
        "imdb_id": [],
        "summary": []
    }

    for imdb_id in tqdm(movies["imdb_id"]):
        summary = scrap_movie_summary(imdb_id)

        if summary:
            summaries["imdb_id"].append(imdb_id)
            summaries["summary"].append(summary)

    df = pd.DataFrame(summaries)

    return df

In [None]:
summaries = scrap_summaries(movies)

In [None]:
summaries.to_csv("data/summaries.csv", index=False)

## Critics Reviews

In [45]:
def scrap_metacritics_movie(metacriticID):

    url = f"https://www.metacritic.com/{metacriticID}/critic-reviews"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    response = requests.get(url, headers=headers)

    soup = BeautifulSoup(response.content, "html.parser")

    script_tag = soup.find('script', text=lambda t: t and 'window.__NUXT__' in t)

    # Define a regular expression pattern to match objects with specific attributes
    pattern = r"\{[^{}]*reviewedProduct:\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}[^{}]*\}"

    # Find all matches of the pattern in the input text
    matches = re.findall(pattern, script_tag.text, re.DOTALL)
    
    def extract_info(data_string):
        
        # Regular expressions for score (next to metaScore), author, and publicationName
        score_pattern = r'score:([a-zA-Z]|\d+),\s*metaScore'
        author_pattern = r'author:"([^"]+)"'
        publication_name_pattern = r'publicationName:"([^"]+)"'

        # Extracting score
        score_match = re.search(score_pattern, data_string)
        if score_match:
            score = score_match.group(1)
            if score.isalpha():
                score = 0
            else:
                score = int(score)
        else:
            score = None

        # Extracting author
        author_match = re.search(author_pattern, data_string)
        author = author_match.group(1) if author_match else None

        # Extracting publicationName
        publication_name_match = re.search(publication_name_pattern, data_string)
        publication_name = publication_name_match.group(1) if publication_name_match else None

        return score, author, publication_name
        
    reviews_data = {"publisher": [], "author": [], "rating": []}
    
    for review in matches:

        score, author, publisher = extract_info(review)

        reviews_data["publisher"].append(publisher)
        reviews_data["author"].append(author)
        reviews_data["rating"].append(score)

    reviews_df = pd.DataFrame(reviews_data)

    return reviews_df

In [46]:
def scrap_metacritics(movies_df):
    if not os.path.exists("data/scrap/metacritic_reviews.csv"):
        metacritic_reviews = pd.DataFrame(columns=["publisher", "author", "rating", "imdb_id", "metacritic_id"])
    else:
        metacritic_reviews = pd.read_csv("data/scrap/metacritic_reviews.csv")

    already_scraped_movies_ids = metacritic_reviews["imdb_id"].unique()

    # filter movies that have already been scraped
    
    movies_df = movies_df.loc[~movies_df["imdb_id"].isin(already_scraped_movies_ids)]

    for _, row in tqdm(movies_df.iterrows(), total=len(movies_df)):
        metacritic_id = row["metacritic_id"]

        if not metacritic_id:
            continue

        reviews = scrap_metacritics_movie(metacritic_id)

        reviews["imdb_id"] = row["imdb_id"]
        reviews["metacritic_id"] = row["metacritic_id"]

        metacritic_reviews = pd.concat([metacritic_reviews, reviews], ignore_index=True)

    metacritic_reviews.to_csv("data/scrap/metacritic_reviews.csv", index=False)


In [37]:
movies = pd.read_csv("data/scrap/top1000_IMDB_movies.csv")

#scrap_metacritics(movies)

In [38]:
movies.head(    )

Unnamed: 0,imdb_id,title,year,duration,genres,number_of_ratings,rating,director,description,freebase_id,metacritic_id,metascore
0,tt0111161,The Shawshank Redemption,1994,142 min,Drama,2818031,9.3,Frank Darabont,"Over the course of several years, two convicts...",/m/07jnt,movie/the-shawshank-redemption,82.0
1,tt0068646,The Godfather,1972,175 min,"Crime, Drama",1963969,9.2,Francis Ford Coppola,"Don Vito Corleone, head of a mafia family, dec...",/m/07g1sm,movie/the-godfather,100.0
2,tt0468569,The Dark Knight,2008,152 min,"Action, Crime, Drama",2799537,9.0,Christopher Nolan,When the menace known as the Joker wreaks havo...,/m/0btpm6,movie/the-dark-knight,84.0
3,tt0108052,Schindler's List,1993,195 min,"Biography, Drama, History",1416535,9.0,Steven Spielberg,"In German-occupied Poland during World War II,...",/m/0hfzr,movie/schindlers-list,95.0
4,tt0167260,The Lord of the Rings: The Return of the King,2003,201 min,"Action, Adventure, Drama",1929790,9.0,Peter Jackson,Gandalf and Aragorn lead the World of Men agai...,/m/017jd9,movie/the-lord-of-the-rings-the-return-of-the-...,94.0


# Awards

In [75]:
def scrap_awards_movie(metacriticID):
    url = 'https://www.imdb.com/title/{}/awards/'.format(metacriticID)    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    div = soup.find_all('div', attrs={'data-testid':"awards-signpost"})
    if len(div) == 0:
        return 0, 0
    awards = div[0].find_all('div', class_="ipc-signpost__text")[0].get_text(strip=True)
    # extract the number of awards
    # Structure of the text: "N wins & M nominations."
    wins = 0
    nominations = 0
    if len(awards.split('&')) == 2:
        wins = int(awards.split('&')[0].split()[0])
        nominations = int(awards.split('&')[1].split()[0])
    elif "wins" in awards:
        wins = int(awards.split()[0])
    elif "nominations" in awards:
        nominations = int(awards.split()[0])
    return wins, nominations




In [82]:
def scrap_awards(movies_df, save_step=250):
    if not os.path.exists("data/scrap/imdb_awards.csv"):
        imdb_awards = pd.DataFrame(columns=["freebase_id", "nominations", "wins"])
    else:
        imdb_awards = pd.read_csv("data/scrap/imdb_awards.csv")

    already_scraped_movies_ids = imdb_awards["freebase_id"].unique()

    # filter movies that have already been scraped
    movies_df = movies_df.loc[~movies_df["freebase_id"].isin(already_scraped_movies_ids)]
    
    initial_len = len(imdb_awards)

    for i, row in tqdm(movies_df.iterrows(), total=len(movies_df)):
        imdb_id = row["imdb_id"]

        if not imdb_id:
            continue

        wins, nominations = scrap_awards_movie(imdb_id)

        award = pd.DataFrame({"freebase_id": [row["freebase_id"]], "nominations": [nominations], "wins": [wins]})

        imdb_awards = pd.concat([imdb_awards, award], ignore_index=True)
        if i % save_step == 0:
            imdb_awards.to_csv("data/scrap/imdb_awards.csv", index=False)
            print("Saved {} new awards".format(len(imdb_awards) - initial_len))
            initial_len = len(imdb_awards)

    imdb_awards.to_csv("data/scrap/imdb_awards.csv", index=False)

In [84]:
movies = pd.read_csv("data/cmu_movies1.csv", sep="\t")
movies = movies.loc[(movies['freebase_id'].notnull()) & (movies['imdb_id'].notnull()) & (movies['metacritic_id'].notnull())] 
scrap_awards(movies, save_step=10)

  0%|          | 0/4288 [00:00<?, ?it/s]

Error: could not extract the number of awards
Saved 13 new awards
Error: could not extract the number of awards
Error: could not extract the number of awards
Saved 22 new awards
Saved 1 new awards
Saved 3 new awards
Error: could not extract the number of awards
Saved 6 new awards
Error: could not extract the number of awards
Saved 4 new awards
Error: could not extract the number of awards


KeyboardInterrupt: 

In [66]:
# Splitting cmu_movies into 4 parts
movies = pd.read_csv("data/cmu_movies.csv", sep="\t")

movies1 = movies.iloc[:int(len(movies)/4)]
movies2 = movies.iloc[int(len(movies)/4):int(len(movies)/2)]
movies3 = movies.iloc[int(len(movies)/2):int(3*len(movies)/4)]
movies4 = movies.iloc[int(3*len(movies)/4):]

movies1.to_csv("data/cmu_movies1.csv", sep="\t", index=False)
movies2.to_csv("data/cmu_movies2.csv", sep="\t", index=False)
movies3.to_csv("data/cmu_movies3.csv", sep="\t", index=False)
movies4.to_csv("data/cmu_movies4.csv", sep="\t", index=False)


In [60]:
# Get the metacritic id of the movie "The Godfather"
metacriticID = 'tt0073864'

scrap_awards_movie(metacriticID)


KeyboardInterrupt: 