In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import os
import json
import re

# Awards

In [2]:
def scrap_awards_movie(metacriticID):
    url = 'https://www.imdb.com/title/{}/awards/'.format(metacriticID)    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    div = soup.find_all('div', attrs={'data-testid':"awards-signpost"})
    if len(div) == 0:
        return 0, 0
    awards = div[0].find_all('div', class_="ipc-signpost__text")[0].get_text(strip=True)
    # extract the number of awards
    # Structure of the text: "N wins & M nominations."
    wins = 0
    nominations = 0
    if len(awards.split('&')) == 2:
        wins = int(awards.split('&')[0].split()[0])
        nominations = int(awards.split('&')[1].split()[0])
    elif "wins" in awards:
        wins = int(awards.split()[0])
    elif "nominations" in awards:
        nominations = int(awards.split()[0])
    return wins, nominations




In [3]:
def scrap_awards(movies_df, save_step=250):
    if not os.path.exists("data/scrap/imdb_awards.csv"):
        imdb_awards = pd.DataFrame(columns=["freebase_id", "nominations", "wins"])
    else:
        imdb_awards = pd.read_csv("data/scrap/imdb_awards.csv")

    already_scraped_movies_ids = imdb_awards["freebase_id"].unique()

    # filter movies that have already been scraped
    movies_df = movies_df.loc[~movies_df["freebase_id"].isin(already_scraped_movies_ids)].reset_index(drop=True)
    
    initial_len = len(imdb_awards)

    for i, row in tqdm(movies_df.iterrows(), total=len(movies_df)):
        imdb_id = row["imdb_id"]
        if not imdb_id:
            print(row)
            continue

        wins, nominations = scrap_awards_movie(imdb_id)

        award = pd.DataFrame({"freebase_id": [row["freebase_id"]], "nominations": [nominations], "wins": [wins]})

        imdb_awards = pd.concat([imdb_awards, award], ignore_index=True)
        if i % save_step == 0:
            imdb_awards.to_csv("data/scrap/imdb_awards.csv", index=False)
            print("Saved {} new awards".format(len(imdb_awards) - initial_len))
            initial_len = len(imdb_awards)

    imdb_awards.to_csv("data/scrap/imdb_awards.csv", index=False)

In [4]:
# Splitting cmu_movies into 4 parts
movies = pd.read_csv("data/cmu_movies.csv", sep="\t")

movies1 = movies.iloc[:int(len(movies)/4)]
movies2 = movies.iloc[int(len(movies)/4):int(len(movies)/2)]
movies3 = movies.iloc[int(len(movies)/2):int(3*len(movies)/4)]
movies4 = movies.iloc[int(3*len(movies)/4):]

movies1.to_csv("data/cmu_movies1.csv", sep="\t", index=False)
movies2.to_csv("data/cmu_movies2.csv", sep="\t", index=False)
movies3.to_csv("data/cmu_movies3.csv", sep="\t", index=False)
movies4.to_csv("data/cmu_movies4.csv", sep="\t", index=False)

In [5]:
movies = pd.read_csv("data/cmu_movies4.csv", sep="\t")
movies = movies.loc[(movies['freebase_id'].notnull()) & (movies['imdb_id'].notnull()) & (movies['metacritic_id'].notnull())] 
scrap_awards(movies, save_step=25)

  0%|          | 0/2164 [00:00<?, ?it/s]

Saved 1 new awards


KeyboardInterrupt: 