## Data scraping: completing the dataset with additional information

#### Imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator

import requests
import cpi
import re

import ast

In [1]:
import dataframes as RAW
import cleaning as CLEAN

#### Functions to extract ratings, revenues and awards (wins/nominations)

In order to extract the IMDb ratings, box-office revenues and awards won of our movies, we will use the API called OMDb. The following functions have the role of requesting the data of the movie we give as an argument:

In [3]:
def get_movie_rating(title):
    omdb_api_key = "46e03b67"
    omdb_base_url = "http://www.omdbapi.com/"

    params = {
        "apikey": omdb_api_key,
        "t": title,
    }

    response = requests.get(omdb_base_url, params=params)
    data = response.json()

    if "Error" in data:
        print(f"Error: {data['Error']}")
        return None

    rating = data.get("imdbRating")
    if rating:
        return rating
    else:
        print(f"Rating not found for {title}")
        return None

# Example usage
movie_title = "Taxi Driver"
rating = get_movie_rating(movie_title)

if rating:
    print(f"The IMDb rating for {movie_title} is {rating}")

The IMDb rating for Taxi Driver is 8.2


Since the revenue scraping yields a string, we had to create an additional function that takes a string such as `"$80,000,000"` and returns its actual number value, in this case `80000000.0`:

In [4]:
def money_string_to_number(money_string):
    # Define a set of valid characters (digits, dot for decimal point, comma for thousands separator, and dollar sign)
    valid_characters = set("0123456789.,$")

    # Remove invalid characters from the money string
    cleaned_money_string = ''.join(char for char in money_string if char in valid_characters)

    # Replace commas with an empty string and remove any leading dollar signs
    cleaned_money_string = cleaned_money_string.replace(',', '').lstrip('$')

    # Convert the cleaned string to a float
    try:
        money_number = float(cleaned_money_string)
        return money_number
    except ValueError:
        print(f"Error: Unable to convert '{money_string}' to a number.")
        return None

def get_movie_revenue(title):
    omdb_api_key = "46e03b67"
    omdb_base_url = "http://www.omdbapi.com/"

    params = {
        "apikey": omdb_api_key,
        "t": title,
    }

    response = requests.get(omdb_base_url, params=params)
    data = response.json()

    if "Error" in data:
        print(f"Error: {data['Error']}")
        return None

    revenue = data.get("BoxOffice")
    if revenue:
        return money_string_to_number(revenue)
    else:
        print(f"Revenue not found for {title}")
        return None

# Example usage
movie_title = "Taxi Driver"
revenue = get_movie_revenue(movie_title)

if revenue:
    print(f"The box-office revenue for {movie_title} is ${revenue}")

The box-office revenue for Taxi Driver is $28262574.0


When scraping for awards, we had to find a way to extract the number of wins and nominations from the sentence that was returned by the API. This is why we created an additional function which does exactly that:

In [5]:
def get_movie_awards(title):
    base_url = "http://www.omdbapi.com/"
    params = {'apikey': "46e03b67", 't': title}

    # Make the request to the OMDb API
    response = requests.get(base_url, params=params)
    movie_data = response.json()

    # Check if the request was successful
    if response.status_code == 200 and movie_data['Response'] == 'True':
        return movie_data['Awards']
    else:
        print(f"Error: {movie_data['Error']}")
        return None
    
def extract_wins_nominations(awards_info):
    # Use regular expressions to extract wins and nominations
    wins_match = re.search(r'(\d+) win', awards_info, re.IGNORECASE)
    nominations_match = re.search(r'(\d+) nomination', awards_info, re.IGNORECASE)

    # Extract the numbers from the matches or return None if not found
    wins = int(wins_match.group(1)) if wins_match else None
    nominations = int(nominations_match.group(1)) if nominations_match else None

    return wins, nominations

movie_title = 'Taxi Driver'

# Get awards information
awards_info = get_movie_awards(movie_title)
wins, nominations = extract_wins_nominations(awards_info)

if awards_info is not None:
    print(f"Awards for {movie_title}: {awards_info}")
else:
    print(f"Failed to retrieve awards information.")

print("Wins:", wins, "| Nominations:", nominations)

Awards for Taxi Driver: Nominated for 4 Oscars. 22 wins & 21 nominations total
Wins: 22 | Nominations: 21


#### Actual scraping using the OMDb API

Now, the goal is to use all functions defined above to go through all of the movies inside of the dataset, request their revenues, ratings and number of awards, and add this data to the dataframe.

In [6]:
RAW.movie_data.shape

(39372, 9)

In [7]:
movie_titles = list(RAW.movie_data['Movie name'].values)
movie_titles

['Ghosts of Mars',
 'Getting Away with Murder: The JonBenét Ramsey Mystery',
 'The Gangsters',
 "Alexander's Ragtime Band",
 'Little city',
 'Henry V',
 'Mary Poppins',
 'New Rose Hotel',
 'Freddy and the Song of the South Pacific',
 'Road to Life',
 'Camera Thrills',
 'Ferdinando I, re di Napoli',
 'Anbu Thozhi',
 'Baby Boy',
 'Bindiya Chamkegi',
 'Karayilekku Oru Kadal Dooram',
 'Kausthubham',
 'Archie: To Riverdale and Back Again',
 'Vinayaka Geleyara Balaga',
 'Woman Hungry',
 'The Sky Pirate',
 'Behind The Player: John 5',
 'Joan the Woman',
 'Follow the Crowd',
 'Jay Vejay',
 'Hamsun',
 'The Great New Wonderful',
 'Loverboy',
 "Cally's Comet",
 'Chandra Mukhi',
 'Mesmerized',
 'Klingende toner',
 'Bruce Lee: The Lost Interview',
 'It Came Upon the Midnight Clear',
 'Vipers',
 'Shivers',
 'My Friend Ganesha',
 'The Tango Player',
 'Aaj Ka Andha Kanoon',
 'The Little Hut',
 'Right to Love',
 'Closing the Ring',
 'The Hero: Love Story of a Spy',
 '33 Scenes from Life',
 "The Pine's 

The new extended dataframe is a copy of the raw movie metadata dataframe:

In [8]:
movie_data_ft_ratings_awards = RAW.movie_data.copy()

First, we scrape the ratings:

In [None]:
rating_list = []
i = 0

for movie in movie_titles:
    i += 1
    print(i)

    rating = get_movie_rating(movie)
    rating_list.append(rating)

In [None]:
pd.Series(rating_list).to_csv("scraped_data/ratings.csv", index=False)

Then, the revenues (before scraping for each movie, we check if the revenue is missing so we just request movies whose data is missing):

In [None]:
revenue_list = []
i = 0

for movie in movie_titles:
    i += 1
    print(i)

    revenue_in_df = movie_data_ft_ratings_awards[movie_data_ft_ratings_awards['Movie name'] == movie]['Revenue'].iloc[0]

    # Only if we find a NaN in the revenue column we request the revenue with the API
    if np.isnan(revenue_in_df):
        revenue = get_movie_revenue(movie)
    else:
        revenue = revenue_in_df

    revenue_list.append(revenue)

In [None]:
pd.Series(revenue_list).to_csv("scraped_data/revenues.csv", index=False)

Finally, the awards:

In [None]:
award_list = []
i = 0

for movie in movie_titles[20000:]:
    i += 1
    print(i)

    awards = get_movie_awards(movie)
    print(awards)

    if awards == None:
        wins, nominations = (None, None)
    else:
        wins, nominations = extract_wins_nominations(awards)

        # If there are wins but no specified nominations, just give the number of wins to the number of nominations
        if nominations == None and wins != None:
            nominations = wins
    
    print(wins, nominations)

    print("----------")

    award_list.append((wins, nominations))

In [None]:
pd.Series(award_list).to_csv("scraped_data/awards.csv", index=False)

#### Dataframe creation

After saving the data in CSV files, we can grab them and place them into variables:

In [7]:
ratings  = list(pd.read_csv("scraped_data/ratings.csv")['0'].values)
revenues = list(pd.read_csv("scraped_data/revenues.csv")['0'].values)

awards = list(pd.read_csv("scraped_data/awards.csv")['0'].values)
awards = [ast.literal_eval(award_string) for award_string in awards]

wins, nominations = zip(*awards)

We now add new columns inside of the new dataframe:

In [9]:
movie_data_ft_ratings_awards['IMDb rating'] = ratings
movie_data_ft_ratings_awards['Revenue'] = revenues
movie_data_ft_ratings_awards['Wins'] = wins
movie_data_ft_ratings_awards['Nominations'] = nominations

movie_data_ft_ratings_awards

Unnamed: 0,Wiki ID,Freebase ID,Movie name,Release date,Revenue,Runtime,Languages,Countries,Genres,IMDb rating,Wins,Nominations
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",4.9,,2.0
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",6.0,,
5,13696889,/m/03cfc81,The Gangsters,1913-05-29,,35.0,"{""/m/06ppq"": ""Silent film"", ""/m/02h40lc"": ""Eng...","{""/m/09c7w0"": ""United States of America""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen...",5.9,,
7,10408933,/m/02qc0j7,Alexander's Ragtime Band,1938-08-16,3600000.0,106.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/04t36"": ""Musical"", ""/m/01z4y"": ""Comedy"", ...",6.9,1.0,6.0
12,6631279,/m/0gffwj,Little city,1997-04-04,,93.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3n0w"": ...",5.8,,
...,...,...,...,...,...,...,...,...,...,...,...,...
81735,32468537,/m/0crwd9y,Shadow Boxing 2,2007-10-18,,132.0,"{""/m/06b_j"": ""Russian Language"", ""/m/02h40lc"":...","{""/m/06bnz"": ""Russia""}","{""/m/01z02hx"": ""Sports"", ""/m/0lsxr"": ""Crime Fi...",,,
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",4.6,,
81737,34980460,/m/0g4pl34,Knuckle,2011-01-21,2647.0,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0...",6.8,1.0,4.0
81738,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,,66.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06nbt"": ""Satire"", ""/m/01z4y"": ""Comedy""}",5.8,,


Here, we check that we indeed filled up the revenue column (the old dataframe contains over 32,000 whilst the new contains 28,000):

In [10]:
print(RAW.movie_data['Revenue'].isnull().sum())
print(movie_data_ft_ratings_awards['Revenue'].isnull().sum())

32579
28482


Finally, we can save the data inside of a CSV file in order to use it in other notebooks:

In [11]:
movie_data_ft_ratings_awards.to_csv("scraped_data/revenue_vs_rating_vs_awards.csv", index=False)