# Oscars.com Scraper and Data Saving: Stage 3

Data Collection, Integration and Preprocessing

Hochschule Luzern, 2024

Master's in Applied Information and Data Science

Dominik Bacher Suarez

The following code cleans the scraped data

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import json

pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', None)

## Load Stage 1 data

In [2]:
# Load stage 1 data
try:
    with open('../data/Bacher_Dominik_studentA_stage1.json', 'r') as file:
        raw_content = json.load(file)
except FileNotFoundError:
    print("File not found. Please check the file path.")
except json.JSONDecodeError:
    print("File is not a valid JSON.")

## Scrape Categories by Year

In [3]:
# Get all the categories given in the Oscars.
# This is needed for the next stage of the scraping, maiinly for the column names
all_categories = set()
categories_dictionary = dict()

for year, content in raw_content.items():
    # Iterate over the years and get the categories for each year
    soup = BeautifulSoup(content, "html.parser")
    categories = soup.find_all("div", {"class": "field--name-field-award-category-oscars"})
    all_categories.update([category.text.strip() for category in categories])
    # IMPURITY 2: Have to convert the set to a list to be able to save it to a CSV file
    categories_dictionary[year] = list(all_categories)

# IMPURITU 3: The categories are not sorted, so we sort them
all_categories = sorted(all_categories)

# Make dataframe with the columns as the categories, and index as the years
df_categories = pd.DataFrame(columns=all_categories, index=categories_dictionary.keys())
# IMPURITY 4: Many issued with the data types, infering fixes this
df_categories = df_categories.infer_objects(copy=False)
# IMPURITY 5: Fill the NaN values with 0, and convert the data type to integer
df_categories.fillna(int(0), inplace=True)
df_categories = df_categories.astype(int)
# Here we set the values to 1 if the category is present for the year
for year, category in categories_dictionary.items():
    for column in category:
        df_categories.loc[year, column] = 1

In [4]:
df_categories.sample(5)

Unnamed: 0,Actor in a Leading Role,Actor in a Supporting Role,Actress in a Leading Role,Actress in a Supporting Role,Animated Feature Film,Animated Short Film,Art Direction,Best Picture,Cinematography,Costume Design,Directing,Documentary (Feature),Documentary (Short Subject),Documentary Feature Film,Documentary Short Film,Film Editing,Foreign Language Film,International Feature Film,Irving G. Thalberg Memorial Award,Jean Hersholt Humanitarian Award,Live Action Short Film,Makeup,Makeup and Hairstyling,Music (Original Score),Music (Original Song),Production Design,Short Film (Animated),Short Film (Live Action),Sound,Sound Editing,Sound Mixing,Visual Effects,Writing (Adapted Screenplay),Writing (Original Screenplay)
2015,1,1,1,1,1,0,1,1,1,1,1,1,1,0,0,1,1,0,1,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1
2013,1,1,1,1,1,0,1,1,1,1,1,1,1,0,0,1,1,0,1,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1
2021,1,1,1,1,1,0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1
2023,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1
2020,1,1,1,1,1,0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1


## Process Awards Data

In [5]:
def get_movie_cast_html(award, actor=False):
    """ Get the movie cast for the award and category from the HTML content of oscars.org.

    Args:
        award (BeautifulSoup): The award data from oscars.org.
        actor (bool, optional): Whether to select award for an actor or a movie. Defaults to False.

    Returns:
        dict: The winner and nominees for the award.
    """
    if actor:
        index_cast, index_movie = 0, 1
    else:
        index_cast, index_movie = 1, 0

    # IMPURITY #6: The index of the cast and movie title might be switched depending on the award
    # First get the encasing div of the winner data
    winner_div = award.find('div', string=lambda text: text and 'Winner' in text).find_parent('div')
    # Get the actor and the movie title
    winner_cast = winner_div.find_all("div", {"class": "field__item"})[index_cast].text.strip()
    winner_movie = winner_div.find_all("div", {"class": "field__item"})[index_movie].text.strip()

    # Get the siblings from the winner div, these are the nominees
    nominees_divs = winner_div.find_parent('div').find_next_siblings('div')
    nominees_cast = []
    nominees_movies = []
    # Get the cast and movie title for each nominee
    for nominee in nominees_divs:
        nominees_cast.append(nominee.find_all("div", {"class": "field__item"})[index_cast].text.strip())
        nominees_movies.append(nominee.find_all("div", {"class": "field__item"})[index_movie].text.strip())

    # Save the data
    return {
        "winner_movie": winner_movie,
        "winner_cast": winner_cast,
        "nominees_movies": nominees_movies,
        "nominees_cast": nominees_cast
    }

In [6]:
# IMPURITY #7: The structure of the page is different for the awards for actors and actresses, so we need to handle them separately
exception_acting = ["actor", "actress", "international feature film"]

# create a dictionary to store all the awards
all_oscars_awards = dict()

for year, content in raw_content.items(): 
    soup = BeautifulSoup(content, "html.parser")
    # First filter the div with the awards
    content = soup.find("div", id="view-by-category-pane")
    # Select all the awards that are direct children of the div.field--name-field-award-categories
    all_awards = content.select("div.field--name-field-award-categories > div.field__item")

    all_awards_dict = dict()
    for award in all_awards:
        # Get the category of the award
        category = award.find("div", {"class": "field--name-field-award-category-oscars"}).text.strip()
        # Skip not wanted awards, since these are not for movie titles
        # Award's are given to single individuals, so we can skip these too
        if category == "Music (Original Song)" or "Award" in category:
            # IMPURITY #8: Music (Original Song) AND Award are not related to movie titles
            continue

        # Scraping categories for Actors and Actresses
        # First scarpe awards for actor and actresses since the structure is different, then the rest
        # IMPURITY 9: The years 2019 and 2018 have a different structure for the foreign language film category
        if any(keyword in category.lower() for keyword in exception_acting) or ("foreign language film" in category.lower() and year in [2018, 2019]):
            all_awards_dict[category] = get_movie_cast_html(award, actor=True)

        # Now the rest of the awards that are not for actors and actresses
        else:
            all_awards_dict[category] = get_movie_cast_html(award, actor=False)
    
    # Save the data for each year
    # IMPURITY 10: convert the year to an integer
    all_oscars_awards[int(year)] = all_awards_dict

In [7]:
print(f"The variable all_oscars_awards has {len(all_oscars_awards)} elements, and is of type {type(all_oscars_awards)}")

The variable all_oscars_awards has 16 elements, and is of type <class 'dict'>


In [8]:
# Visalize some random samples of the data
print("Random sample of award for Actor in a Leading Role in 2010:")
print(all_oscars_awards[2010]["Actor in a Leading Role"])

print("\n")

print("Random sample of winner for the award for Best Picture in 2014:")
print(all_oscars_awards[2014]["Best Picture"]["winner_movie"])

Random sample of award for Actor in a Leading Role in 2010:
{'winner_movie': 'Crazy Heart', 'winner_cast': 'Jeff Bridges', 'nominees_movies': ['Up in the Air', 'A Single Man', 'Invictus', 'The Hurt Locker'], 'nominees_cast': ['George Clooney', 'Colin Firth', 'Morgan Freeman', 'Jeremy Renner']}


Random sample of winner for the award for Best Picture in 2014:
12 Years a Slave


## Create Dataframe With All Awards data

In [9]:
# Make a unique list of all the movies that have been nominated or won an Oscar
unique_movies = set()
for year, ceremony in all_oscars_awards.items():
    for category, awardess in ceremony.items():
        # Add the winner and the nominees to the set
        unique_movies.update([awardess["winner_movie"]]) # <-- IMPURITY #11: Only one winner, so make it a list
        unique_movies.update(awardess["nominees_movies"])
unique_movies = sorted(unique_movies, key=lambda x: x.lower())

# Now create the dataframe
df_awards = pd.DataFrame(columns=["Movie Title", "Oscar Ceremony Year", "Cast Involved"] + all_categories,
                         index=range(len(unique_movies)))
df_awards["Movie Title"] = unique_movies

# Iterate over the years and the categories to fill the boolean of Winner and Nominated
for year, ceremony in all_oscars_awards.items():
    for category, awardess in ceremony.items():
        # WINNERS
        # Get Boolean column if the movie is a winner or not
        movie_title_name = df_awards["Movie Title"] == awardess["winner_movie"]
        # Add the year of the ceremony
        df_awards.loc[movie_title_name, "Oscar Ceremony Year"] = year
        # Add the cast involved
        df_awards.loc[movie_title_name, "Cast Involved"] = df_awards.loc[movie_title_name, "Cast Involved"].apply(
    lambda x: x + ", " + awardess["winner_cast"] if pd.notna(x) else awardess["winner_cast"] # <-- IMPURITY #12: Ignore NAs as they cause errors
)
        # Specify the boolean Winner for the category won
        df_awards.loc[movie_title_name, category] = "Winner"

        for nominees_movie, nominees_cast in zip(awardess["nominees_movies"], awardess["nominees_cast"]):
            # NOMINEES
            # Get Boolean column if the movie is nominated or not
            movie_title_name = df_awards["Movie Title"] == nominees_movie
            # Add the year of the ceremony
            df_awards.loc[movie_title_name, "Oscar Ceremony Year"] = year
            # Add the cast involved
            df_awards.loc[movie_title_name, "Cast Involved"] = df_awards.loc[movie_title_name, "Cast Involved"].apply(
                lambda x: x + ", " + nominees_cast if pd.notna(x) else nominees_cast
            )
            # Specify the boolean Nominated for the category nominated
            df_awards.loc[movie_title_name, category] = "Nominated" 

In [10]:
df_awards.sample(10)

Unnamed: 0,Movie Title,Oscar Ceremony Year,Cast Involved,Actor in a Leading Role,Actor in a Supporting Role,Actress in a Leading Role,Actress in a Supporting Role,Animated Feature Film,Animated Short Film,Art Direction,Best Picture,Cinematography,Costume Design,Directing,Documentary (Feature),Documentary (Short Subject),Documentary Feature Film,Documentary Short Film,Film Editing,Foreign Language Film,International Feature Film,Irving G. Thalberg Memorial Award,Jean Hersholt Humanitarian Award,Live Action Short Film,Makeup,Makeup and Hairstyling,Music (Original Score),Music (Original Song),Production Design,Short Film (Animated),Short Film (Live Action),Sound,Sound Editing,Sound Mixing,Visual Effects,Writing (Adapted Screenplay),Writing (Original Screenplay)
141,Cartel Land,2016,Matthew Heineman and Tom Yellin,,,,,,,,,,,,Nominated,,,,,,,,,,,,,,,,,,,,,,
423,Marriage Story,2020,"Adam Driver, Scarlett Johansson, Laura Dern, Randy Newman, Noah Baumbach and David Heyman, Produ...",Nominated,,Nominated,Winner,,,,Nominated,,,,,,,,,,,,,,,,Nominated,,,,,,,,,,Nominated
595,St. Louis Superman,2020,Smriti Mundhra and Sami Khan,,,,,,,,,,,,,Nominated,,,,,,,,,,,,,,,,,,,,,
541,RBG,2019,Betsy West and Julie Cohen,,,,,,,,,,,,Nominated,,,,,,,,,,,,,,,,,,,,,,
87,Barbie,2024,"Ryan Gosling, America Ferrera, Jacqueline Durran, David Heyman, Margot Robbie, Tom Ackerley and ...",,Nominated,,Nominated,,,,Nominated,,Nominated,,,,,,,,,,,,,,,,Nominated,,,,,,,Nominated,
840,West Side Story,2022,"Ariana DeBose, Janusz Kaminski, Paul Tazewell, Steven Spielberg, Steven Spielberg and Kristie Ma...",,,,Winner,,,,Nominated,Nominated,Nominated,Nominated,,,,,,,,,,,,,,,Nominated,,,Nominated,,,,,
800,Top Gun: Maverick,2023,"Eddie Hamilton, Tom Cruise, Christopher McQuarrie, David Ellison and Jerry Bruckheimer, Producer...",,,,,,,,Nominated,,,,,,,,Nominated,,,,,,,,,,,,,Winner,,,Nominated,Nominated,
22,A Prophet,2010,France,,,,,,,,,,,,,,,,,Nominated,,,,,,,,,,,,,,,,,
291,Hidden Figures,2017,"Octavia Spencer, Donna Gigliotti, Peter Chernin, Jenno Topping, Pharrell Williams and Theodore M...",,,,Nominated,,,,Nominated,,,,,,,,,,,,,,,,,,,,,,,,,Nominated,
173,Darkest Hour,2018,"Gary Oldman, Bruno Delbonnel, Jacqueline Durran, Kazuhiro Tsuji, David Malinowski and Lucy Sibbi...",Winner,,,,,,,Nominated,Nominated,Nominated,,,,,,,,,,,,,Winner,,,Nominated,,,,,,,,


## Outliers Correction

In [11]:
# IMPURITY #13: The movie titles have some outliers that need to be replaced with the propper movie title
outliers = {
    'Demián Bichir': 'A Better Life',
    'Dimanche/Sunday': 'Dimanche/Sunday (Sunday)',
    'La Maison en Petits Cubes': 'La Maison en Petits Cubes (The House of Small Cubes)',
    'Lavatory - Lovestory': 'Lavatory - Lovestory (Ubornaya istoriya - lyubovnaya istoriya)',
    'Les Mis_rables': 'Les Misérables',
    'Maggie Simpson in "The Longest Daycare"': 'Maggie Simpson in "The Longest Daycare" (The Longest Daycare)',
    'Manon on the Asphalt': 'Manon on the Asphalt (Manon sur le bitume)',
    "Marvel's The Avengers": "Marvel's The Avengers (The Avengers)",
    'Pit__k_ Mun Kaikki Hoitaa? (Do I Have to Take Care of Everything?)': 'Pitääkö mun kaikki hoitaa? (Do I Have to Take Care of Everything?)',
    'Rabbit à la Berlin': 'Rabbit à la Berlin (Królik po berlinsku)',
    "We Can’t Live without Cosmos" : "We Can't Live without Cosmos (Мы не можем жить без космоса)",
    "Bestia" : "Bestia (Beast)",
    "No Time To Die" : "No Time to Die"

}

df_awards["Movie Title"] = df_awards["Movie Title"].replace(outliers)

## Enrichment

### Enrichment One: Won Oscar Boolean

Add a column that specifies if a movies won any oscar or none at all

In [12]:
# Add winners, add to initial data though
def check_winner(row):
    categories = df_awards.drop(columns=["Movie Title", "Oscar Ceremony Year", "Cast Involved"]).columns
    for category in categories:
        if row.get(category) == 'Winner':
            return True
    return False

df_awards['Won_Oscars'] = df_awards.apply(check_winner, axis=1)

In [13]:
df_awards.sample(2)

Unnamed: 0,Movie Title,Oscar Ceremony Year,Cast Involved,Actor in a Leading Role,Actor in a Supporting Role,Actress in a Leading Role,Actress in a Supporting Role,Animated Feature Film,Animated Short Film,Art Direction,Best Picture,Cinematography,Costume Design,Directing,Documentary (Feature),Documentary (Short Subject),Documentary Feature Film,Documentary Short Film,Film Editing,Foreign Language Film,International Feature Film,Irving G. Thalberg Memorial Award,Jean Hersholt Humanitarian Award,Live Action Short Film,Makeup,Makeup and Hairstyling,Music (Original Score),Music (Original Song),Production Design,Short Film (Animated),Short Film (Live Action),Sound,Sound Editing,Sound Mixing,Visual Effects,Writing (Adapted Screenplay),Writing (Original Screenplay),Won_Oscars
654,The Descendants,2012,"George Clooney, Alexander Payne, Kevin Tent, Jim Burke, Alexander Payne and Jim Taylor, Producer...",Nominated,,,,,,,Nominated,,,Nominated,,,,,Nominated,,,,,,,,,,,,,,,,,Winner,,True
599,Star Wars: The Force Awakens,2016,"Maryann Brandon and Mary Jo Markey, John Williams, Matthew Wood and David Acord, Andy Nelson, Ch...",,,,,,,,,,,,,,,,Nominated,,,,,,,,Nominated,,,,,,Nominated,Nominated,Nominated,,,False


### Enrichment 2: Amount of Oscars won

Add a column that counts the amount of oscars won

In [14]:
def count_oscars_won(row):
    categories = df_awards.drop(columns=["Movie Title", "Oscar Ceremony Year", "Cast Involved"]).columns
    count = 0
    for category in categories:
        if row.get(category) == 'Winner':
            count += 1
    return count

df_awards['Oscars_Won'] = df_awards.apply(count_oscars_won, axis=1)

In [15]:
df_awards.sample(2)

Unnamed: 0,Movie Title,Oscar Ceremony Year,Cast Involved,Actor in a Leading Role,Actor in a Supporting Role,Actress in a Leading Role,Actress in a Supporting Role,Animated Feature Film,Animated Short Film,Art Direction,Best Picture,Cinematography,Costume Design,Directing,Documentary (Feature),Documentary (Short Subject),Documentary Feature Film,Documentary Short Film,Film Editing,Foreign Language Film,International Feature Film,Irving G. Thalberg Memorial Award,Jean Hersholt Humanitarian Award,Live Action Short Film,Makeup,Makeup and Hairstyling,Music (Original Score),Music (Original Song),Production Design,Short Film (Animated),Short Film (Live Action),Sound,Sound Editing,Sound Mixing,Visual Effects,Writing (Adapted Screenplay),Writing (Original Screenplay),Won_Oscars,Oscars_Won
279,Harry Potter and the Half-Blood Prince,2010,Bruno Delbonnel,,,,,,,,,Nominated,,,,,,,,,,,,,,,,,,,,,,,,,,False,0
142,Causeway,2023,Brian Tyree Henry,,Nominated,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,0


### Enrichment 3: Percentage of awards won againts all awards

In [16]:
total_oscars_by_year = df_awards.groupby('Oscar Ceremony Year')['Oscars_Won'].sum().reset_index()
total_oscars_by_year.rename(columns={'Oscars_Won': 'Total_Oscars_Awarded'}, inplace=True)

df_awards = pd.merge(df_awards, total_oscars_by_year, on='Oscar Ceremony Year', how='left')

df_awards['Percentage of Wins'] = round((df_awards['Oscars_Won'] / df_awards['Total_Oscars_Awarded']) * 100, 2)

In [17]:
df_awards.sample(2)

Unnamed: 0,Movie Title,Oscar Ceremony Year,Cast Involved,Actor in a Leading Role,Actor in a Supporting Role,Actress in a Leading Role,Actress in a Supporting Role,Animated Feature Film,Animated Short Film,Art Direction,Best Picture,Cinematography,Costume Design,Directing,Documentary (Feature),Documentary (Short Subject),Documentary Feature Film,Documentary Short Film,Film Editing,Foreign Language Film,International Feature Film,Irving G. Thalberg Memorial Award,Jean Hersholt Humanitarian Award,Live Action Short Film,Makeup,Makeup and Hairstyling,Music (Original Score),Music (Original Song),Production Design,Short Film (Animated),Short Film (Live Action),Sound,Sound Editing,Sound Mixing,Visual Effects,Writing (Adapted Screenplay),Writing (Original Screenplay),Won_Oscars,Oscars_Won,Total_Oscars_Awarded,Percentage of Wins
842,When Marnie Was There,2016,Hiromasa Yonebayashi and Yoshiaki Nishimura,,,,,Nominated,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,0,23,0.0
451,Mudbound,2018,"Mary J. Blige, Rachel Morrison, Screenplay by Virgil Williams and Dee Rees",,,,Nominated,,,,,Nominated,,,,,,,,,,,,,,,,,,,,,,,,Nominated,,False,0,22,0.0


## Save Data

In [18]:
df_awards.to_csv("../data/Bacher_Dominik_studentA_stage3.csv", index=False)

In [19]:
df_awards.columns

Index(['Movie Title', 'Oscar Ceremony Year', 'Cast Involved',
       'Actor in a Leading Role', 'Actor in a Supporting Role',
       'Actress in a Leading Role', 'Actress in a Supporting Role',
       'Animated Feature Film', 'Animated Short Film', 'Art Direction',
       'Best Picture', 'Cinematography', 'Costume Design', 'Directing',
       'Documentary (Feature)', 'Documentary (Short Subject)',
       'Documentary Feature Film', 'Documentary Short Film', 'Film Editing',
       'Foreign Language Film', 'International Feature Film',
       'Irving G. Thalberg Memorial Award', 'Jean Hersholt Humanitarian Award',
       'Live Action Short Film', 'Makeup', 'Makeup and Hairstyling',
       'Music (Original Score)', 'Music (Original Song)', 'Production Design',
       'Short Film (Animated)', 'Short Film (Live Action)', 'Sound',
       'Sound Editing', 'Sound Mixing', 'Visual Effects',
       'Writing (Adapted Screenplay)', 'Writing (Original Screenplay)',
       'Won_Oscars', 'Oscars_Won'