# Testing notebook for scraping data 
This notebook will include test scripts to scrape the necessary data from various web sources.

## Scrape Bechdel test of movies
Scrape data from http://bechdeltest.com/ using its given API. Note that according to the owner, we should avoid calling the `getAllMovies` module frequently due to a shared hosting plan. Due to this, I ran the get requests once and saved the copy as a csv file.

In [57]:
import io
import re
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

main_dir = "/home/jdtganding/Documents/data-engineering-zoomcamp/week_7_PROJECT/data"

# html = requests.get('http://bechdeltest.com/api/v1/getAllMovies').content
# df = pd.read_json(io.StringIO(html.decode('utf-8')))
# df.to_csv(bechdel_movies, index=None)

bechdel_movies_df = pd.read_csv(f"{main_dir}/BechdelTestMovies.csv")
bechdel_movies_df.sample(10)

Unnamed: 0,rating,imdbid,id,title,year
4199,1,265459.0,6239,One Hour Photo,2002
6654,3,1700467.0,4104,L&#39;art d&#39;aimer,2011
9131,3,6010628.0,10089,Zoe,2018
536,0,35881.0,5554,Fires Were Started,1943
6308,3,1307858.0,4274,All About Evil,2010
78,0,1672718.0,9949,Alger: rue Bab-Azoun,1896
4412,3,383206.0,8879,Barbie of Swan Lake,2003
4521,3,396184.0,2995,With Blood on My Hands: Pusher II,2004
227,2,18839.0,5984,"Docks of New York, The",1928
7914,3,3717242.0,8561,Out in the Night,2014


## Scrape Oscars movie nominees and winners
The Academy Awards has their own database found on https://awardsdatabase.oscars.org/. I scraped the whole database from the 1st Academy Awards up to the latest using `selenium` and saved the page source as a variable that can be read using `BeautifulSoup`.

In [4]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.FirefoxOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')

driver = webdriver.Firefox(options=options)
driver.get("https://awardsdatabase.oscars.org/") 

#select award categories
driver.find_element(By.XPATH,"//button[contains(@class,'awards-basicsrch-awardcategory')]").click()
driver.find_element(By.XPATH,"//b[contains(text(),'Current Categories')]").click()

#select starting award year
driver.find_element(By.XPATH,"//button[contains(@class,'awards-advsrch-yearsfrom')]").click()
driver.find_element(By.XPATH,"//div[@class='btn-group multiselect-btn-group open']//input[@value='1']").click()

#select ending award year
driver.find_element(By.XPATH,"//button[contains(@class,'awards-advsrch-yearsto')]").click()
year_latest = len(driver.find_elements(By.XPATH,"//div[@class='btn-group multiselect-btn-group open']//li"))-2
driver.find_element(By.XPATH,f"//div[@class='btn-group multiselect-btn-group open']//input[@value='{year_latest}']").click()

#search to view results
driver.find_element(By.XPATH,'//*[@id="btnbasicsearch"]').click()

#wait for all results to show
time.sleep(60)

#get html source for BeautifulSoup extraction
page_source = driver.page_source

#close driver
driver.close()

### Use BeautifulSoup to extract elements and clean the page source
I saved the data for `id=resultscontainer` to avoid the long execution of the above code.

In [27]:
# soup = BeautifulSoup(page_source, "lxml")
# results_container = soup.find('div', {'id':'resultscontainer'})

# with open (f"{main_dir}/results.txt", "w") as file:
#     file.write(str(results_container))

In [5]:
results_container = open(f"{main_dir}/results.txt", "r")
results_container = BeautifulSoup(results_container, 'lxml')

award_year_all = results_container.find_all('div',class_='awards-result-chron result-group group-awardcategory-chron')

In [6]:
oscars_results_dict = {}

for award_year_group in award_year_all:

    #find the award year title
    award_year = award_year_group.find('div',class_='result-group-title')\
                                 .get_text(strip=True)
    
    #award category result subgroup (each contains award title and nominees)
    award_category_all = award_year_group.find_all('div',class_='result-subgroup subgroup-awardcategory-chron')

    award_subgroup_dict = {}
    for award_category_group in award_category_all:

        #dictionary to contain movie lists
        movies_dict = {"nominated":[], "won":[]}

        #find award title
        award_title = award_category_group.find('div',class_='result-subgroup-title')\
                                          .get_text(strip=True)
        
        try:
            #find nominated movies
            movies = [movie.get_text(strip=True) for movie in award_category_group.find_all('div', class_='awards-result-film-title')]

            #find winning movie
            winner_group = award_category_group.find('span', {'title':'Winner'})\
                                               .find_next_sibling('div')

            movies_dict["won"] = [movie.get_text(strip=True) for movie in winner_group.find_all('div', class_='awards-result-film-title')] 

            #remove duplicates and the winner from nominated list
            movies_dict["nominated"] = list(set(movies) - set(movies_dict["won"])) 
            
        except AttributeError:
            pass

        award_subgroup_dict[award_title] = movies_dict
    oscars_results_dict[award_year] = award_subgroup_dict

In [59]:
#save the raw file as a json 
import json

with open (f"{main_dir}/oscars_results_raw.json", "w", encoding='utf-8') as output:
    json.dump(oscars_results_dict, output)

### Transform json into structured format such as csv
We would want the following structure for our dataframe:
```python
df_structure = {
    "AwardYear":[],
    "AwardCeremonyNum":[],
    "Movie":[],
    "AwardCategory":[],
    "AwardStatus":[]
}
```

- `AwardYear`: the year the award was received
- `AwardCeremonyNum`: the nth annual ceremony award
- `Movie`: the title of the nominated film
- `AwardCategory`: the category the film was nominated for
- `AwardStatus`: whether the film was only nominated or had won

In [91]:
df_list = []
for key, values in oscars_results_dict.items():

    df_structure = {
        "AwardYear":'',
        "AwardCeremonyNum":'',
        "Movie":[],
        "AwardCategory":[],
        "AwardStatus":[]
    }

    #separate key strings to extract year
    key_split = key.split(" ")
    df_structure['AwardYear'] = key_split[0]
    df_structure['AwardCeremonyNum'] = re.findall(r'\d+',key_split[1])[0]

    #extract all movies per award year
    for categ, movies in values.items():
        movies_concat = sum(list(movies.values()),[])
        count = len(movies_concat)

        if count > 0:
            df_structure['Movie'].extend(movies_concat)
            df_structure['AwardCategory'].extend(list(np.repeat([categ],count)))

            for status, movie in movies.items():
                df_structure['AwardStatus'].extend(list(np.repeat([status],len(movie))))
    
    #append dataframe to list
    df_list.append(pd.DataFrame(df_structure))

#concatenate all award year dataframe into one    
oscars_results_df = pd.concat(df_list)

#save copy of dataframe
oscars_results_df.to_csv(f"{main_dir}/oscars_results_raw.csv", index=False)

In [88]:
oscars_results_df.sample(10)

Unnamed: 0,AwardYear,AwardCeremonyNum,Movie,AwardCategory,AwardStatus
6,1957,30,The Bridge on the River Kwai,ACTOR IN A SUPPORTING ROLE,nominated
78,2022,95,The Banshees of Inisherin,BEST PICTURE,nominated
31,1966,39,The Fortune Cookie,CINEMATOGRAPHY (Black-and-White),nominated
124,1942,15,Wake Island,OUTSTANDING MOTION PICTURE,nominated
5,1986,59,Platoon,ACTOR IN A SUPPORTING ROLE,nominated
62,1941,14,Russian Soil,DOCUMENTARY (Short Subject),nominated
24,2007,80,Atonement,ART DIRECTION,nominated
154,1943,16,Riding High,SOUND RECORDING,nominated
50,1968,41,The Odd Couple,FILM EDITING,nominated
62,1969,42,The Secret of Santa Vittoria,MUSIC (Original Score--for a motion picture [n...,nominated
