# Testing notebook for data ingestion
This notebook will include test scripts to ingest the necessary data from various web sources.

## Scrape Bechdel test of movies
Scrape data from http://bechdeltest.com/ using its given API. Note that according to the owner, we should avoid calling the `getAllMovies` module frequently due to a shared hosting plan. Due to this, I ran the get requests once and saved the copy as a csv file.

In [9]:
import io
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

main_dir = "/home/jdtganding/Documents/data-engineering-zoomcamp/week_7_PROJECT/data"

# html = requests.get('http://bechdeltest.com/api/v1/getAllMovies').content
# df = pd.read_json(io.StringIO(html.decode('utf-8')))
# df.to_csv(bechdel_movies, index=None)

bechdel_movies_df = pd.read_csv(f"{main_dir}/BechdelTestMovies.csv")
bechdel_movies_df.sample(10)

Unnamed: 0,rating,imdbid,id,title,year
4907,0,416449.0,1271,300,2006
4790,3,419773.0,4834,Gespenster,2005
440,2,31679.0,5946,Mr. Smith Goes to Washington,1939
9801,3,12048234.0,10426,Save the Cinema,2022
7900,3,2382422.0,8099,Jacky in the kingdom of women,2014
1156,1,58461.0,661,Per un pugno di dollari (A Fisful of Dollars),1964
7840,3,3104818.0,6724,"Like Sunday, Like Rain",2014
6648,3,1205558.0,4089,Hick,2011
4548,3,381707.0,4298,White Chicks,2004
9286,3,3281548.0,8949,Little Women,2019


## Scrape Oscars movie nominees and winners
The Academy Awards has their own database found on https://awardsdatabase.oscars.org/. I scraped the whole database from the 1st Academy Awards up to the latest using `selenium` and saved the page source as a variable that can be read using `BeautifulSoup`.

In [4]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.FirefoxOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')

driver = webdriver.Firefox(options=options)
driver.get("https://awardsdatabase.oscars.org/") 

#select award categories
driver.find_element(By.XPATH,"//button[contains(@class,'awards-basicsrch-awardcategory')]").click()
driver.find_element(By.XPATH,"//b[contains(text(),'Current Categories')]").click()

#select starting award year
driver.find_element(By.XPATH,"//button[contains(@class,'awards-advsrch-yearsfrom')]").click()
driver.find_element(By.XPATH,"//div[@class='btn-group multiselect-btn-group open']//input[@value='1']").click()

#select ending award year
driver.find_element(By.XPATH,"//button[contains(@class,'awards-advsrch-yearsto')]").click()
year_latest = len(driver.find_elements(By.XPATH,"//div[@class='btn-group multiselect-btn-group open']//li"))-2
driver.find_element(By.XPATH,f"//div[@class='btn-group multiselect-btn-group open']//input[@value='{year_latest}']").click()

#search to view results
driver.find_element(By.XPATH,'//*[@id="btnbasicsearch"]').click()

#wait for all results to show
time.sleep(60)

#get html source for BeautifulSoup extraction
page_source = driver.page_source

#close driver
driver.close()

### Use BeautifulSoup to extract elements and clean the page source
I saved the data for `id=resultscontainer` to avoid the long execution of the above code.

In [27]:
# soup = BeautifulSoup(page_source, "lxml")
# results_container = soup.find('div', {'id':'resultscontainer'})

# with open (f"{main_dir}/results.txt", "w") as file:
#     file.write(str(results_container))

In [5]:
results_container = open(f"{main_dir}/results.txt", "r")
results_container = BeautifulSoup(results_container, 'lxml')

award_year_all = results_container.find_all('div',class_='awards-result-chron result-group group-awardcategory-chron')

In [6]:
oscars_results_dict = {}

for award_year_group in award_year_all:

    #find the award year title
    award_year = award_year_group.find('div',class_='result-group-title')\
                                 .get_text(strip=True)
    
    #award category result subgroup (each contains award title and nominees)
    award_category_all = award_year_group.find_all('div',class_='result-subgroup subgroup-awardcategory-chron')

    award_subgroup_dict = {}
    for award_category_group in award_category_all:

        #dictionary to contain movie lists
        movies_dict = {"nominated":[], "won":[]}

        #find award title
        award_title = award_category_group.find('div',class_='result-subgroup-title')\
                                          .get_text(strip=True)
        
        try:
            #find nominated movies
            movies = [movie.get_text(strip=True) for movie in award_category_group.find_all('div', class_='awards-result-film-title')]

            #find winning movie
            winner_group = award_category_group.find('span', {'title':'Winner'})\
                                               .find_next_sibling('div')

            movies_dict["won"] = [movie.get_text(strip=True) for movie in winner_group.find_all('div', class_='awards-result-film-title')] 

            #remove duplicates and the winner from nominated list
            movies_dict["nominated"] = list(set(movies) - set(movies_dict["won"])) 
            
        except AttributeError:
            pass

        award_subgroup_dict[award_title] = movies_dict
    oscars_results_dict[award_year] = award_subgroup_dict

In [59]:
#save the raw file as a json 
import json

with open (f"{main_dir}/oscars_results_raw.json", "w", encoding='utf-8') as output:
    json.dump(oscars_results_dict, output)

In [8]:
oscars_results_dict['1931/32 (5th)']['ACTOR']

{'nominated': ['Dr. Jekyll and Mr. Hyde', 'The Guardsman'],
 'won': ['The Champ']}

In [63]:
oscars_results_dict.keys()

dict_keys(['1927/28 (1st)', '1928/29 (2nd)', '1929/30 (3rd)', '1930/31 (4th)', '1931/32 (5th)', '1932/33 (6th)', '1934 (7th)', '1935 (8th)', '1936 (9th)', '1937 (10th)', '1938 (11th)', '1939 (12th)', '1940 (13th)', '1941 (14th)', '1942 (15th)', '1943 (16th)', '1944 (17th)', '1945 (18th)', '1946 (19th)', '1947 (20th)', '1948 (21st)', '1949 (22nd)', '1950 (23rd)', '1951 (24th)', '1952 (25th)', '1953 (26th)', '1954 (27th)', '1955 (28th)', '1956 (29th)', '1957 (30th)', '1958 (31st)', '1959 (32nd)', '1960 (33rd)', '1961 (34th)', '1962 (35th)', '1963 (36th)', '1964 (37th)', '1965 (38th)', '1966 (39th)', '1967 (40th)', '1968 (41st)', '1969 (42nd)', '1970 (43rd)', '1971 (44th)', '1972 (45th)', '1973 (46th)', '1974 (47th)', '1975 (48th)', '1976 (49th)', '1977 (50th)', '1978 (51st)', '1979 (52nd)', '1980 (53rd)', '1981 (54th)', '1982 (55th)', '1983 (56th)', '1984 (57th)', '1985 (58th)', '1986 (59th)', '1987 (60th)', '1988 (61st)', '1989 (62nd)', '1990 (63rd)', '1991 (64th)', '1992 (65th)', '1993

### Transform json into structured format such as csv
We would want the following structure for our dataframe:
```python
df_structure = {
    "AwardYear":[],
    "AwardCeremonyNum":[],
    "Movie":[],
    "AwardCategory":[],
    "AwardStatus":[]
}
```

- `AwardYear`: the year the award was received
- `AwardCeremonyNum`: the nth annual ceremony award
- `Movie`: the title of the nominated film
- `AwardCategory`: the category the film was nominated for
- `AwardStatus`: whether the film was only nominated or had won

In [13]:
df_list = []

for key, value in oscars_results_dict.items():

    df_structure = {}

    #separate key strings to extract year
    key_split = key.split(" ")
    df_structure['AwardYear'] = key_split[0]
    df_structure['AwardCeremonyNum'] = re.findall(r'\d+',key_split[1])[0]

    #extract all movies per award year
    

In [14]:
df_structure

{'AwardYear': '2022', 'AwardCeremonyNum': '95'}