# Testing notebook for scraping data 
This notebook will include test scripts to scrape the necessary data from various web sources.

## Scrape Bechdel test of movies
Scrape data from http://bechdeltest.com/ using its given API. Note that according to the owner, we should avoid calling the `getAllMovies` module frequently due to a shared hosting plan. Due to this, I ran the get requests once and saved the copy as a csv file.

In [1]:
import io
import re
import time
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

main_dir = "/home/jdtganding/Documents/bechdel-movies-project/data"

# html = requests.get('http://bechdeltest.com/api/v1/getAllMovies').content
# df = pd.read_json(io.StringIO(html.decode('utf-8')))
# df.to_csv(bechdel_movies, index=None)

bechdel_movies_df = pd.read_csv(f"{main_dir}/BechdelTestMovieList.csv")
# bechdel_movies_df['imdbid'] = bechdel_movies_df['imdbid'].astype(int)
bechdel_movies_df.sample(10)

FileNotFoundError: [Errno 2] No such file or directory: '/home/jdtganding/Documents/bechdel-movies-project/data/BechdelTestMovieList.csv'

In [10]:
bechdel_movies_df[bechdel_movies_df['imdbid'].isnull()]

Unnamed: 0,rating,imdbid,id,title,year
8604,3,,10131,Moana,2016
9152,1,,10659,No me digas solterona,2018
9153,1,,10661,Oceans 8,2018
9393,3,,10029,Decedants 3,2019
9699,3,,10231,Wrong Turn,2021


In [6]:
bechdel_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9908 entries, 0 to 9907
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   rating  9908 non-null   int64  
 1   imdbid  9903 non-null   float64
 2   id      9908 non-null   int64  
 3   title   9908 non-null   object 
 4   year    9908 non-null   int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 387.2+ KB


In [37]:
bechdel_movies_df[bechdel_movies_df.year < 1927]

Unnamed: 0,rating,imdbid,id,title,year
0,0,3155794.0,9602,Passage de Venus,1874
1,0,14495706.0,9804,La Rosace Magique,1877
2,0,2221420.0,9603,Sallie Gardner at a Gallop,1878
3,0,12592084.0,9806,Le singe musicien,1878
4,0,7816420.0,9816,Athlete Swinging a Pick,1881
...,...,...,...,...,...
198,1,16884.0,1260,Flesh and the Devil,1926
199,0,17925.0,1310,"General, The",1926
200,0,16895.0,1966,For Heaven&#39;s Sake,1926
201,3,17410.0,3217,So&#39;s Your Old Man,1926


## Scrape Oscars movie nominees and winners
The Academy Awards has their own database found on https://awardsdatabase.oscars.org/. I scraped the whole database from the 1st Academy Awards up to the latest using `selenium` and saved the page source as a variable that can be read using `BeautifulSoup`.

In [95]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

options = webdriver.FirefoxOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')

driver = webdriver.Firefox(options=options)
driver.get("https://awardsdatabase.oscars.org/") 

#select award categories
driver.find_element(By.XPATH,"//button[contains(@class,'awards-basicsrch-awardcategory')]").click()
driver.find_element(By.XPATH,"//b[contains(text(),'Current Categories')]").click()

#select starting award year
driver.find_element(By.XPATH,"//button[contains(@class,'awards-advsrch-yearsfrom')]").click()
driver.find_element(By.XPATH,"//div[@class='btn-group multiselect-btn-group open']//input[@value='1']").click()

#select ending award year
driver.find_element(By.XPATH,"//button[contains(@class,'awards-advsrch-yearsto')]").click()
year_latest = len(driver.find_elements(By.XPATH,"//div[@class='btn-group multiselect-btn-group open']//li"))-2
driver.find_element(By.XPATH,f"//div[@class='btn-group multiselect-btn-group open']//input[@value='{year_latest}']").click()

#search to view results
# driver.find_element(By.XPATH,'//*[@id="btnbasicsearch"]').click()

#wait for all results to show
# time.sleep(60)

try:
    #resultscontainer will contain all our needed Oscars data
    driver.find_element(By.XPATH, '//*[@id="resultscontainer"]')

except NoSuchElementException as error:
    print(error)
    print(f"Needed element still not found after 60 seconds delay")

#get html source for BeautifulSoup extraction
page_source = driver.page_source

#close driver
driver.close()
print("Driver closed")

Message: Unable to locate element: //*[@id="resultscontainer"]
Stacktrace:
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:186:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:398:5
element.find/</<@chrome://remote/content/marionette/element.js:300:16

Needed element still not found after 60 seconds delay
Driver closed


### Transform json into structured format such as csv
First, we will use BeautifulSoup to extract elements and clean the page source. Then we would want the following structure for our dataframe:
```python
df_structure = {
    "AwardYear":[],
    "AwardCeremonyNum":[],
    "Movie":[],
    "AwardCategory":[],
    "AwardStatus":[]
}
```

- `AwardYear`: the year the award was received
- `AwardCeremonyNum`: the nth annual ceremony award
- `Movie`: the title of the nominated film
- `AwardCategory`: the category the film was nominated for
- `AwardStatus`: whether the film was only nominated or had won

In [27]:
# soup = BeautifulSoup(page_source, "lxml")
# results_container = soup.find('div', {'id':'resultscontainer'})

# with open (f"{main_dir}/OscarsResultsContainerHTML.txt", "w") as file:
#     file.write(str(results_container))

In [2]:
results_container = open(f"{main_dir}/OscarsResultsContainerHTML.txt", "r")
results_container = BeautifulSoup(results_container, 'lxml')

award_year_all = results_container.find_all('div',class_='awards-result-chron result-group group-awardcategory-chron')

In [90]:
oscars_results = []

for award_year_group in award_year_all:

    df_structure = {
        "AwardYear":'',
        "AwardCeremonyNum":'',
        "Movie":[],
        "AwardCategory":[],
        "AwardStatus": []
    }

    #find the award year title
    award_year = award_year_group.find('div',class_='result-group-title')\
                                 .get_text(strip=True)

    #separate award year title to extract year
    key_split = award_year.split(" ")
    df_structure['AwardYear'] = key_split[0]
    df_structure['AwardCeremonyNum'] = re.findall(r'\d+',key_split[1])[0]
    
    #award category result subgroup (each contains award title and nominees)
    award_category_all = award_year_group.find_all('div',class_='result-subgroup subgroup-awardcategory-chron')

    for award_category_group in award_category_all:

        #find award title
        award_title = award_category_group.find('div',class_='result-subgroup-title')\
                                          .get_text(strip=True)
        
        try:
            #find nominated movies
            movies = [movie.get_text(strip=True) for movie in award_category_group\
                           .find_all('div', class_='awards-result-film-title')]

            #remove duplicates
            movies = list(set(movies)) 

            #find winning movie
            winner_group = award_category_group.find('span', {'title':'Winner'})\
                                               .find_next_sibling('div')

            winners = [movie.get_text(strip=True) for movie in winner_group\
                            .find_all('div', class_='awards-result-film-title')]

            #update df_structure movie and category lists
            count = len(movies)
            if count > 0:
                df_structure['Movie'].extend(movies)
                df_structure['AwardCategory'].extend(list(np.repeat([award_title],count)))

                #add winner/s
                categ_list = list(np.repeat(['nominated'],count))
                for winner in winners:
                    categ_list[movies.index(winner)] = 'won'

                df_structure['AwardStatus'].extend(categ_list)
      
        except AttributeError:
            pass

    #append dataframe to list
    oscars_results.append(pd.DataFrame(df_structure))

#concatenate all award year dataframe into one    
oscars_results_final = pd.concat(oscars_results).reset_index(drop=True)

#save data as a csv file
oscars_results_final.to_csv(f"{main_dir}/OscarsFullResults.csv", index=False)

## Collecting IMDB datasets

In [2]:
imdb_titles = pd.read_csv('https://datasets.imdbws.com/title.basics.tsv.gz', 
                          chunksize=500_000,
                          iterator=True,
                          sep='\t',
                          header=0)

In [3]:
count = 0
while True:
    count += 1
    imdb_titles_iter = next(imdb_titles)
    imdb_titles_iter.to_csv(f"{main_dir}/imdb_title_part{count}.csv", index=False)

  imdb_titles_iter = next(imdb_titles)


StopIteration: 

In [7]:
imdb_titles_iter['tconst'].str.replace('tt','').astype(int)

9500000    9393684
9500001    9393686
9500002    9393688
9500003    9393690
9500004    9393692
            ...   
9742689    9916848
9742690    9916850
9742691    9916852
9742692    9916856
9742693    9916880
Name: tconst, Length: 242694, dtype: int64

In [24]:
agg = imdb_titles.groupby('startYear')\
                 .agg({'startYear':'count'})\
                 .rename(columns={'startYear':'count'})\
                 .reset_index()
agg = agg[~agg.startYear.str.contains('N')]

In [55]:
agg[(agg.startYear > '1873')&(agg.startYear < '2024')]

Unnamed: 0,startYear,count
0,1874,1
1,1877,4
2,1878,3
3,1881,2
4,1882,2
...,...,...
140,2019,421086
141,2020,396868
142,2021,450984
143,2022,393100


## Using tmdb API to collect data

In [6]:
from configparser import ConfigParser

config = ConfigParser()
config.read('/home/jdtganding/Documents/bechdel-movies-project/api_keys.cfg')

API_KEY = config.get('tmdb', 'api_key')

In [50]:
total_movies = 0
for year in range(1874,2023):
    api = f'https://api.themoviedb.org/3/discover/movie?api_key={API_KEY}&primary_release_year={year}'
    response = requests.get(api)
    movies = response.json()
    total_movies += movies['total_results']

print(f"Total movies to scrape: {total_movies}")

Total movies to scrape: 679397


In [58]:
response = requests.get(f'https://api.themoviedb.org/3/discover/movie?api_key={API_KEY}&primary_release_year=2021&page=10')
movies_json = response.json()

movies_json['results'][0]

{'adult': False,
 'backdrop_path': '/byfW3SL4xExI3DDXxTpfFJiXn7P.jpg',
 'genre_ids': [18],
 'id': 853361,
 'original_language': 'da',
 'original_title': 'Du som er i himlen',
 'overview': 'One day at the end of the 1800s, 14-year-old Lise’s life is changed forever. She is the eldest of her siblings, the first in her family to go to school and full of hope and confidence in life. But when her mother goes into labour, it quickly appears that something is wrong. As night falls and the labour progresses, Lise begins to understand that a day that began in childhood might end with her becoming the woman of the house.',
 'popularity': 29.699,
 'poster_path': '/vbypiQf40ReopnfbW9eFKJmboA0.jpg',
 'release_date': '2021-09-19',
 'title': 'As in Heaven',
 'video': False,
 'vote_average': 6.5,
 'vote_count': 21}

In [12]:
movie_id = 853361
url = f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={API_KEY}&append_to_response=credits&page=5'
response = requests.get(url)
movies = response.json()
movies

{'adult': False,
 'backdrop_path': '/byfW3SL4xExI3DDXxTpfFJiXn7P.jpg',
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 18, 'name': 'Drama'}],
 'homepage': '',
 'id': 853361,
 'imdb_id': 'tt13269616',
 'original_language': 'da',
 'original_title': 'Du som er i himlen',
 'overview': 'One day at the end of the 1800s, 14-year-old Lise’s life is changed forever. She is the eldest of her siblings, the first in her family to go to school and full of hope and confidence in life. But when her mother goes into labour, it quickly appears that something is wrong. As night falls and the labour progresses, Lise begins to understand that a day that began in childhood might end with her becoming the woman of the house.',
 'popularity': 33.618,
 'poster_path': '/vbypiQf40ReopnfbW9eFKJmboA0.jpg',
 'production_companies': [{'id': 174380,
   'logo_path': None,
   'name': 'Motor ApS',
   'origin_country': ''}],
 'production_countries': [{'iso_3166_1': 'DK', 'name': 'Denmark'}],
 'release_d

In [52]:
#dictionary to contain total number of pages
total_pages = {}
url = f'https://api.themoviedb.org/3/discover/movie?api_key={API_KEY}'

for year in range(1900,1902):
    response = requests.get(f'{url}&primary_release_year={year}')
    movies = response.json()
    pages = movies['total_pages']

    #tmdb api only allows up to 500 pages maximum
    if pages > 501:
        total_pages[year] = 500
    elif pages == 0:
        pass
    else:
        total_pages[year] = pages

    #delay next API request
    time.sleep(5)

#dictionary to contain year and tmdb ids
tmdb_ids = {}

#collect top most popular tmdb ids per year
for year, pages in total_pages.items():
    #create empty list
    tmdb_ids[year] = []

    for page in range(1,pages+1):
        response = requests.get(f'{url}&primary_release_year={year}&page={page}')
        movies = response.json()
        ids = [movie['id'] for movie in movies['results']]
        tmdb_ids[year].extend(ids)

        #delay next page for 5 seconds
        time.sleep(5)

In [11]:
from pathlib import Path

datasets = ['title.basics.tsv.gz',
            # 'title.principals.tsv.gz',
            # 'title.crew.tsv.gz',
            'title.ratings.tsv.gz']

count = 0
for dataset in datasets: 
    count += 1
    filename = dataset.replace('.tsv.gz','')
    path = Path(f"imdb/{filename}_part{count:02}.csv")
    print(path)

imdb/title.basics_part01.csv
imdb/title.ratings_part02.csv
