# setup for data collection

### web scraping metadata from IMDB

In [8]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException, TimeoutException
from datetime import datetime
import re
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np

#### constants for web scraping

In [9]:
## where data are located
url = 'https://www.imdb.com/title/tt0493378/episodes/?season=1'
num_seasons = 5
## css location of buttons on page to select in imdb
dropdown_selector = '#__next > main > div > section > div > section > div > div.sc-a83bf66d-1.gYStnb.ipc-page-grid__item.ipc-page-grid__item--span-2 > section:nth-child(2) > section.sc-46954f9-0.hOJNkT > article:nth-child(51) > div > span.ipc-see-more.sc-b79d292e-0.eAzFWZ.single-page-see-more-button > button'
next_page_selector = '#next-season-btn'
episodes_selector = '#__next > main > div > section > div > section > div > div.sc-a83bf66d-1.gYStnb.ipc-page-grid__item.ipc-page-grid__item--span-2 > section:nth-child(2) > section.sc-46954f9-0.hOJNkT > article'
episode_title_selector = 'div > div > div.sc-9115db22-4.kyIRYf > div.sc-9115db22-5.ewoZOO > h4 > div > a > div'
episode_date_selector = 'div > div > div.sc-9115db22-4.kyIRYf > div.sc-9115db22-5.ewoZOO > span'

In [None]:
# ## original attempt at data collection
# soups = []
# base_url = 'https://www.imdb.com/title/tt0493378/episodes/?season='
# headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
# num_seasons=5
# for i in range(1, num_seasons+1):
#     curr_response = requests.get(f'{base_url}{i}', headers=headers)
#     if curr_response.status_code == 200:
#         soups.append(BeautifulSoup(curr_response.text, 'html.parser'))
#         print(f'Response for season {i} was successful')
#         time.sleep(np.random.randint(5, 10))
#     else:
#         print(f"Failed to retrieve the page. Status code: {curr_response.status_code}")
# print(len(responses))

#### helper functions to collect data and tidy it for analysis

In [10]:
def repeated_click_attempts(driver, click_type:str=None, css_path:str=None, i:int=None):
    stop_bool = False
    j = 1
    while not stop_bool:
        try:
            WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_path))).click()
            stop_bool = True
            print(f'successfully clicked {click_type} for season {i+1} with try {j}')
        except ElementClickInterceptedException:
            print(f'failed {click_type} for season {i+1} on try {j}')
            j += 1
            continue
        except TimeoutException:
            print(f'season {i+1} does not have {click_type}')
            stop_bool = True
    return

In [11]:
def result_set_to_string(result_set):
    return result_set[0].text

In [12]:
def extract_episode_title(title_string:str) -> dict:
    title_string = result_set_to_string(title_string)
    season_episode, title = [c.strip() for c in title_string.split('∙')]
    season, episode = [''.join(re.findall(r'\d', c)) for c in season_episode.split('.')]
    return {'season' : season, 'episode' : episode, 'title' : title}

In [13]:
def extract_episode_date(date_string:str, date_format:str='%a, %b %d, %Y') -> dict:
    date_string = result_set_to_string(date_string)
    return {'air_date' : datetime.strptime(date_string, date_format)}

In [14]:
def extract_episode_metadata(title_string:str, date_string:str) -> dict:
    return {**extract_episode_title(title_string), **extract_episode_date(date_string)}

In [15]:
def get_imdb_episode_data():
    soups = []
    driver = webdriver.Chrome()
    print('launching url')
    driver.get(url)
    print('attempting to scrape web page')
    for i in range(num_seasons):
        ## activating dropdown
        repeated_click_attempts(driver=driver, click_type='dropdown', css_path=dropdown_selector, i=i)
        print()
        ## scrape the page and store results
        print(f'appending soup for season {i+1}')
        time.sleep(5)
        soups.append(BeautifulSoup(driver.page_source, 'html.parser').select(episodes_selector))
        print()    
        repeated_click_attempts(driver=driver, click_type='next page', css_path=next_page_selector, i=i)
    print('done')
    driver.quit()
    return soups

### organizing and tidying the data

#### webscraping the data

In [16]:
## getting the raw data
soups = get_imdb_episode_data()
## organizing the raw data
episode_metadata = pd.DataFrame([
    extract_episode_metadata(episode.select(episode_title_selector), episode.select(episode_date_selector)) ## extract the metadata
    for episodes in soups ## for all seasons
    for episode in episodes ## for all episodes in each season
])
## viewing the data
print(episode_metadata)

Error sending stats to Plausible: error sending request for url (https://plausible.io/api/event)


launching url
attempting to scrape web page
season 1 does not have dropdown

appending soup for season 1

failed next page for season 1 on try 1
successfully clicked next page for season 1 with try 2
season 2 does not have dropdown

appending soup for season 2

failed next page for season 2 on try 1
successfully clicked next page for season 2 with try 2
season 3 does not have dropdown

appending soup for season 3

failed next page for season 3 on try 1
successfully clicked next page for season 3 with try 2
season 4 does not have dropdown

appending soup for season 4

failed next page for season 4 on try 1
successfully clicked next page for season 4 with try 2
season 5 does not have dropdown

appending soup for season 5

failed next page for season 5 on try 1
successfully clicked next page for season 5 with try 2
done
Empty DataFrame
Columns: []
Index: []


In [None]:
## setting up the data collection board
rounds = [1,1,1,1,1,1,2,2,2,2,2,3,3,3,3,4,4,4,5,5,6,7,8,9,10,11]
round_turns = [1,2,3,4,5,6,1,2,3,4,5,1,2,3,4,1,2,3,1,2,1,1,1,1,1,1]
base_board = pd.DataFrame({'round' : rounds, 'round_turn' : round_turns}).assign(
    case = '',
    value = '',
    offer = '',
    game_ended = '',
    original_case = '',
    winnings = '',
)
base_board.loc[base_board['round'] == 10, 'game_ended'] = '1'
base_board.loc[(base_board['round'] == 10) & (base_board['round_turn'] == 1), 'original_case'] = '1'

In [None]:
total_games = episode_metadata.shape[0]
final_board = pd.concat([base_board.copy() for _ in range(total_games)])
final_board.insert(0, 'ID', np.repeat(range(1, total_games + 1), base_board.shape[0]))

In [None]:
final_board.to_csv('data_entry_template.csv', index=False)
episode_metadata.to_csv('dond_episode_metadata.csv', index=False)