# setup for data collection

### Web scraping episode metadata from IMDb

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException, TimeoutException, NoSuchElementException, StaleElementReferenceException
from datetime import datetime
import re
import time
import pandas as pd
import numpy as np

In [None]:
# Scraper configuration: start page, Chrome preferences, driver start-up and
# the XPath expressions used by the helper functions and the main loop.
url = 'https://www.imdb.com/title/tt0493378/episodes/?season=1'
# NOTE(review): num_seasons is not referenced anywhere in the visible code.
num_seasons = 5
chrome_options = webdriver.ChromeOptions()
prefs = {
    "download.prompt_for_download": False,  # Disable download prompt
    "download.directory_upgrade": True,
    # NOTE(review): the value True leaves this preference switched on; the
    # earlier comment said "Disable safe browsing" -- confirm which is intended.
    "safebrowsing.enabled": True,
}
chrome_options.add_experimental_option("prefs", prefs)

# Start Chrome and open the first page; the short sleep is a fixed pause
# after the navigation call.
driver = webdriver.Chrome(options=chrome_options)
driver.get(url)
time.sleep(1)

# XPaths
# articles_xpath is an absolute path evaluated against the whole page; the
# expressions starting with './/' are evaluated relative to one article
# element. next_season_button_xpath looks the button up by its id.
articles_xpath = '//*[@id="__next"]/main/div/section/div/section/div/div[1]/section[2]/section[2]/article'
episode_text_xpath = './/*[@class="ipc-title__text"]'
episode_date_xpath = './/span'
dropdown_button_xpath = './/div/span[1]/button'
next_season_button_xpath = '//*[@id="next-season-btn"]'

In [None]:
def try_to_click_dropdown(article, path=dropdown_button_xpath):
    """Make a single attempt to click the dropdown button inside ``article``.

    Args:
        article: Element exposing ``find_element``; the button is looked up
            relative to it.
        path: XPath of the button, evaluated against ``article``.

    Returns:
        ``True`` when the click went through, and also when the button was
        not found or the lookup timed out (nothing left to retry).
        ``False`` when the click was intercepted or the element reference
        went stale, which tells the caller to try again.
    """
    # Fixed pause before the button is looked up.
    time.sleep(2)
    try:
        article.find_element(By.XPATH, path).click()
    except NoSuchElementException:
        print('No dropdown on this page.')
        outcome = True
    except TimeoutException:
        print('No dropdown timeout')
        outcome = True
    except ElementClickInterceptedException:
        print('Intercepted exception')
        outcome = False
    except StaleElementReferenceException:
        print('Stale element reference, re-trying...')
        outcome = False
    else:
        outcome = True
    return outcome

def try_to_click_next_page(path=next_season_button_xpath):
    """Make a single attempt to click the next-season button.

    The button is located through the module-level ``driver`` and scrolled
    into view before the click. A failed lookup is not handled here, so the
    exception raised by ``find_element`` propagates to the caller.

    Args:
        path: XPath of the button, evaluated against the whole page.

    Returns:
        ``True`` if the click went through; ``False`` if it was intercepted
        or the element reference went stale (the caller may retry).
    """
    target = driver.find_element(By.XPATH, path)
    driver.execute_script("arguments[0].scrollIntoView(true);", target)

    # Fixed pause between scrolling and clicking.
    time.sleep(2)
    try:
        target.click()
    except ElementClickInterceptedException:
        print('Element intercept')
        return False
    except StaleElementReferenceException:
        print('Stale element reference, re-trying...')
        return False
    return True

def repeated_clicks(click_function, button, *args, max_attempts=20, delay=1):
    """Call ``click_function(*args)`` repeatedly until it reports success.

    Args:
        click_function: Callable returning a truthy value on success and a
            falsy value when the attempt should be repeated.
        button: Label for the button; used only in the progress messages.
        *args: Positional arguments forwarded unchanged to ``click_function``
            on every attempt.
        max_attempts: Maximum number of failed attempts tolerated before
            giving up. Pass ``None`` to retry without limit.
        delay: Seconds to sleep before each attempt.

    Raises:
        RuntimeError: If ``click_function`` has failed ``max_attempts`` times.
            Without this bound a callback that can never succeed (for
            example one that keeps receiving the same stale element) would
            loop forever.
    """
    failures = 0
    while True:
        time.sleep(delay)
        if click_function(*args):
            print(f'Button press result successful for {button} button.')
            break
        failures += 1
        print(f'Attempt {failures} unsuccessful for {button} button.')
        if max_attempts is not None and failures >= max_attempts:
            raise RuntimeError(
                f'Giving up on {button} button after {failures} unsuccessful attempts.'
            )
    print('\n')
    return

def extract_episode_date(article, date_format:str='%a, %b %d, %Y') -> dict:
    """Read an episode's air date from its article element.

    Args:
        article: Element exposing ``find_element`` (not a string, as the
            previous annotation claimed). The text of the first node under
            it that matches ``episode_date_xpath`` is parsed.
        date_format: ``datetime.strptime`` pattern the text must match.

    Returns:
        A one-entry dict ``{'air_date': <datetime>}``.

    Raises:
        ValueError: If the text does not match ``date_format``.

    Any exception raised by ``find_element`` when no node matches is not
    handled here and propagates to the caller.
    """
    date_string = article.find_element(By.XPATH, episode_date_xpath).text
    return {'air_date': datetime.strptime(date_string, date_format)}

def extract_episode_title(article) -> dict:
    """Split an episode heading into season number, episode number and title.

    The heading text is split on '∙' into a season/episode part and a title;
    the season/episode part is split on '.' and only the digits of each half
    are kept. For example, a heading shaped like ``'S1.E2 ∙ Some title'``
    yields ``{'season': '1', 'episode': '2', 'title': 'Some title'}``
    (presumably the format the page uses -- confirm against the live site).

    Args:
        article: Element exposing ``find_element`` (not a string, as the
            previous annotation claimed). The text of the first node under
            it that matches ``episode_text_xpath`` is parsed.

    Returns:
        Dict with keys ``'season'``, ``'episode'`` and ``'title'``; the two
        numbers are returned as digit strings, not ints.

    Raises:
        ValueError: If the heading has no '∙' separator, or the
            season/episode part does not split into exactly two pieces
            on '.'.
    """
    title_string = article.find_element(By.XPATH, episode_text_xpath).text
    # maxsplit=1 keeps a title that itself contains '∙' in one piece instead
    # of breaking the two-way unpacking.
    season_episode, title = [part.strip() for part in title_string.split('∙', 1)]
    season, episode = [
        ''.join(re.findall(r'\d', part)) for part in season_episode.split('.')
    ]
    return {'season': season, 'episode': episode, 'title': title}

def extract_episode_metadata(article) -> dict:
    """Collect the title fields and the air date of one episode article.

    Args:
        article: Element passed on to ``extract_episode_title`` and
            ``extract_episode_date``.

    Returns:
        A single dict holding the title fields first, followed by the
        air-date entry.
    """
    metadata = dict(extract_episode_title(article))
    metadata.update(extract_episode_date(article))
    return metadata

#### Web scraping the data

In [None]:

# Main loop
# Walks the season pages one after another, expanding the episode list on
# each page and collecting one metadata dict per episode article.
data = []
i = 1

# try/finally guarantees the browser is closed even if scraping fails
# part-way through.
try:
    while True:
        time.sleep(2)
        print(f'Scraping data for season {i}...')

        # Check dropdown
        print('Checking if dropdown needs to be clicked...')
        initial_display = driver.find_elements(By.XPATH, articles_xpath)
        if not initial_display:
            # Fail with a clear message instead of an IndexError on [-1].
            raise RuntimeError(
                f'No episode articles found for season {i}; check articles_xpath.'
            )

        # Scroll to the last article currently displayed
        time.sleep(3)
        driver.execute_script("arguments[0].scrollIntoView(true);", initial_display[-1])
        time.sleep(2)

        # Re-locate the last article element to avoid stale reference
        last_article = driver.find_elements(By.XPATH, articles_xpath)[-1]
        repeated_clicks(try_to_click_dropdown, 'dropdown', last_article)

        # Collect the episode metadata from every article on the page
        time.sleep(5)
        articles = driver.find_elements(By.XPATH, articles_xpath)
        data.extend(extract_episode_metadata(article) for article in articles)

        # Look for the next-season button only AFTER scraping the current
        # page. Checking it first would end the loop on the page that has no
        # such button without ever scraping that page.
        if not driver.find_elements(By.XPATH, next_season_button_xpath):
            print('All seasons completed, terminating loop.')
            break

        # Go to next page
        repeated_clicks(try_to_click_next_page, 'next page')
        i += 1
finally:
    driver.quit()

episode_metadata = pd.DataFrame(data)

In [None]:
# Preview the first rows of the scraped episode metadata.
episode_metadata.head()

### setting up the data collection board

In [None]:
## setting up the data collection board
# Number of turns in each round, listed for rounds 1 through 11; the two
# row-level lists below are expanded from it.
_turns_per_round = [6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1]
rounds = [
    round_number
    for round_number, n_turns in enumerate(_turns_per_round, start=1)
    for _ in range(n_turns)
]
round_turns = [
    turn
    for n_turns in _turns_per_round
    for turn in range(1, n_turns + 1)
]

# One row per (round, turn); every data-entry column starts out as an empty
# string.
base_board = pd.DataFrame({'round': rounds, 'round_turn': round_turns}).assign(
    **dict.fromkeys(
        ['case', 'value', 'offer', 'game_ended', 'original_case', 'winnings'],
        '',
    )
)

# Pre-fill the flags: rounds 10 and later are marked as game-ended, and the
# first turn of round 10 is marked in the original_case column.
base_board.loc[base_board['round'].ge(10), 'game_ended'] = '1'
base_board.loc[
    base_board['round'].eq(10) & base_board['round_turn'].eq(1),
    'original_case',
] = '1'

In [None]:
# Stack one blank board per scraped episode and number the boards from 1 in
# a leading 'ID' column (each ID repeated once per board row).
total_games = len(episode_metadata)
final_board = pd.concat([base_board.copy()] * total_games)
final_board.insert(
    0,
    'ID',
    np.repeat(np.arange(1, total_games + 1), len(base_board)),
)

In [None]:
# Write both tables to CSV files in the current working directory, leaving
# out the DataFrame index.
final_board.to_csv('data_entry_template.csv', index=False)
episode_metadata.to_csv('dond_episode_metadata.csv', index=False)