# Build out scraper of Jeopardy archive
Using data from https://j-archive.com, scrape all games in order to examine difficulty.

In [104]:
import pandas as pd
import requests as re
from bs4 import BeautifulSoup
import time

### Find full URL List
Scrape every season page of J! Archive to get the full list of individual game links.

In [None]:
# Download the season index page and parse it into a BeautifulSoup tree
season_index_url = "https://j-archive.com/listseasons.php"
raw_html = re.get(season_index_url).content
soup_doc = BeautifulSoup(raw_html, "html.parser")

In [None]:
# Walk every table cell of the season index and keep the href of its first link
season_links = []
for cell in soup_doc.find_all("td"):
    anchor = cell.find("a") if cell else None
    if anchor:
        season_links.append(anchor["href"])


In [None]:
# Keep the most recent seasons by dropping the last 19 entries of the list
# (NOTE(review): presumably the index lists newest seasons first -- confirm)
TRAILING_SEASONS_TO_DROP = 19
season_links = season_links[:-TRAILING_SEASONS_TO_DROP]

In [None]:
# Visit each season page and collect the href of the first link in every table cell
game_links = []
for link in season_links:
    season_url = "https://j-archive.com/" + link
    raw_html = re.get(season_url).content
    soup_doc = BeautifulSoup(raw_html, "html.parser")
    cell_anchors = (cell.find("a") for cell in soup_doc.find_all("td"))
    game_links.extend(anchor["href"] for anchor in cell_anchors if anchor)

In [None]:
# Persist the collected game links to disk as a single-column CSV
game_links_frame = pd.DataFrame(game_links)
game_links_frame.to_csv("game_links.csv", index=False)

In [112]:
# Reload the saved links; the first (only) column is returned as a plain list
saved_links = pd.read_csv("game_links.csv")
game_ids = saved_links.iloc[:, 0].tolist()

In [None]:
# Convert game link hrefs ("showgame.php?game_id=9155") to bare game ids ("9155").
# Work from the values already loaded into game_ids from game_links.csv rather
# than the in-memory game_links list, so this cell also runs in a fresh session
# without re-crawling the season pages.
game_ids = [item.replace("showgame.php?game_id=", "") for item in game_ids]
game_ids[1:5]

['9155', '9154', '9153', '9152']

# Create Scraper
Create the lists that hold the scraped results, and define a function that scrapes one game by its ID and appends what it finds to those lists; the lists are converted to DataFrames afterwards.

In [None]:
# Empty accumulators filled by the scraper: per-clue rows, Final Jeopardy
# summaries, and ids of games that failed to parse
jep_round_clues, final_jep_list, problem_games = [], [], []

In [None]:
def jep_scraper(game_id_num, timeout=30):
    """Scrape one J! Archive game page and record its clues and Final Jeopardy tallies.

    Appends one dict per clue of the Jeopardy and Double Jeopardy rounds to the
    module-level ``jep_round_clues`` list and one summary dict for the Final
    Jeopardy round to ``final_jep_list``. Network and parsing errors are caught,
    reported with ``print`` and the game id is recorded (once) in
    ``problem_games``. Clues appended before a later failure stay in
    ``jep_round_clues``.

    Args:
        game_id_num: Value of the ``game_id`` query parameter of the game page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        None. Results are accumulated in the module-level lists.
    """
    game_id = game_id_num
    try:
        my_url = "https://j-archive.com/showgame.php?game_id=" + str(game_id_num)

        # A timeout keeps one stalled connection from hanging the whole scrape.
        raw_html = re.get(my_url, timeout=timeout).content
        soup_doc = BeautifulSoup(raw_html, "html.parser")

        game_date = soup_doc.find("h1").text  # page heading, stored as the game date
        game_comments = soup_doc.find(id="game_comments").text  # tournament, streak, etc

        # coryat list is based on value of clues -- the combined coryat is read
        # from the 7th <h3> on the page
        combined_coryat = int(
            soup_doc.find_all("h3")[6].text.replace("Combined Coryat: $", "").replace(",", "")
        )

        # per-contestant coryat scores come from the last table on the page:
        # positive scores first, then negative ones
        coryat_scores = soup_doc.find_all("table")[-1]
        coryat_list = []
        for score_class in ("score_positive", "score_negative"):
            for coryat in coryat_scores.find_all(class_=score_class):
                coryat_list.append(int(coryat.text.replace("$", "").replace(',', '')))

        # raises ZeroDivisionError (handled by the outer except) if no scores were found
        coryat_avg = sum(coryat_list) / len(coryat_list)

        # get clues for each round
        for jep_round in [soup_doc.find(id="jeopardy_round"), soup_doc.find(id="double_jeopardy_round")]:
            for clue in jep_round.find_all(class_="clue"):
                clue_dict = {}
                try:
                    clue_dict['round'] = jep_round.find("h2").text  # jeopardy or double jeopardy
                    clue_dict['game'] = game_id
                    clue_dict['game_comments'] = game_comments
                    clue_dict['game_date'] = game_date
                    clue_dict['coryat_combined'] = combined_coryat
                    clue_dict['coryat_list'] = coryat_list
                    clue_dict['coryat_avg'] = coryat_avg

                    # some clues dont have text since they are videos; find()
                    # then returns None and .text raises AttributeError
                    try:
                        clue_dict['clue_text'] = clue.find(class_="clue_text").text
                        clue_dict['correct_response'] = clue.find(class_="correct_response").text
                    except AttributeError:
                        print("----")
                        print("no clue text")
                        print(game_id)
                        print("----")

                    # initialize values for responses
                    clue_dict['wrong_count'] = 0
                    clue_dict['triple_stump'] = False
                    clue_dict['correct'] = False
                    clue_dict['value'] = ""
                    clue_dict['dd'] = False

                    # if values are present, change the clue_dict
                    value_cell = clue.find(class_="clue_value")
                    if value_cell:
                        clue_dict['value'] = int(value_cell.text.replace("$", "").replace(',', ''))
                    else:
                        dd_cell = clue.find(class_="clue_value_daily_double")
                        if dd_cell:
                            clue_dict['value'] = int(dd_cell.text.replace("DD: $", "").replace(',', ''))
                            clue_dict['dd'] = True

                    # a "wrong" cell is either the Triple Stumper marker or one wrong response
                    for wrong in clue.find_all(class_="wrong"):
                        if wrong.text == "Triple Stumper":
                            clue_dict['triple_stump'] = True
                        else:
                            clue_dict['wrong_count'] += 1

                    # any "right" cell means the clue was answered correctly
                    if clue.find_all(class_="right"):
                        clue_dict['correct'] = True

                    jep_round_clues.append(clue_dict)

                except Exception:
                    # Exception (not a bare except) so Ctrl-C / kernel interrupt still stops the run
                    print("clue error!")
                    print(game_id)
                    print("-----")
                    if game_id not in problem_games:
                        problem_games.append(game_id)

        # get final jeopardy clues
        final_round = soup_doc.find(id="final_jeopardy_round")
        final_jep_dict = {}
        final_jep_dict['game'] = game_id
        final_jep_dict['game_comments'] = game_comments
        final_jep_dict['game_date'] = game_date
        final_jep_dict['wrong'] = len(final_round.find_all(class_="wrong"))
        final_jep_dict['right'] = len(final_round.find_all(class_="right"))

        final_jep_list.append(final_jep_dict)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / kernel interrupt still stops the run
        print("-----")
        print("oops!")
        print(game_id)
        print("-----")
        if game_id not in problem_games:
            problem_games.append(game_id)


In [None]:
# Start the full scrape from empty accumulators
jep_round_clues = list()
final_jep_list = list()
problem_games = list()

def save_to_csv():
    """Write the three module-level result lists to their CSV files.

    Each list is wrapped in a DataFrame and written without the index column,
    overwriting any existing file of the same name.
    """
    outputs = (
        (jep_round_clues, "jeopardy_clues_2.csv"),
        (final_jep_list, "final_jeopardy_2.csv"),
        (problem_games, "problem_games_2.csv"),
    )
    for records, filename in outputs:
        pd.DataFrame(records).to_csv(filename, index=False)


save_interval = 50  # Checkpoint to CSV after every 50 games

try:
    for i, game_id in enumerate(game_ids):
        jep_scraper(game_id)

        if (i + 1) % save_interval == 0:
            save_to_csv()  # Periodic checkpoint every save_interval games
            print(f"Saved progress after {i+1} games...")  # Print only at save points

        time.sleep(1)  # Pause one second between game requests
finally:
    # Final save; also runs when the loop is interrupted or fails part-way,
    # so games scraped since the last checkpoint are not lost
    save_to_csv()

----
no clue text
9138
----
----
no clue text
9138
----
----
no clue text
9138
----
----
no clue text
9138
----
-----
oops!
9104
-----
Saved progress after 50 games...
-----
oops!
9092
-----
-----
oops!
9085
-----
----
no clue text
9079
----
----
no clue text
9079
----
----
no clue text
9079
----
----
no clue text
9077
----
----
no clue text
9077
----
----
no clue text
9075
----
----
no clue text
9075
----
----
no clue text
9075
----
----
no clue text
9075
----
----
no clue text
9067
----
----
no clue text
9067
----
----
no clue text
9062
----
----
no clue text
9062
----
----
no clue text
9062
----
----
no clue text
9052
----
----
no clue text
9052
----
----
no clue text
9051
----
----
no clue text
9051
----
----
no clue text
9050
----
----
no clue text
9050
----
Saved progress after 100 games...
----
no clue text
9044
----
----
no clue text
9041
----
----
no clue text
9037
----
----
no clue text
9036
----
----
no clue text
9028
----
----
no clue text
9020
----
Saved progress after 150

In [92]:
# Turn the accumulated clue dicts into a DataFrame
jep_round_clues = pd.DataFrame(data=jep_round_clues)

In [93]:
# game_date is split on " - " and the second part is parsed as a date such as
# "Monday, January 1, 2024" (weekday, month name, day, year)
date_text = jep_round_clues["game_date"].str.split(" - ").str[1]
jep_round_clues['date'] = pd.to_datetime(date_text, format="%A, %B %d, %Y")

In [96]:
# Preview the first five rows of the clue table
jep_round_clues.head(n=5)

(240, 14)

In [213]:
# Export the clue table (without the index column)
clues_frame = pd.DataFrame(jep_round_clues)
clues_frame.to_csv("jep_round_clues.csv", index=False)

In [214]:
# Export the Final Jeopardy summaries (without the index column)
final_frame = pd.DataFrame(final_jep_list)
final_frame.to_csv("final_jep.csv", index=False)