In [1]:
import pandas as pd
jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [2]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [5]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value',
       'Question', 'Answer']
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [21]:
import re

def qna_normalize(s):
    s = s.lower()
    s = re.sub("[^A-Za-z0-9\s]","",s)
    return s

def value_normalize(v):
    v = re.sub("[^A-Za-z0-9\s]","",v)
    try:
        v = int(v)
    except Exception:
        v = 0
    return v      
    


In [22]:
jeopardy["clean_question"] = jeopardy["Question"].apply(qna_normalize)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(qna_normalize)
jeopardy["clean_value"] = jeopardy["Value"].apply(value_normalize)
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])
jeopardy.head(2)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200


In [31]:
def count_matches(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    for i in split_answer:
        if i in split_question:
            match_count += 1
    return match_count / len(split_answer)

In [32]:
answer_in_question = jeopardy.apply(count_matches, axis=1)
answer_in_question.mean()

0.060493257069335872

## Answer in Question
Above we show that on average 6% of an answer can be decuded from the question. This means simply relying on deducing the answer from the question will not work as a strategy and actual studying is necessary.

In [47]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [w for w in split_question if len(w) > 5]
    match_count = 0 
    for word in split_question:
        if word in terms_used:
            match_count += 1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap
jeopardy["question_overlap"].mean()

0.69087373156719623

## Question Overlap
About 70% of the words seen in the jeopardy questions are reused from  previous jeopardy questions. This might indicated that studying previous jeopardy questions is a good strategy. It might also be the case that a lot of the words used are fairly generic.

In [51]:
def question_value(row):
    if row["clean_value"] > 800:
        value = 1
    else:
        value = 0
    return value    
jeopardy["high_value"] = jeopardy.apply(question_value, axis=1)

In [54]:
def value_counts(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if word in row["clean_question"].split(" "):
            if row["high_value"]:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [56]:
observed_expected = []
comparison_terms = list(terms_used)[:5]
for term in comparison_terms:
    observed_expected.append(value_counts(term))
    
observed_expected

[(2, 0), (0, 2), (1, 0), (0, 1), (2, 2)]

In [64]:
import numpy as np
from scipy.stats import chisquare

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for obs in observed_expected:
    total = obs[0] + obs[1]
    total_prop = total / jeopardy.shape[0]
    exp_high = high_value_count * total_prop
    exp_low = low_value_count * total_prop
    
    observed = np.array([obs[0],obs[1]])
    expected = np.array([exp_high,exp_low])
    chi_squared.append(chisquare(observed,expected))
    
chi_squared
                    

[Power_divergenceResult(statistic=4.9755842343913503, pvalue=0.025707519787911092),
 Power_divergenceResult(statistic=0.80392569225376798, pvalue=0.36992223780795708),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.88975496332255899, pvalue=0.34554371914834681)]

In [67]:
comparison_terms[2]

'digests'

##  High-value term Tests
None of the 5 terms considered had a p-value below .05 when applying the chi-squared test. However, any results we would have seen would be taken with a grain of salt since the number of observances of these words was very small.

# Further Analysis
Perform the chi-squared test across more terms to see what terms have larger differences. This is hard to do currently because the code is slow, but here are some ideas:

       Use the apply method to make the code that calculates frequencies more efficient. 
       Only select terms that have high frequencies across the dataset, and ignore the others.

In [None]:
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if word in row["clean_question"].split(" "):
            if row["high_value"]:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count