In [1]:
import pandas as pd
import numpy as np
jeopardy = pd.read_csv("jeopardy.csv")
print(jeopardy.head(5))
print(jeopardy.columns)

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  
Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype

In [2]:
renamed_columns = {' Air Date':'Air Date', ' Round':'Round', ' Category':'Category', ' Value':'Value', ' Question':'Question', ' Answer':'Answer'}
jeopardy = jeopardy.rename(columns=renamed_columns)
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [3]:
jeopardy.dtypes

Show Number     int64
Air Date       object
Round          object
Category       object
Value          object
Question       object
Answer         object
dtype: object

In [4]:
def normalize(s):
    import string
    s = s.lower()
    s = "".join([c for c in s if c not in string.punctuation])
    return s

jeopardy["clean_question"] = jeopardy["Question"].apply(normalize)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize)

In [5]:
def normalize_dollars(s):
    import string
    s = "".join([c for c in s if c not in string.punctuation])
    try:
        i = int(s)
    except Exception:
        i = 0
    return i

jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_dollars)

In [6]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [7]:
test_series = jeopardy.iloc[0:2]
print(test_series)

   Show Number   Air Date      Round                         Category Value  \
0         4680 2004-12-31  Jeopardy!                          HISTORY  $200   
1         4680 2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES  $200   

                                            Question      Answer  \
0  For the last 8 years of his life, Galileo was ...  Copernicus   
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe   

                                      clean_question clean_answer  clean_value  
0  for the last 8 years of his life galileo was u...   copernicus          200  
1  no 2 1912 olympian football star at carlisle i...   jim thorpe          200  


In [8]:
def answer_in_question(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    for a in split_answer:
        if a in split_question:
            match_count += 1
    return match_count/len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(answer_in_question, axis=1)

In [9]:
mean_answer_in_question = jeopardy["answer_in_question"].mean()
mean_answer_in_question

0.060352773854698942

On average, the answer contains 6% of the question.  This probability is too low to be significant in the studying strategy.

In [12]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    for q in split_question:
        if q in terms_used:
            match_count += 1
        terms_used.add(q)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap

In [13]:
print(jeopardy["question_overlap"].mean())

0.69195779922


About 69% of the words in the questions overlap.  This seems like a pretty significant amount.

In [14]:
def high_value(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value
jeopardy["high_value"] = jeopardy.apply(high_value, axis=1)

In [15]:
def high_count(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        clean_question = row["clean_question"].split(" ")
        if word in clean_question:
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [16]:
observed_expected = []
terms_used = list(terms_used)
comparison_terms = terms_used[:5]
for term in comparison_terms:
    counts = high_count(term)
    observed_expected.append(counts)

In [17]:
observed_expected

[(0, 1), (9, 25), (0, 2), (0, 1), (0, 1)]

In [22]:
from scipy.stats import chisquare
import numpy as np
high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]
chi_squared = []
for i in observed_expected:
    total = sum(i)
    total_prop = total/jeopardy.shape[0]
    high_expected_count = total_prop * high_value_count
    low_expected_count = total_prop * low_value_count
    
    observed = np.array([i[0], i[1]])
    expected = np.array([high_expected_count, low_expected_count])
    chi_squared.append(chisquare(observed, expected))

In [23]:
chi_squared

[Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.080527656533886979, pvalue=0.77658361661135578),
 Power_divergenceResult(statistic=0.80392569225376798, pvalue=0.36992223780795708),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686)]

None of the results have a p value smaller than 0.05, so there is nothing statistically significant to conclude.