In [1]:
import pandas as pd

jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

# Exploring repeated questions

In [2]:
jeopardy[jeopardy.duplicated(subset = ' Question',keep=False)][' Question'].value_counts()

[audio clue]                                                                                                             5
Adam Levine                                                                                                              2
In Nicolai's opera "The Merry Wives of Windsor", this fat, funny rogue gets dumped into the river in a laundry basket    2
"His pride had cast him out from heaven, with all his host of rebel angels"                                              2
Poi, a luau treat, is made from these mashed roots                                                                       2
Common in Dixie, a razorback is a wild one of these                                                                      2
1967: "We rob banks"                                                                                                     2
These "fell great oaks"                                                                                                  2
Name:  Question,

In [3]:
jeopardy[' Question'].unique().shape

(19988,)

In [4]:
jeopardy[jeopardy.duplicated(subset = ' Question',keep=False)].sort_values(by = ' Question')

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
4358,3436,1999-07-12,Double Jeopardy!,NAME THE POET,$600,"""His pride had cast him out from heaven, with ...","John Milton (from ""Paradise Lost"")"
7141,5171,2007-02-19,Double Jeopardy!,NAME THE POET,$400,"""His pride had cast him out from heaven, with ...",Milton
2654,6125,2011-04-08,Double Jeopardy!,BEST MOVIE QUOTES EVER!,$400,"1967: ""We rob banks""",Bonnie and Clyde
18431,3520,1999-12-17,Double Jeopardy!,NAME THE MOVIE,$1000,"1967: ""We rob banks""",Bonnie and Clyde
15687,6260,2011-12-02,Double Jeopardy!,ROCK'S FRONTMEN & WOMEN,$1600,Adam Levine,Maroon 5
8508,5443,2008-04-16,Jeopardy!,ROCK & ROLL FRONTMEN,$1000,Adam Levine,Maroon 5
13382,3788,2001-02-07,Jeopardy!,MAMMALS,$300,"Common in Dixie, a razorback is a wild one of ...",boar (hog)
7686,5467,2008-05-20,Double Jeopardy!,MAMMALS,$800,"Common in Dixie, a razorback is a wild one of ...",hog
9169,3907,2001-09-04,Jeopardy!,FUN WITH OPERA,$500,"In Nicolai's opera ""The Merry Wives of Windsor...",Falstaff
15558,5468,2008-05-21,Double Jeopardy!,FUN WITH OPERA,$1200,"In Nicolai's opera ""The Merry Wives of Windsor...",Falstaff


In [5]:
%matplotlib inline  
jeopardy[' Category'].value_counts()

TELEVISION                             51
U.S. GEOGRAPHY                         50
LITERATURE                             45
HISTORY                                40
BEFORE & AFTER                         40
AMERICAN HISTORY                       40
AUTHORS                                39
WORD ORIGINS                           38
WORLD CAPITALS                         37
SPORTS                                 36
BODIES OF WATER                        36
SCIENCE                                35
MAGAZINES                              35
SCIENCE & NATURE                       35
RHYME TIME                             35
WORLD GEOGRAPHY                        33
ANNUAL EVENTS                          32
WORLD HISTORY                          32
HISTORIC NAMES                         32
FICTIONAL CHARACTERS                   31
BIRDS                                  31
IN THE DICTIONARY                      31
ISLANDS                                30
U.S. PRESIDENTS                   

In [6]:
jeopardy.columns = [name.strip() for name in jeopardy.columns]
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [7]:
import re

'''
 Regex Guide:  
     [^ ] means matching negation of contained elements 
     \s matches white space
'''
 
def normalize_text(text):
    text = text.lower()
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    return text

def normalize_values(text):
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    try:
        text = int(text)
    except Exception:
        text = 0
    return text

jeopardy["clean_question"] = jeopardy["Question"].apply(normalize_text)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize_text)
jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_values)

jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [8]:
def parse(series):
    split_answer = series['clean_answer'].split(' ')
    split_question = series['clean_question'].split(' ')
    
    
    if 'the' in split_answer:
        split_answer.remove('the')
        
    if len(split_answer) == 0:
        return 0
    
    match_count= 0
    
    for word in split_answer:
        if word in split_question:
            match_count+=1     
        
    return match_count/len(split_answer)



pd.set_option('max_colwidth', 150)

jeopardy['answer_in_question'] = jeopardy.apply(parse, axis=1)
#strategy1 = jeopardy[['clean_question','clean_answer','answer_in_question']][jeopardy['answer_in_question']>0]
#freq = strategy1['answer_in_question'].value_counts().sort_index(ascending=False)
jeopardy["answer_in_question"].mean()


0.060493257069335914

In [9]:
#freq

In [10]:
#import matplotlib.pyplot as plt
#%matplotlib inline 
#plt.bar(freq.index,freq.values,width=0.02)
#plt.title('Counts(y) of each % match(x) of answer in question')

From the mean, answer appears in question 6% of the time. To be specific, out of 19999 questions, only about 1700 of them have 50% of more of the answer appearing in the question. It is not a good strategy to guess answers from questions.

# Finding number of terms in question that match the term set
## 1st solution has higher count because term is added while looking through words in question. Repeat words in same question will be counted in 1st solution but not in 2nd


In [11]:
question_overlap= []
terms_used = set()
for index, row in jeopardy.iterrows():
    
    split_question = row['clean_question'].split(' ')
    split_question = [word for word in split_question if len(word)>=6]
    match_count=0
    for word in split_question:
        if word in terms_used:
            match_count+=1
        terms_used.add(word)
     
    if len(split_question) > 0:
            match_count /= len(split_question)
    question_overlap.append(match_count)

jeopardy['question_overlap'] =question_overlap
jeopardy["question_overlap"].mean()

        

0.6925960057338565

In [12]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        split_question = [q for q in split_question if len(q) > 5]
        match_count = 0
        for word in split_question:
            if word in terms_used:
                match_count += 1
        for word in split_question:
            terms_used.add(word)
        if len(split_question) > 0:
            match_count /= len(split_question)
        question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap

jeopardy["question_overlap"].mean()

0.6908737315671878

About 70% of the terms in newer questions have appeared in earlier questions. Studying previous questions is a good strategy. However this method only looks at single words. If there were high incidence of phrase or entire sentence matches, this strategy will be much be encouraged.

In [13]:
def determine_value(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value

jeopardy["high_value"] = jeopardy.apply(determine_value, axis=1)

jeopardy["high_value"] = jeopardy.apply(determine_value, axis=1)


def count_usage(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if term in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

comparison_terms = list(terms_used)[:5]
observed_expected = []
for term in comparison_terms:
    observed_expected.append(count_usage(term))

observed_expected,comparison_terms

([(1, 0), (0, 1), (1, 0), (0, 1), (1, 0)],
 ['kandinsky', 'jacquelyn', 'monmouth', 'monkeyfaced', 'carloman'])

# Chi-Squared result


In [14]:

from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047)]

None of the 5 terms have pvalue below 0.05 so there are no satistically significant difference in usage between high value and low value rows. The frequencies were all small, making the chi squared test less valid. This test is more informative with higher frequencies