In [1]:
import pandas as pd

jeopardy = pd.read_csv("jeopardy.csv")

jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [2]:
print(jeopardy.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [3]:
# Remove space in the column names
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

In [4]:
# Write a function to normalize the text columns
def normalize_str(a_string):
    '''
    a_string is a string
    '''
    import re
    a_string = re.sub("[^A-Za-z0-9\s]", "", a_string)
    a_string = a_string.lower()
    return a_string

jeopardy["clean_question"] = jeopardy["Question"].apply(normalize_str)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize_str)

jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams


In [5]:
# Write a function to normalize dollar values
def normalize_dollor(a_string):
    '''
    a_string is a string
    '''
    import re
    a_string = re.sub("[^A-Za-z0-9\s]", "", a_string)
    try:
        a_string = int(a_string)
    except Exception:
        a_string = 0
    return a_string

jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_dollor)

# Convert the Air Date column to the date
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200


In [6]:
# Calculate the probability to have the answer deducible from the question
def deduce_count(a_series):
    '''
    a_series is a row of jeopardy
    '''
    split_answer = a_series["clean_answer"].split(" ")
    split_question = a_series["clean_question"].split(" ")
    
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    
    match_count = 0
    for word in split_answer:
        if word in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(deduce_count, axis=1)
print(jeopardy["answer_in_question"].mean())

0.06049325706933587


On average, only approximately 6% of the words in the answer can be found in the question.

In [7]:
# Check if new questions are repeated old ones
question_overlap = []
terms_used = set()

jeopardy = jeopardy.sort_values("Air Date", ascending = True)

for idx, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [word for word in split_question if len(word) > 5]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)

jeopardy["question_overlap"] = question_overlap

print(jeopardy["question_overlap"].mean())

0.6876260592169802


About 70% of the words in newer questions appeared before in older questions.

In [8]:
# Write a function to classify the values
def values_class(a_Series):
    '''
    a_Series is a row from a df
    '''
    if a_Series["clean_value"] > 800:
        value = 1
    else:
        value = 0
    return value
        
jeopardy["high_value"] = jeopardy.apply(values_class, axis=1)

jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question,question_overlap,high_value
19325,10,1984-09-21,Final Jeopardy!,U.S. PRESIDENTS,,"Adventurous 26th president, he was 1st to ride...",Theodore Roosevelt,adventurous 26th president he was 1st to ride ...,theodore roosevelt,0,0.0,0.0,0
19301,10,1984-09-21,Double Jeopardy!,LABOR UNIONS,$200,Notorious labor leader missing since '75,Jimmy Hoffa,notorious labor leader missing since 75,jimmy hoffa,200,0.0,0.0,0
19302,10,1984-09-21,Double Jeopardy!,1789,$200,"Washington proclaimed Nov. 26, 1789 this first...",Thanksgiving,washington proclaimed nov 26 1789 this first n...,thanksgiving,200,0.0,0.0,0
19303,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,$200,Both Ferde Grofe' & the Colorado River dug thi...,the Grand Canyon,both ferde grofe the colorado river dug this ...,the grand canyon,200,0.0,0.5,0
19304,10,1984-09-21,Double Jeopardy!,LITERATURE,$200,"Depending on the book, he could be a ""Jones"", ...",Tom,depending on the book he could be a jones a sa...,tom,200,0.0,0.0,0


In [9]:
# Write the function to count the appearance of difference word in high/low value questions
def word_count_value(word):
    low_count = 0
    high_count = 0
    for idx, row in jeopardy.iterrows():
        if word in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

observed_expected = []

comparison_terms = list(terms_used)[0:5]

for term in comparison_terms:
    observed_expected.append(word_count_value(term))

print(observed_expected)

[(2, 0), (1, 1), (2, 5), (1, 0), (1, 0)]


In [11]:
import numpy as np
from scipy.stats import chisquare
high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []

for observation in observed_expected:
    total = sum(observation)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    observed = np.array([observation[0], observation[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

print(chi_squared)

[Power_divergenceResult(statistic=4.97558423439135, pvalue=0.025707519787911092), Power_divergenceResult(statistic=0.4448774816612795, pvalue=0.5047776487545996), Power_divergenceResult(statistic=3.423170782846152e-05, pvalue=0.9953317740648371), Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047), Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047)]


There isn't any word showing a significant difference in the occurrence in between high and low value questions. However, these results are drawn based on a low number of samples, which are not so reliable.