Step 1: Reading in the Dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
jeopardy = pd.read_csv("jeopardy.csv")
print(jeopardy.head(5))
print(jeopardy.columns)

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  
Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value', ' Question', ' Answer'], dtype='object')


In [5]:
# Remove the space in the column names
jeopardy.columns = [i.replace(" ","") for i in jeopardy.columns.tolist()]
jeopardy.columns

# Alternate
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 
                    'Question', 'Answer']

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question', 'Answer'], dtype='object')

In [7]:
import re
def normalise(str_in):
    str_in = str_in.lower()
    str_in = re.sub("[^A-Za-z0-9\s]", "", str_in)
    return(str_in)

def normalise_values(val_in):
    val_in = re.sub("[^A-Za-z0-9\s]", "", val_in)
    try:
        val_in = int(val_in)
    except Exception:
        val_in = 0
    return(val_in)

In [10]:
jeopardy["clean_question"] = jeopardy["Question"].apply(normalise)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalise)
jeopardy["clean_value"] = jeopardy["Value"].apply(normalise_values)
jeopardy["Air Date"] = pd.to_datetime(jeopardy["AirDate"])
jeopardy.head(3)

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,Air Date
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200,2004-12-31
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200,2004-12-31
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200,2004-12-31


In [11]:
def count_matches(rows):
    split_answer = rows["clean_answer"].split(" ")
    split_question = rows["clean_question"].split(" ")
    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return(0)
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return(match_count/len(split_answer))

jeopardy["answer_in_question"] = jeopardy.apply(count_matches, axis = 1)
mean_ansinq = jeopardy["answer_in_question"].mean()

In [12]:
mean_ansinq

0.060493257069335872

Answer terms in the question:
The answer only appears in the question about 6% of the time. This isn't a huge number, and means that we probably can't just hope that hearing a question will enable us to figure out the answer. We'll probably have to study.

Indentifying Recycled questions:

In [15]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [i for i in split_question if len(i) >= 6]
    match_count = 0
    for item in split_question:
        if item in terms_used:
            match_count += 1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap
jeopardy["question_overlap"].mean()

0.69087373156719623

Question overlap
There is about 70% overlap between terms in new questions and terms in old questions. This only looks at a small set of questions, and it doesn't look at phrases, it looks at single terms. This makes it relatively insignificant, but it does mean that it's worth looking more into the recycling of questions.

Low value vs high value questions:

In [17]:
def determine_value(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value

jeopardy["high_value"] = jeopardy.apply(determine_value, axis = 1)

def count_usage(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if term in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

observed_expected = []
comparison_terms = list(terms_used)[:5]
for item in comparison_terms:
    observed_expected.append(count_usage(item))

observed_expected

[(19, 25), (0, 1), (3, 0), (0, 4), (0, 1)]

Applying chi-square test

In [19]:
from scipy.stats import chisquare

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]
chi_squared = []

for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    print(observed)
    print(expected)
    chi_squared.append(chisquare(observed, expected))

chi_squared

[19 25]
[ 12.61543077  31.38456923]
[0 1]
[ 0.28671434  0.71328566]
[3 0]
[ 0.86014301  2.13985699]
[0 4]
[ 1.14685734  2.85314266]
[0 1]
[ 0.28671434  0.71328566]


[(4.5299939349304807, 0.033305735763283911),
 (0.40196284612688399, 0.52607729857054686),
 (7.4633763515870246, 0.0062966796687489992),
 (1.607851384507536, 0.20479409439225948),
 (0.40196284612688399, 0.52607729857054686)]