# Set Up and Clean Data Set

In [21]:
import pandas as pd

jeopardy = pd.read_csv('jeopardy.csv')
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [22]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [23]:
# remove spaces in columns

jeopardy.columns = jeopardy.columns.str.replace(' ','')    
jeopardy.columns

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [36]:
# use regular expressions to clean up questions and answers

import re

def normalize(s):
    s = str(s)
    s = s.lower()
    s = re.sub("[^A-Za-z0-9\s]", "", s)
    return s

In [37]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize)
jeopardy['clean_question'].head()

0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object

In [38]:
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize)
jeopardy['clean_answer'].head()

0    copernicus
1    jim thorpe
2       arizona
3     mcdonalds
4    john adams
Name: clean_answer, dtype: object

In [39]:
# normalize the values column

def norm_values(s):
    s = normalize(s)
    try:
        i = int(s)
    except Exception:
        i = 0
    return i

In [40]:
jeopardy['clean_value'] = jeopardy['Value'].apply(norm_values)
jeopardy['clean_value'].head()

0    200
1    200
2    200
3    200
4    200
Name: clean_value, dtype: int64

In [41]:
# fix the Airdate column

jeopardy['AirDate'] = pd.to_datetime(jeopardy['AirDate'])
jeopardy['AirDate'].head()

0   2004-12-31
1   2004-12-31
2   2004-12-31
3   2004-12-31
4   2004-12-31
Name: AirDate, dtype: datetime64[ns]

# Data Investigation

This section attempts to figure out two phenomena:

- How often the answer is deducible from the question.
- How often new questions are repeats of older questions.
- Whether a word is likely to appear in a "high" or "low" value question by chance.


In [48]:
# define a function that counts the percentage of the words in then answer (not "the") that also appears in the question.

def count_matches(row):
    split_answer = row['clean_answer'].split(' ')
    if 'the' in split_answer: 
        split_answer.remove('the')
    if len(split_answer) == 0: return 0
    
    split_question = row['clean_question'].split(' ')
    
    match_count = 0
    
    for w in split_answer:
        if w in split_question: match_count += 1
            
    return match_count / len(split_answer)
    

In [49]:
answer_in_question = jeopardy.apply(count_matches, axis=1)
answer_in_question.mean()

0.059357587183968614

The answer only appears in the question about 6% of the time.  This indicates that, sadly, we probably can't just hope that we can parrot back words from the question in our answer.

In [51]:
# develop a procedure that measures the overlap (of recurring words only, questions more than 6 words) between questions and previously-asked ones.

question_overlap = []
terms_used = set()

jeopardy = jeopardy.sort_values('AirDate')

for i, r in jeopardy.iterrows():
    split_question = r['clean_question'].split(' ')
    split_question = [w for w in split_question if not len(w) < 6]
    
    match_count = 0
    
    for w in split_question:
        if w in terms_used: match_count += 1
        terms_used.add(w)
        
    if len(split_question) > 0: match_count = match_count / len(split_question)
    
    question_overlap.append(match_count)
    
jeopardy['question_overlap'] = question_overlap

jeopardy['question_overlap']

84523     0.000000
84548     0.000000
84547     0.000000
84546     0.000000
84545     0.000000
84544     0.000000
84543     0.000000
84542     0.000000
84541     0.000000
84562     0.000000
84540     0.000000
84553     0.000000
84554     0.000000
84555     0.000000
84556     0.000000
84557     0.000000
84558     0.000000
84559     0.000000
84560     0.000000
84539     0.000000
84551     0.000000
84550     0.000000
84549     0.250000
84561     1.000000
84552     0.000000
84524     0.000000
84563     0.333333
84565     0.500000
84566     0.000000
84567     0.000000
            ...   
105939    1.000000
105937    0.800000
105936    1.000000
105935    1.000000
105934    1.000000
105933    0.833333
105946    1.000000
105945    0.875000
105947    1.000000
105948    1.000000
105949    0.500000
105940    1.000000
105944    1.000000
105941    1.000000
105951    0.777778
105929    0.800000
105943    1.000000
105950    1.000000
105938    1.000000
105952    1.000000
105954    0.857143
105955    1.

In [52]:
jeopardy['question_overlap'].mean()

0.8726962925837469

There is about 70% overlap between terms in new questions and terms in old questions. This only looks at a small set of questions, and it doesn't look at phrases, it looks at single terms. This makes it relatively insignificant, but it does mean that it's worth looking more into the recycling of questions.

In [54]:
# make a dummy for whether a question is "high value" (ie worth $800 or more)

def value(r):
    v = 1 if r['clean_value'] > 800 else 0
    return v

jeopardy['high_value'] = jeopardy.apply(value, axis=1)        

In [57]:
# make a function that counts the number of times a word appears in a high value and low value question. 
# For the first five terms of the 'terms_used' variable, assign the high/low value counts to a list.


def count_usage(w):
    low_count = high_count = 0
    
    for i, r in jeopardy.iterrows():
        split_question = r['clean_question'].split(" ")
        if w in split_question:
            if r['high_value'] == 1: high_count += 1
            else: low_count += 1
                
    return high_count, low_count

observed_expected = []
comparison_terms = list(terms_used)[:5] # note that using the first five words is entirely arbitrary!

for t in comparison_terms:
    observed_expected.append(count_usage(t))
    
observed_expected

[(1, 2), (5, 10), (0, 1), (0, 2), (0, 1)]

In [58]:
# Run a Chi-square test on each of the five words selected.
# The Ho is that a word's value is indistinguishable from chance.

from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy['high_value'].sum()
low_value_count = len(jeopardy['high_value']) - high_value_count

chi_squared = []

for l in observed_expected:
    total = sum(l)
    total_prop = total / len(jeopardy)
    
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([l[0], l[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))
    
chi_squared

[Power_divergenceResult(statistic=0.03723409388907139, pvalue=0.846989214486915),
 Power_divergenceResult(statistic=0.18617046944535684, pvalue=0.6661232003372894),
 Power_divergenceResult(statistic=0.3949764642333513, pvalue=0.5296950912486695),
 Power_divergenceResult(statistic=0.7899529284667026, pvalue=0.3741143592744989),
 Power_divergenceResult(statistic=0.3949764642333513, pvalue=0.5296950912486695)]

None of the terms had a significant difference (p < 0.05) in usage between high value and low value rows. This indicates that whether a word is high or low value is not indistinguishable from chance.

Note, however, that the frequencies were lower than 5, which indicates that the chi-squared test isn't valid. It would be better to run this test with only terms that have higher frequencies.