In [5]:
import pandas as pd

jeopardy = pd.read_csv('jeopardy.csv')
print(jeopardy.head(5))


   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  


In [6]:
print(jeopardy.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [14]:
new_col = {}
for c in jeopardy.columns:
    new_col[c] = c.replace(' ','')
print(new_col)


{' Round': 'Round', ' Question': 'Question', 'Show Number': 'ShowNumber', ' Air Date': 'AirDate', ' Value': 'Value', ' Category': 'Category', ' Answer': 'Answer'}


In [15]:
jeopardy.rename(new_col, axis=1, inplace=True)

In [16]:
print(jeopardy.columns)

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


In [31]:
import string

def normalize_q_and_a(q_n_a_string):
    q_n_a_string = q_n_a_string.lower()
    for c in string.punctuation:
        q_n_a_string=q_n_a_string.replace(c,"")
    return q_n_a_string

In [32]:
jeopardy['clean_question']=jeopardy['Question'].apply(lambda x: normalize_q_and_a(x) )

In [34]:
jeopardy['clean_answer']=jeopardy['Answer'].apply(lambda x: normalize_q_and_a(x) )

In [35]:
print(jeopardy['Value'].head(15))

0     $200
1     $200
2     $200
3     $200
4     $200
5     $200
6     $400
7     $400
8     $400
9     $400
10    $400
11    $400
12    $600
13    $600
14    $600
Name: Value, dtype: object


In [40]:
def normalize_values(dollar):
    for c in string.punctuation:
        dollar=dollar.replace(c,"")
    try:
        dollar=int(dollar)
    except ValueError:
        dollar = 0
    return dollar



In [41]:
jeopardy['clean_value']=jeopardy['Value'].apply(lambda x: normalize_values(x))

In [42]:
print(jeopardy['AirDate'].head(15))

0     2004-12-31
1     2004-12-31
2     2004-12-31
3     2004-12-31
4     2004-12-31
5     2004-12-31
6     2004-12-31
7     2004-12-31
8     2004-12-31
9     2004-12-31
10    2004-12-31
11    2004-12-31
12    2004-12-31
13    2004-12-31
14    2004-12-31
Name: AirDate, dtype: object


In [48]:
jeopardy['AirDate'] = pd.to_datetime(jeopardy['AirDate'])

In [59]:
def check_if_answer_equals_question(jeo_row):
    split_answer = jeo_row['clean_answer'].split()
    if 'the' in split_answer:
        split_answer.remove("the")
    split_question = jeo_row['clean_question'].split()
    match_count = 0
    if len(split_answer) == 0:
        return 0
    else:
        for a in split_answer:
            if a in split_question:
                match_count = match_count + 1
        return match_count / len(split_answer)
    

In [60]:
jeopardy['answer_in_question'] = jeopardy.apply(lambda x: check_if_answer_equals_question(x), axis=1)

In [65]:
print(jeopardy['answer_in_question'].head(5))

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: answer_in_question, dtype: float64


In [62]:
mean_ans_in_que = jeopardy['answer_in_question'].mean()

In [63]:
print(mean_ans_in_que)

0.058861482035140716


5% of the answers are in questions

In [71]:
from numpy import mean
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
    split_question = row['clean_question'].split()
    for q in split_question:
        if len(q) < 6:
            split_question.remove(q)
    match_count=0
    for q in split_question:
        if q in terms_used:
            match_count = match_count + 1
        terms_used.add(q)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)

jeopardy['question_overlap'] = question_overlap
print(mean(question_overlap))


0.8005813469283605


80% of word repeats in questions

In [74]:
def more_than_800(jeo_row):
    if jeo_row['clean_value'] > 800:
        value = 1
    else:
        value = 0
    return value

jeopardy['high_value'] = jeopardy.apply(lambda row: more_than_800(row), axis=1)

In [75]:
def hign_and_low_questions(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        split_question = row['clean_question'].split()
        if word in split_question:
            if row['high_value'] == 1:
                high_count = high_count + 1
            else:
                low_count = low_count + 1
    return high_count, low_count

In [82]:
observed_expected = []
terms_used = list(terms_used)
comparison_terms = terms_used[0:5]

In [83]:
for t in comparison_terms:
    observed_expected.append(hign_and_low_questions(t))

In [84]:
print(observed_expected)

[(1, 2), (1, 2), (1, 8), (1, 1), (0, 1)]


In [86]:
high_value_count = len(jeopardy[jeopardy['high_value']==1])

In [87]:
print(high_value_count)

5734


In [88]:
low_value_count = len(jeopardy[jeopardy['high_value']==0])
print(low_value_count)

14265


In [95]:
from scipy.stats import chisquare
chi_squared = []
for oe in observed_expected:
    total = oe[0] + oe[1]
    total_prop = total / len(jeopardy)
    exp_high = total_prop * high_value_count
    exp_low = total_prop * low_value_count
    chi_val, pvalue = chisquare([oe[0] + oe[1]], [exp_high,exp_low])
    chi_squared.append(chi_val)
    print(pvalue, chi_val)

0.017264689440517393 5.6692648899676765
0.017264689440517393 5.6692648899676765
3.722667987131117e-05 17.00779466990303
0.05188385478668916 3.779509926645118
0.1692295619530395 1.889754963322559
