# Winning Jeopardy

In [72]:
import pandas
import csv
import numpy as np

jeopardy = pd.read_csv("JEOPARDY_CSV.csv")

jeopardy

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams
...,...,...,...,...,...,...,...
216925,4999,2006-05-11,Double Jeopardy!,RIDDLE ME THIS,$2000,This Puccini opera turns on the solution to 3 ...,Turandot
216926,4999,2006-05-11,Double Jeopardy!,"""T"" BIRDS",$2000,In North America this term is properly applied...,a titmouse
216927,4999,2006-05-11,Double Jeopardy!,AUTHORS IN THEIR YOUTH,$2000,"In Penny Lane, where this ""Hellraiser"" grew up...",Clive Barker
216928,4999,2006-05-11,Double Jeopardy!,QUOTATIONS,$2000,"From Ft. Sill, Okla. he made the plea, Arizona...",Geronimo


In [45]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [35]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

In [49]:
import re

def normalize_text(text):
    text = text.lower()
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    text = re.sub("\s+", " ", text)
    return text

def normalize_values(text):
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    try:
        text = int(text)
    except Exception:
        text = 0
    return text

In [52]:
jeopardy["clean_question"] = jeopardy[" Question"].apply(normalize_text)
jeopardy["clean_answer"] = jeopardy[" Answer"].astype(str).apply(normalize_text)
jeopardy["clean_value"] = jeopardy[" Value"].apply(normalize_values)

In [54]:
jeopardy["Air Date"] = pandas.to_datetime(jeopardy[" Air Date"])

In [55]:
jeopardy.dtypes

Show Number                int64
 Air Date                 object
 Round                    object
 Category                 object
 Value                    object
 Question                 object
 Answer                   object
clean_question            object
clean_answer              object
clean_value                int64
Air Date          datetime64[ns]
dtype: object

In [56]:
def count_matches(row):
    split_answer = row["clean_answer"].split()
    split_question = row["clean_question"].split()
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(count_matches, axis=1)    

In [57]:
jeopardy["answer_in_question"] .mean()

0.05792070323661354

## 1. Recycled Questions
#### On average, the answer only makes up for about 6% of the question. This isn't a huge number, and it means that we probably can't just hope that hearing a question will enable us to determine the answer. We'll probably have to study.

In [62]:
question_overlap = []
terms_used = set()

jeopardy = jeopardy.sort_values(' Air Date')

for i, row in jeopardy.iterrows():
    
    split_question = row['clean_question'].split(' ')
    split_question = [q for q in split_question if len(q)>5]
    
    match_count = 0 
    
    for word in split_question:
        if word in terms_used:
            match_count +=  1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0 :
        match_count /= len(split_question)
    question_overlap.append(match_count)
    
jeopardy['question_overlap'] = question_overlap

jeopardy['question_overlap'].mean()


0.8722241109136858

## 2. Low Value vs. High Value Questions
#### There is about a 70% overlap between terms in new questions and terms in old questions. This only looks at a small set of questions, and it doesn't look at phrases — it looks at single terms. This makes it relatively insignificant, but it does mean that it's worth looking more into the recycling of questions.

In [64]:
def determine_value(row):
    value=0
    if row['clean_value'] > 800:
        value = 1
    return value



jeopardy['high_value'] = jeopardy.apply(determine_value, axis=1)

In [67]:
def count_usage(word):
    low_count = 0 
    high_count = 0 
    
    for i, row in jeopardy.iterrows():
        split_question = row['clean_question'].split(' ')
        if word in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
        
    return high_count, low_count    
    
    
from random import choice
terms_used_list = list(terms_used)
comparison_terms = [choice(terms_used_list) for _ in range(10)]

observed_expected = []

for term in comparison_terms:
    observed_expected.append(count_usage(term))
    
observed_expected  

[(0, 1),
 (0, 1),
 (1, 0),
 (0, 1),
 (0, 3),
 (2, 2),
 (1, 2),
 (0, 1),
 (0, 1),
 (1, 4)]

In [74]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

KeyError: 'high_value'

## 3. Chi-Squared Results
#### None of the terms had a significant difference in usage between high value and low value rows. Additionally, the frequencies were all lower than 5, so the chi-squared test isn't as valid. It would be better to run this test with only terms that have higher frequencies.