# P13. Winning Jeopardy

Jeopardy is a TV show in the US where participant answer questions to win money. We will be exploring a dataset of jeopardy questions containing 20000 rows and try to figure out some patterns in it. It could help you win in your next Jeopardy contest.


In [2]:
import pandas as pd

jeopardy = pd.read_csv('jeopardy.csv')
jeopardy.head(6)
print(jeopardy.columns)

#removing white space in front of column values
col_list = []
for col in jeopardy.columns:
    col_list.append(col.strip())
    
jeopardy.columns = col_list
print(jeopardy.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')
Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


In [3]:
#normalizing Question and Answer columns

def clean_qa(string):
    import re
    to_low = string.lower()
    #remove punctuation
    pattern = '[^A-Za-z0-9\s]'
    return re.sub(pattern, "", to_low)



In [4]:
jeopardy['clean_question'] = jeopardy['Question'].apply(clean_qa)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(clean_qa)

jeopardy.head(6)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant,in the title of an aesop fable this insect sha...,the ant


In [15]:
#Normalizing Value and Air Date columns

def normalize_value(string):
    import re
     #remove punctuation
    pattern = '[^A-Za-z0-9\s]'
    try:
        val = int(re.sub(pattern, '', string))
    except:
        val = 0
        
    return val



In [16]:
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_value)

In [17]:
#Normalize Air Date

jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])
jeopardy['Air Date'].head()



0   2004-12-31
1   2004-12-31
2   2004-12-31
3   2004-12-31
4   2004-12-31
Name: Air Date, dtype: datetime64[ns]

## Analyzing Answers in Questions

We can see how many times wornds in the answer also occur in the question, to see how often the answer is deducible from the question. 

In [18]:
def count_match(row):
    split_answer = row['clean_answer'].split(" ")
    split_question = row['clean_question'].split(" ")
    match_count = 0
    try:
        split_answer.remove('the')
    except: 
        split_answer
    
    if len(split_answer) == 0:
        return 0
        
    for w in split_answer:
        for q in split_question:
            if w==q:
                match_count += 1
    
    return match_count/len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(count_match, axis=1)

In [19]:
jeopardy['answer_in_question'].mean()

0.07372317056301256

The answer appears in the questions in about 7% of all the dataset. It doesn't seem to be very meaningful, we will have to explore the dataset more.

## Recycled Questions

Let's investigate how often new questions are repeats of older ones. 

In [20]:
question_overlap = []

terms_used = set()

for idx, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    for w in split_question:
        if w in terms_used:
            match_count += 1
    for w in split_question:
        terms_used.add(w)
    if len(split_question) > 0:
        val = match_count/len(split_question)
    else:
        val = 0
    question_overlap.append(val)


jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].mean()   

0.6908737315671962

There is about 70% of terms that overlap in new questions and terms in old questions. 

In [21]:
terms_used

{'remodeled',
 'collaboration',
 'coroners',
 'beaten',
 'entire',
 'transferrin',
 'trials',
 'recovery',
 'verona',
 'judases',
 'lubezki',
 'oliver',
 'aggies',
 'spanishnamed',
 'hrefhttpwwwjarchivecommedia20060307dj07wmv',
 'bombardment',
 'differ',
 'hrefhttpwwwjarchivecommedia20060601j24jpg',
 'lespanaye',
 'bulletin',
 'evergreen',
 'lifetime',
 'requests',
 'shylock',
 'yahwehs',
 'shania',
 'wubbulous',
 'outgunned',
 'articles',
 'internet',
 'pharaohs',
 'loyalists',
 'chorus',
 'innards',
 'impersonated',
 'minimize',
 'tearyeyed',
 'starbuck',
 'cerdic',
 'dutchman',
 'amounts',
 'amalthea',
 'annual',
 'illuminate',
 'controllers',
 'regulations',
 'barker',
 'viruses',
 'hrefhttpwwwjarchivecommedia20071203j05jpg',
 'printedout',
 'gardens',
 'haphazardly',
 'ebonys',
 'tiananmen',
 'parodied',
 'hanshin',
 'sultanate',
 'abdulaziz',
 'carolina',
 'sergey',
 'appetizer',
 'poivre',
 'baseline',
 'hrefhttpwwwjarchivecommedia20081222j19ajpg',
 'nicollier',
 'giants',
 'aru

## Low Value vs High Value Questions

In [22]:
def low_high(row):
    if row['clean_value'] > 800:
        value = 1
    else:
        value = 0
    return value

jeopardy['high_value'] = jeopardy.apply(low_high, axis=1)
jeopardy['high_value'].head()

0    0
1    0
2    0
3    0
4    0
Name: high_value, dtype: int64

In [23]:
def low_high_count(word):
    low_count = 0 
    high_count = 0
    for idx, row in jeopardy.iterrows():
        split_question = row['clean_question'].split(' ')
        if word in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1 
    return high_count, low_count


In [24]:
observed_expected = []
terms_used = list(terms_used)
comparison_terms = terms_used[:5]

for t in comparison_terms:
    result = low_high_count(t)
    observed_expected.append(result)

In [25]:
comparison_terms
observed_expected

[(0, 1), (0, 2), (1, 0), (1, 7), (1, 5)]

## Applying the chi-squared test

In [26]:
high_value_count = jeopardy[jeopardy['high_value']==1].shape[0]
low_value_count = jeopardy[jeopardy['high_value']==0].shape[0]

from scipy.stats import chisquare
import numpy as np
chi_squared = []

for i in observed_expected:
    total = i[0] + i[1]
    total_prop = total/jeopardy.shape[0]
    expected_high_count = total_prop * high_value_count
    expected_low_count = total_prop * low_value_count
    observed = np.array([i[0], i[1]])
    expected = np.array([expected_high_count, expected_low_count])
    chi_square, p_value = chisquare(observed, expected)
    chi_squared.append([chi_square, p_value])
    
chi_squared

[[0.401962846126884, 0.5260772985705469],
 [0.803925692253768, 0.3699222378079571],
 [2.487792117195675, 0.11473257634454047],
 [1.0229964471766237, 0.31180929640924315],
 [0.42281054506129573, 0.515537958129453]]