# Introduction
Jeopardy is a popular TV show in the US where participants answer questions to win money. It's been running for a few decades, and is a major force in popular culture. In this project, we work with a dataset of Jeopardy questions to figure out some patterns in the questions.
# Data
The dataset is named `jeopardy.csv`, and contains about 20000 rows from the beginning of a full dataset of Jeopardy questions, which you can download [here](https://www.reddit.com/r/datasets/comments/1uyd0t/200000_jeopardy_questions_in_a_json_file).

In [1]:
import pandas as pd

In [2]:
df_jeopardy = pd.read_csv('jeopardy.csv')
df_jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 7 columns):
Show Number    19999 non-null int64
 Air Date      19999 non-null object
 Round         19999 non-null object
 Category      19999 non-null object
 Value         19999 non-null object
 Question      19999 non-null object
 Answer        19999 non-null object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


In [3]:
df_jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [4]:
df_jeopardy.describe(include = 'all')

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
count,19999.0,19999,19999,19999,19999,19999,19999
unique,,336,4,3581,76,19988,14963
top,,2007-11-13,Jeopardy!,TELEVISION,$400,[audio clue],Japan
freq,,62,9901,51,3892,5,22
mean,4312.730537,,,,,,
std,1374.121672,,,,,,
min,10.0,,,,,,
25%,3393.0,,,,,,
50%,4582.0,,,,,,
75%,5431.0,,,,,,


# Data cleaning
1. Rename columns after removing spacing
2. Text normalisation
    - Columns: `Question` and `Answer`
        - lowercase
        - punctuation removal
3. Value normalisation
    - Columns: `Value`
        - punctuation removal
        - integer convertion
4. Data-type convention: datetime
    -Columns: `Air Date`, from object

In [5]:
col_rm_s = dict(zip(df_jeopardy.columns,  df_jeopardy.columns.str.strip()))
df_jeopardy = df_jeopardy.rename(columns = col_rm_s)
df_jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [6]:
import re
df_jeopardy[['clean_question','clean_answer']] = df_jeopardy[['Question','Answer']].applymap(lambda x: x.lower())
df_jeopardy[['clean_question','clean_answer']] = df_jeopardy[['clean_question','clean_answer']].applymap(lambda x: re.sub(r'[^\w\s]','',x))
df_jeopardy[['clean_question','clean_answer']].head()

Unnamed: 0,clean_question,clean_answer
0,for the last 8 years of his life galileo was u...,copernicus
1,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,the city of yuma in this state has a record av...,arizona
3,in 1963 live on the art linkletter show this c...,mcdonalds
4,signer of the dec of indep framer of the const...,john adams


In [7]:
df_jeopardy['clean_value'] = df_jeopardy['Value'].apply(lambda x: re.sub(r'[^\w\s]','',x))

def cv_int_0(v):
    try:
        v = int(v)
    except ValueError:
        v = 0
    return v

df_jeopardy['clean_value'] = df_jeopardy['clean_value'].apply(lambda x: cv_int_0(x))
df_jeopardy['clean_value'].head()

0    200
1    200
2    200
3    200
4    200
Name: clean_value, dtype: int64

In [8]:
df_jeopardy['Air Date'] = pd.to_datetime(df_jeopardy['Air Date'])
pd.to_datetime(df_jeopardy['Air Date']).describe() #?why

count                   19999
unique                    336
top       2007-11-13 00:00:00
freq                       62
first     1984-09-21 00:00:00
last      2012-01-19 00:00:00
Name: Air Date, dtype: object

In [9]:
df_jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

## Q1: How often the answer is deducible from the question.

In [10]:
def count_matches(row):
    
    split_answer = row['clean_answer'].replace('  ',' ').split(' ') # there is double white space used, long search.
    split_question = row['clean_question'].replace('  ',' ').split(' ')
    
#     split_answer, split_question = set(split_answer), set(split_question) #for no repeated words
    
    useless_words = ['the','a','of', 'and', 'to', 'with']
    
    for i in useless_words:
        if i in split_answer:
            split_answer.remove(i)
            
    matched_words = [word for word in split_answer if word in split_question]
    matched_count = len(matched_words)
    
    if len(matched_words) == 0:
        matched_ratio = 0 
        matched_words = 0 #use None is better
    else:
        matched_ratio = len(matched_words)/len(split_answer)
        
#     return matched_count
#     return matched_words
    return matched_ratio


df_jeopardy['answer_in_question'] = df_jeopardy.apply(count_matches, axis = 'columns')
# pd.options.display.max_colwidth = 1000
# df_jeopardy.loc[df_jeopardy['answer_in_question']!=0, ['clean_question', 'clean_answer', 'answer_in_question']].head()
# df_jeopardy[['clean_question', 'clean_answer', 'answer_in_question']]
'On average, there is {p:.2%} that answer related-term is in the question '.format(p = df_jeopardy['answer_in_question'].mean())

'On average, there is 4.10% that answer related-term is in the question '

## Q2. How often new questions are repeats of older questions.

In [16]:
question_overlap = []
terms_used = set()

df_jeopardy = df_jeopardy.sort_values(['Air Date'])

for i, r in df_jeopardy.iterrows():
    split_question = r['clean_question'].replace('  ',' ').split(' ')
    split_question = [w for w in split_question if len(w)>=6]
    
    match_count = 0
    
    for word in split_question:
        
        if word in terms_used:
            match_count += 1
        else:
            terms_used.add(word)
    
    if len(split_question)>0:
        question_overlap.append(match_count/len(split_question))
    else:
        question_overlap.append(0)
                
df_jeopardy['question_overlap'] = question_overlap

'On average, there is {p:.2%} that questions uses the old words '.format(p = df_jeopardy['question_overlap'].mean())          
            

'On average, there is 68.95% that questions uses the old words '

## Q3. Which terms correspond to high-value questions using a chi-squared test. 

Narrow down the questions into two categories:
- Low value: Any row where Value is less than 800.
- High value: Any row where Value is greater than 800

In [20]:
def h_value_q(row):
    value = 0
    if row['clean_value'] > 800:
        value = 1
    return value

df_jeopardy['h_value_q'] = df_jeopardy.apply(h_value_q, axis = 'columns')
df_jeopardy['h_value_q'].head()

19325    0
19274    0
19275    0
19276    0
19277    0
Name: h_value_q, dtype: int64

In [21]:
def count_usage(word):
    low_count, high_count = 0, 0
    for i, r in df_jeopardy.iterrows():
        split_question = r['clean_question'].replace('  ',' ').split(' ')
        if word in split_question:
            if r['h_value_q']==1:
                high_count += 1
            else:
                low_count +=1
    return high_count, low_count

obs_expected = []
comparison_term = list(terms_used)[:5]
for i in comparison_term:
    obs_expected.append(count_usage(i))
obs_expected

[(0, 1), (1, 1), (0, 2), (5, 13), (0, 1)]

In [31]:
from scipy.stats import chisquare
import numpy as np

hvq_count = (df_jeopardy["h_value_q"] == 1).sum()
lvq_count = (df_jeopardy["h_value_q"] == 0).sum()

chi_squared = []

for obs in obs_expected:
    total = sum(obs)
    total_prop = total/df_jeopardy.shape[0]
    expect_hvq_c = total_prop * hvq_count
    expect_lvq_c = total_prop * lvq_count
    
    observe = [obs[0],obs[1]]
    expect = [expect_hvq_c, expect_lvq_c]
    chi_squared.append(chisquare(observe, expect))

chi_squared

[Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.4448774816612795, pvalue=0.5047776487545996),
 Power_divergenceResult(statistic=0.803925692253768, pvalue=0.3699222378079571),
 Power_divergenceResult(statistic=0.007029106963070332, pvalue=0.93318382776185),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469)]