# introduction

this project is about jeopardy, a famous US TV show. it is about working with a dataset of jeopardy questions to figure out some patterns in the questions that could help to win.

In [1]:
import pandas as pd

In [2]:
jeopardy = pd.read_csv("jeopardy.csv")

In [3]:
jeopardy.head() # getting an overview of the dataset

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [4]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [5]:
# some cols have space infront, so removing it
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value',
       'Question', 'Answer']

In [6]:
jeopardy.columns # checking if space removed

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

## normalizing the data

In [7]:
import re

def normalize(string):
    lowercase = string.lower()
    no_punctuation = re.sub("[^A-Za-z0-9\s]", "", lowercase)
    return no_punctuation

# using function on cols "Question" and "Answer"

jeopardy["clean_question"] = jeopardy["Question"].apply(normalize)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize)

# checking results
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams


In [8]:
# now normalizing the "Value" col (removing dollar sign and convert into integer)

def normalize_values(string):
    no_punctuation = re.sub("[^A-Za-z0-9\s]", "", string)
    try:
        text_int = int(no_punctuation)
    except:
        text_int = 0
    return text_int

jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_values)

# checking results
jeopardy.head(10)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant,in the title of an aesop fable this insect sha...,the ant,200
6,4680,2004-12-31,Jeopardy!,HISTORY,$400,Built in 312 B.C. to link Rome & the South of ...,the Appian Way,built in 312 bc to link rome the south of ita...,the appian way,400
7,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$400,"No. 8: 30 steals for the Birmingham Barons; 2,...",Michael Jordan,no 8 30 steals for the birmingham barons 2306 ...,michael jordan,400
8,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$400,"In the winter of 1971-72, a record 1,122 inche...",Washington,in the winter of 197172 a record 1122 inches o...,washington,400
9,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$400,This housewares store was named for the packag...,Crate & Barrel,this housewares store was named for the packag...,crate barrel,400


In [9]:
jeopardy["clean_value"].dtype

dtype('int64')

In [10]:
jeopardy["Air Date"].dtype

dtype('O')

In [11]:
# converting col "Air Date" into datetime col
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])
jeopardy["Air Date"].dtype

dtype('<M8[ns]')

## analyzing the data

first step: how often is the answer deducible from the question 

In [12]:
def answer_check(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    #removing "the" from answer, because it often has no meaningful use
    if split_answer[0] == "the":
        split_answer = split_answer[1:]
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for i in split_answer:
        if i in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(answer_check, axis=1)

jeopardy["answer_in_question"].mean()

0.06237266880549054

### note
the answer only occur in 6% of the cases. this is not a huge number and for that it is not possible to create the question out of the answer with this approach.

so now it is time for the second step and to find out how often new questions are repeats of older questions.

In [13]:
question_overlap = []
terms_used = set()

for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    for i in split_question:
        if i in terms_used:
            match_count += 1
    for i in split_question:
        terms_used.add(i)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)

jeopardy["question_overlap"] = question_overlap

jeopardy["question_overlap"].mean()

0.6908737315671878

### notes
there is an overlap of about 70%. this is only on a small dataset of jeopardy questions. also this overlap is not about phraes. but 70% seem to be worth it to take a closer look on repeating questions.

In [14]:
# categorizing the value column for counting purpose
def categorize_values(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value

jeopardy["high_value"] = jeopardy.apply(categorize_values, axis=1)

In [15]:
# function to count the first five words in the previous created "terms_used" seperated by "high_value" 1 or 0

def word_low_high(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if word in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

observed_expected = [] # empty list to append the results of the function

comparison_terms = list(terms_used)[:5] # the first five words

for i in comparison_terms: # appending values using the function
    observed_expected.append(word_low_high(i))
print(observed_expected)

[(0, 1), (0, 1), (1, 0), (0, 1), (3, 3)]


## chi-square and p-value

In [16]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []

for i in observed_expected:
    total = sum(i)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    observed = np.array([i[0], i[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=1.3346324449838385, pvalue=0.24798277007881564)]

### notes
no significant difference in usage between high value and low value rows. also the frequencies are under 5, so the chi-square test is not as valid.

In [17]:
# top 10 categories
jeopardy["Category"].value_counts(normalize=True)[:10] *100

TELEVISION          0.255013
U.S. GEOGRAPHY      0.250013
LITERATURE          0.225011
HISTORY             0.200010
AMERICAN HISTORY    0.200010
BEFORE & AFTER      0.200010
AUTHORS             0.195010
WORD ORIGINS        0.190010
WORLD CAPITALS      0.185009
BODIES OF WATER     0.180009
Name: Category, dtype: float64

In [18]:
jeopardy["Category"].describe()

count          19999
unique          3581
top       TELEVISION
freq              51
Name: Category, dtype: object

### notes
highest probability is on questions in television. but the value is 0.255 %, which is very low. also there are 3581 different categories overall. television is 51 (in total numbers) of that. so there is no category really to focus on.

In [26]:
tel_high = jeopardy[(jeopardy["Category"] == "TELEVISION") & (jeopardy["high_value"] == 1)].shape[0]
tel_all = jeopardy[jeopardy["Category"] == "TELEVISION"].shape[0]
tel_high_prob = tel_high / tel_all
tel_high_prob *100

11.76470588235294

### notes 
the television category is a category where only 11.76% are high valued questions

In [27]:
jeopardy["high_value"].value_counts(normalize=True)[:10] *100

0    71.328566
1    28.671434
Name: high_value, dtype: float64

### notes
compared to the overall probability of high valued questions (28.67%) the televison value for high valued questions is very low.