In [1]:
import pandas as pd

# Exploring the Dataset

In [2]:
jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [4]:
new_columns = []
for idx,column in enumerate(jeopardy.columns):
    if column[0] == " ":
        new_columns.append(column[1:])
    else:
        new_columns.append(column)
        
jeopardy.columns = new_columns
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [5]:
jeopardy.dtypes

Show Number     int64
Air Date       object
Round          object
Category       object
Value          object
Question       object
Answer         object
dtype: object

# Normalizing Columns

In [6]:
import string

In [7]:
def norm_text(text):
    translator = str.maketrans('','',string.punctuation)
    return text.lower().translate(translator)

In [8]:
jeopardy["clean_question"] = jeopardy["Question"].apply(norm_text)
jeopardy["clean_question"].head(5)

0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object

In [9]:
jeopardy["clean_answer"] = jeopardy["Answer"].apply(norm_text)
jeopardy["clean_answer"].head(5)

0    copernicus
1    jim thorpe
2       arizona
3     mcdonalds
4    john adams
Name: clean_answer, dtype: object

In [10]:
def norm_value(value):
    translator = str.maketrans('','',string.punctuation)
    try:
        return int(value.translate(translator))
    except Exception:
        return 0

In [11]:
jeopardy["clean_value"] = jeopardy["Value"].apply(norm_value)
jeopardy["clean_value"].head(5)

0    200
1    200
2    200
3    200
4    200
Name: clean_value, dtype: int64

In [12]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])
jeopardy["Air Date"].head(5)

0   2004-12-31
1   2004-12-31
2   2004-12-31
3   2004-12-31
4   2004-12-31
Name: Air Date, dtype: datetime64[ns]

# Working on the "looking for answers in questions" strategy

In [13]:
def answer_in_question(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    
    while "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0

    match_count = 0
    for each in split_answer:
        if each in split_question:
            match_count += 1
    
    return match_count/len(split_answer)    

In [14]:
jeopardy["answer_in_question"] = jeopardy.apply(answer_in_question, axis=1)

In [15]:
jeopardy["answer_in_question"].mean()

0.059737124385356791

# Analysis on "looking for answers in questions" strategy
The "looking for answers in questions" strategy consists on trying to trying to find the answers for the questions in the words of the questions itself.

Unfortunately, this strategy isn't very promissing, as (in averge) the words on the answer appears only about **6%** of the time in the question.

Obs.: The word "the" was disregarded in this analysis, as it is really common on both questions and answers and could lead to a misinterpretation.

# Working on "looking for old questions" strategy

In [16]:
jeopardy.sort_values("Air Date",inplace=True)
jeopardy.head(100)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question
19325,10,1984-09-21,Final Jeopardy!,U.S. PRESIDENTS,,"Adventurous 26th president, he was 1st to ride...",Theodore Roosevelt,adventurous 26th president he was 1st to ride ...,theodore roosevelt,0,0.000000
19301,10,1984-09-21,Double Jeopardy!,LABOR UNIONS,$200,Notorious labor leader missing since '75,Jimmy Hoffa,notorious labor leader missing since 75,jimmy hoffa,200,0.000000
19302,10,1984-09-21,Double Jeopardy!,1789,$200,"Washington proclaimed Nov. 26, 1789 this first...",Thanksgiving,washington proclaimed nov 26 1789 this first n...,thanksgiving,200,0.000000
19303,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,$200,Both Ferde Grofe' & the Colorado River dug thi...,the Grand Canyon,both ferde grofe the colorado river dug this ...,the grand canyon,200,0.000000
19304,10,1984-09-21,Double Jeopardy!,LITERATURE,$200,"Depending on the book, he could be a ""Jones"", ...",Tom,depending on the book he could be a jones a sa...,tom,200,0.000000
19305,10,1984-09-21,Double Jeopardy!,HOMONYMS,$200,Hindu hierarchy or a play's actors,a caste (cast),hindu hierarchy or a plays actors,a caste cast,200,0.333333
19306,10,1984-09-21,Double Jeopardy!,TV TRIVIA,$200,"Last season, this series mourned the loss of S...",Hill Street Blues,last season this series mourned the loss of sg...,hill street blues,200,0.000000
19307,10,1984-09-21,Double Jeopardy!,1789,$400,Why April 28th was a bad day for Capt. Bligh,the day of the mutiny on the Bounty,why april 28th was a bad day for capt bligh,the day of the mutiny on the bounty,400,0.200000
19308,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,$400,Seaside resort that has a monopoly on East Coa...,"Atlantic City, New Jersey",seaside resort that has a monopoly on east coa...,atlantic city new jersey,400,0.000000
19309,10,1984-09-21,Double Jeopardy!,LITERATURE,$400,"He wrote ""The 3 Musketeers""; his son wrote ""Ca...",(Alexandre) Dumas,he wrote the 3 musketeers his son wrote camille,alexandre dumas,400,0.000000


In [17]:
question_overlap = []
terms_used = set()

for idx,row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [word for word in split_question if len(word) >= 6]
    
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
    
    terms_used.update(set(split_question))
    
    if len(split_question) > 0:
        question_overlap.append(match_count/len(split_question))
    else:
        question_overlap.append(0)

jeopardy["question_overlap"] = question_overlap

jeopardy["question_overlap"].mean()

0.68712428809667803

# Analysis on "looking for old questions" strategy
The "looking for old questions" strategy is about studying the old questions subjects believing that similar questions will occur in the new questions. 

It seems to be a very good strategy, as about **69%** of the relevant terms (words with more than 6 letters) appears in older questions.

# Chi Square Test

In [24]:
def is_high_value(row):
    if row["clean_value"] > 800:
        return 1
    else:
        return 0
    
jeopardy["high_value"] = jeopardy.apply(is_high_value,axis=1)
jeopardy["high_value"].head(25)

19325    0
19301    0
19302    0
19303    0
19304    0
19305    0
19306    0
19307    0
19308    0
19309    0
19310    0
19311    0
19312    0
19313    0
19314    0
19315    0
19316    0
19317    0
19318    1
19319    0
19320    1
19321    1
19322    1
19323    1
19300    0
Name: high_value, dtype: int64

In [19]:
def high_low_word_count(word):
    low_count = 0
    high_count = 0
    for idx,row in jeopardy.iterrows():
        col_split = row["clean_question"].split(" ")
        if word in col_split:
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return low_count, high_count

observed_expected = []
comparison_terms = list(terms_used)[:5]

for each in comparison_terms:
    observed_expected.append(high_low_word_count(each))
    
observed_expected

[(1, 3), (1, 0), (1, 0), (2, 0), (0, 1)]

In [20]:
high_value_count = (jeopardy["high_value"] == 1).sum()
low_value_count = (jeopardy["high_value"] == 0).sum()

print(high_value_count)
print(low_value_count)

5734
14265


In [23]:
from scipy.stats import chisquare

chi_squared = []

for each in observed_expected:
    total = each[0] + each[1]
    total_prop = total/jeopardy.shape[0]
    expected_low = total_prop * low_value_count
    expected_high = total_prop * high_value_count
    chi_squared.append(chisquare(each, [expected_low, expected_high]))

chi_squared   

[Power_divergenceResult(statistic=4.1980229752219893, pvalue=0.040471136200959497),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.80392569225376798, pvalue=0.36992223780795708),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047)]

# Analysis on Chi Square Test
There is just one word that presents a p-value with significance bell