In [1]:
import pandas as pd
jeopardy = pd.read_csv('jeopardy.csv')

In [2]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [4]:
columns = list(jeopardy.columns)
columns = [i.strip() for i in columns]
columns

['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

In [5]:
jeopardy.columns = columns

In [6]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 7 columns):
Show Number    19999 non-null int64
Air Date       19999 non-null object
Round          19999 non-null object
Category       19999 non-null object
Value          19999 non-null object
Question       19999 non-null object
Answer         19999 non-null object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


In [7]:
import re
def normalize(x):
    x = x.lower()
    x = re.sub("\W"," ",x)
    x = re.sub("\s+", " ",x)
    x = x.strip()
    return x



In [8]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize)

In [9]:
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize)

In [10]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonald s
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams


In [11]:
def normalize_value(x):
    x = re.sub("\W"," ",x)
    x = re.sub("\s+"," ",x)
    x = x.strip()
    try:
        x = int(x)
    except:
        x = 0
    return x

In [12]:
jeopardy['Value'] = jeopardy['Value'].apply(normalize_value)

In [13]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

In [14]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 9 columns):
Show Number       19999 non-null int64
Air Date          19999 non-null datetime64[ns]
Round             19999 non-null object
Category          19999 non-null object
Value             19999 non-null int64
Question          19999 non-null object
Answer            19999 non-null object
clean_question    19999 non-null object
clean_answer      19999 non-null object
dtypes: datetime64[ns](1), int64(2), object(6)
memory usage: 1.4+ MB


In [15]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonald s
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams


In [16]:
def count_answer_in_question(row):
    split_answer = row['clean_answer'].split()
    split_question = row['clean_question'].split()
    match_count = 0
    if 'the' in split_question:
        while 'the' in split_question:
            split_question.remove('the')
    if len(split_answer) == 0:
        return 0
    for i in split_answer:
        if i in split_question:
            match_count += 1
    return match_count / len(split_answer)

question = count_answer_in_question(jeopardy.loc[0])

In [17]:
jeopardy['answer_in_question'] = jeopardy.apply(count_answer_in_question, axis=1)

In [18]:
mean_aswer_in_question = jeopardy['answer_in_question'].mean()
mean_aswer_in_question

0.059025038192746564

Pouquíssimas perguntas contém a resposta.

In [19]:
question_overlap = []
terms_used = set()
sorted_jeopardy = jeopardy.sort_values('Air Date')

In [20]:
sorted_jeopardy

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,answer_in_question
19325,10,1984-09-21,Final Jeopardy!,U.S. PRESIDENTS,0,"Adventurous 26th president, he was 1st to ride...",Theodore Roosevelt,adventurous 26th president he was 1st to ride ...,theodore roosevelt,0.000000
19301,10,1984-09-21,Double Jeopardy!,LABOR UNIONS,200,Notorious labor leader missing since '75,Jimmy Hoffa,notorious labor leader missing since 75,jimmy hoffa,0.000000
19302,10,1984-09-21,Double Jeopardy!,1789,200,"Washington proclaimed Nov. 26, 1789 this first...",Thanksgiving,washington proclaimed nov 26 1789 this first n...,thanksgiving,0.000000
19303,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,200,Both Ferde Grofe' & the Colorado River dug thi...,the Grand Canyon,both ferde grofe the colorado river dug this n...,the grand canyon,0.000000
19304,10,1984-09-21,Double Jeopardy!,LITERATURE,200,"Depending on the book, he could be a ""Jones"", ...",Tom,depending on the book he could be a jones a sa...,tom,0.000000
19305,10,1984-09-21,Double Jeopardy!,HOMONYMS,200,Hindu hierarchy or a play's actors,a caste (cast),hindu hierarchy or a play s actors,a caste cast,0.333333
19306,10,1984-09-21,Double Jeopardy!,TV TRIVIA,200,"Last season, this series mourned the loss of S...",Hill Street Blues,last season this series mourned the loss of sg...,hill street blues,0.000000
19307,10,1984-09-21,Double Jeopardy!,1789,400,Why April 28th was a bad day for Capt. Bligh,the day of the mutiny on the Bounty,why april 28th was a bad day for capt bligh,the day of the mutiny on the bounty,0.125000
19308,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,400,Seaside resort that has a monopoly on East Coa...,"Atlantic City, New Jersey",seaside resort that has a monopoly on east coa...,atlantic city new jersey,0.000000
19309,10,1984-09-21,Double Jeopardy!,LITERATURE,400,"He wrote ""The 3 Musketeers""; his son wrote ""Ca...",(Alexandre) Dumas,he wrote the 3 musketeers his son wrote camille,alexandre dumas,0.000000


In [22]:
for index, row in jeopardy.iterrows():
    split_question = row['clean_question'].split()
    for word in split_question:
        if len(word) < 6:
            split_question.remove(word)
        match_count = 0
        if word in terms_used:
            match_count += 1
        terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)
    

In [24]:
jeopardy['question_overlap'] = pd.Series(question_overlap)

In [26]:
jeopardy.tail()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,answer_in_question,question_overlap
19994,3582,2000-03-14,Jeopardy!,U.S. GEOGRAPHY,200,"Of 8, 12 or 18, the number of U.S. states that...",18,of 8 12 or 18 the number of u s states that to...,18,1.0,0.125
19995,3582,2000-03-14,Jeopardy!,POP MUSIC PAIRINGS,200,...& the New Power Generation,Prince,the new power generation,prince,0.0,0.5
19996,3582,2000-03-14,Jeopardy!,HISTORIC PEOPLE,200,In 1589 he was appointed professor of mathemat...,Galileo,in 1589 he was appointed professor of mathemat...,galileo,0.0,0.125
19997,3582,2000-03-14,Jeopardy!,1998 QUOTATIONS,200,"Before the grand jury she said, ""I'm really so...",Monica Lewinsky,before the grand jury she said i m really sorr...,monica lewinsky,0.0,0.0
19998,3582,2000-03-14,Jeopardy!,LLAMA-RAMA,200,Llamas are the heftiest South American members...,Camels,llamas are the heftiest south american members...,camels,0.0,0.125


In [27]:
jeopardy['question_overlap'].mean()

0.12288923515473935

Aproximadamente 10% das questões são recicladas(?)

In [29]:
def high_value(value_col):
    if value_col > 800:
        value = 1
    else:
        value = 0
    return value

jeopardy['high_value'] = jeopardy['Value'].apply(high_value)
jeopardy.tail(30)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,answer_in_question,question_overlap,high_value
19969,5694,2009-05-14,Double Jeopardy!,AMERICAN HISTORY,1200,In 1960 the last of these locomotives was reti...,steam engines,in 1960 the last of these locomotives was reti...,steam engines,0.0,0.1,1
19970,5694,2009-05-14,Double Jeopardy!,"MIND YOUR SHAKESPEARE ""P""s & ""Q""s",1200,"Kate: ""if I be waspish, best beware my sting"";...",Petruchio,kate if i be waspish best beware my sting his ...,petruchio,0.0,0.1,1
19971,5694,2009-05-14,Double Jeopardy!,ALMA MATERS,0,This private college in Northern California bo...,Stanford University,this private college in northern california bo...,stanford university,0.0,0.1,0
19972,5694,2009-05-14,Double Jeopardy!,ACTRESSES,1200,"She voiced Princess Pea in ""The Tale of Desper...",Emma Watson,she voiced princess pea in the tale of despere...,emma watson,0.0,0.090909,1
19973,5694,2009-05-14,Double Jeopardy!,2-LETTER WORDS,1200,It's the name of the long-awaited new White Ho...,Bo,it s the name of the long awaited new white ho...,bo,0.0,0.166667,1
19974,5694,2009-05-14,Double Jeopardy!,ANGELS & DEMONS,1200,"Langdon in ""Angels & Demons"" is looking for <a...",an antimatter bomb,langdon in angels demons is looking for a href...,an antimatter bomb,0.333333,0.055556,1
19975,5694,2009-05-14,Double Jeopardy!,AMERICAN HISTORY,1600,In the 1600s most of New York State was occupi...,the Iroquois,in the 1600s most of new york state was occupi...,the iroquois,0.0,0.111111,1
19976,5694,2009-05-14,Double Jeopardy!,"MIND YOUR SHAKESPEARE ""P""s & ""Q""s",1600,Marina's dad (need a hint? he rules Tyre),Pericles,marina s dad need a hint he rules tyre,pericles,0.0,0.2,1
19977,5694,2009-05-14,Double Jeopardy!,ALMA MATERS,1600,Presidential kids are welcome at this New Orle...,Tulane,presidential kids are welcome at this new orle...,tulane,0.0,0.090909,1
19978,5694,2009-05-14,Double Jeopardy!,ACTRESSES,1600,She didn't vamp it up & did a bella job as Em ...,Kristen Stewart,she didn t vamp it up did a bella job as em in...,kristen stewart,0.0,0.142857,1


In [30]:
def count_terms(word):
    low_count = 0
    high_count = 0
    for index, row in jeopardy.iterrows():
        split_question = row['clean_question'].split()
        if word in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return low_count, high_count


In [46]:
from random import sample

comparison_terms = sample(terms_used, 10)
observed_expected = []

In [47]:
for word in comparison_terms:
    count = count_terms(word)
    observed_expected.append(list(count))

In [48]:
observed_expected

[[7, 3],
 [3, 2],
 [1, 0],
 [3, 1],
 [1, 0],
 [1, 0],
 [32, 14],
 [1, 0],
 [2, 1],
 [1, 0]]

In [49]:
high_value_count = jeopardy['high_value'].value_counts()[1]
low_value_count = jeopardy['high_value'].value_counts()[0]
chi_squared = []

In [57]:
from scipy.stats import chisquare
import numpy as np

for i in observed_expected:
    total = sum(i)
    total_prop = total/jeopardy.shape[0]
    exp_high = total_prop * high_value_count
    exp_low = total_prop * low_value_count
    
    obs = np.array([i[0], i[1]])
    exp = np.array([exp_low, exp_high])
    chi_squared.append(chisquare(obs,exp))

chi_squared

[Power_divergenceResult(statistic=0.14136090167759724, pvalue=0.7069318227511996),
 Power_divergenceResult(statistic=0.6134279937303524, pvalue=0.4335000217020267),
 Power_divergenceResult(statistic=0.3308710986890265, pvalue=0.565146603267378),
 Power_divergenceResult(statistic=4.122707846712507e-05, pvalue=0.9948769527982859),
 Power_divergenceResult(statistic=0.3308710986890265, pvalue=0.565146603267378),
 Power_divergenceResult(statistic=0.3308710986890265, pvalue=0.565146603267378),
 Power_divergenceResult(statistic=0.7649501956443188, pvalue=0.38178379753448455),
 Power_divergenceResult(statistic=0.3308710986890265, pvalue=0.565146603267378),
 Power_divergenceResult(statistic=0.11526980495624546, pvalue=0.7342224981885828),
 Power_divergenceResult(statistic=0.3308710986890265, pvalue=0.565146603267378)]

Nenhum resultado estatísticamente relevante. Todos p_values muito superiores à 0.05. Isso significa que, dentre os termos selecionados, não há discrepancia anormal entre a aparição em perguntas de alto ou baixo valor.