In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [126]:
from __future__ import division

In [52]:
jeopardy = pd.read_csv("JEOPARDY_CSV.csv")

In [53]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [54]:
columns = list(jeopardy.columns)

In [55]:
columns = list(map(lambda x: x.strip(),columns))

In [56]:
columns

['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

In [57]:
jeopardy.columns = columns

In [58]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [59]:
jeopardy.columns

Index([u'Show Number', u'Air Date', u'Round', u'Category', u'Value',
       u'Question', u'Answer'],
      dtype='object')

In [60]:
# normalizing the question and answer columns

def normalize(str):
    str = str.lower()
    for c in string.punctuation:
        str= str.replace(c,"")
    return str

In [61]:
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize)

In [64]:
jeopardy.loc[:5,"clean_question"]

0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
5    in the title of an aesop fable this insect sha...
Name: clean_question, dtype: object

In [65]:
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize)

In [66]:
jeopardy.loc[:5,"clean_answer"]

0    copernicus
1    jim thorpe
2       arizona
3     mcdonalds
4    john adams
5       the ant
Name: clean_answer, dtype: object

In [67]:
jeopardy.dtypes

Show Number        int64
Air Date          object
Round             object
Category          object
Value             object
Question          object
Answer            object
clean_question    object
clean_answer      object
dtype: object

In [68]:
# making the value column numeric

def norm_dollar(str):
    for c in string.punctuation:
        str = str.replace(c,"")
    try:
        val = int(str)
    except:
        val = 0
    return val


In [70]:
jeopardy["clean_value"] = jeopardy["Value"].apply(norm_dollar)

In [71]:
# converting the Air Date column to datetime format using the pd.to_datatime function

In [72]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

Questions we are trying to answer - 

How often the answer is deducible from the question.

How often new questions are repeats of older questions.

In [73]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200


### How often is the answer deducible from the question?

#### can answer this by checking how many times the words in the answer occur in the question 

In [81]:
st = "i am a poor woman"
words = st.split(" ")
if "i" in words:
    words.remove("i")
words

['am', 'a', 'poor', 'woman']

In [87]:
def match_question_answer(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return (match_count/len(split_answer))

In [88]:
jeopardy["answer_in_question"] = jeopardy.apply(match_question_answer,axis=1)

In [91]:
jeopardy["answer_in_question"].mean()

0.00643064583045222

In [92]:
len(jeopardy[jeopardy["answer_in_question"] != 0])

1395

In [93]:
jeopardy.shape[0]

216930

In [98]:
strategy_1 = jeopardy[jeopardy["answer_in_question"] != 0]

In [104]:
strategy_1["Category"].value_counts()

STUPID ANSWERS                       151
WHICH CAME FIRST?                     58
THE LARGEST IN AREA                   55
PUT 'EM IN ORDER                      47
THE NORTHERNMOST CAPITAL CITY         45
THE SMALLEST IN AREA                  39
THE MOST POPULOUS NATION              25
BORN FIRST                            25
MULTIPLE CHOICE                       21
A PROVERBIAL MESS                     17
THE NORTHERNMOST NATION               15
CHERCHEZ LA FEMME                     15
SPOT THE POOCH                        15
THE SOUTHERNMOST CAPITAL CITY         15
N.E.W.S.                              11
THE SOUTHERNMOST NATION               10
PICK THE OSCAR WINNER                 10
TV OR NOT TV                          10
NOT A STATE CAPITAL                   10
THE HIGHEST-SCORING SCRABBLE WORD     10
WHICH TV SHOW CAME FIRST?             10
NOT A NATIONAL CAPITAL                10
SPOT THE KITTY                        10
THE LARGEST U.S. STATE                10
NOT A CURRENT NA

In [105]:
# clearly there is a good chance that the answer can be deduced from the question itself

# these could be stupid answers,multiple choice questions, etc as we can see above

# This could surely influence the studying strategy.

### How often are new questions repeat of older questions

In [108]:
jeopardy.sort_values("Air Date",inplace=True)

In [127]:
question_overlap = []
# set is an unordered collection of unique elements
terms_used = set()
for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    
    for term in split_question:
        if len(term) < 6:
            split_question.remove(term)
   
    match_count = 0
    for term in split_question:
        if term in terms_used:
            match_count += 1
        terms_used.add(term)
    if len(split_question) > 0:
        match_count = match_count/len(split_question)
    question_overlap.append(match_count)
        
        

In [129]:
np.mean(question_overlap)

0.92843054254793067

In [130]:
np.sum(question_overlap)

201404.43759492261

In [131]:
quest = np.array(question_overlap)

In [140]:
# clearly the mean is very very high, which goes to say that there is a high chance that a particular question 
# is repeated from a previous question

# so we can conclude with a fair confidence that it would be a good strategy to go through previous questions

### Let's say we only want to prepare for high value questions instead of low value questions

In [142]:
def value_800(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value

In [143]:
jeopardy["high_value"] = jeopardy.apply(value_800,axis=1)

In [162]:
def count_usage(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if term in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [167]:
comparison_terms = list(terms_used)[1:5]
observed_expected = []
for term in comparison_terms:
    observed_expected.append(count_usage(term))

observed_expected

[(78, 162), (23546, 53016), (48, 117), (54, 150)]

In [169]:
observed_expected

[(78, 162), (23546, 53016), (48, 117), (54, 150)]

In [170]:
from scipy.stats import chisquare

In [171]:
high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]
chi_squared = []

for obs in observed_expected:
    total = sum(obs)
    total_proportion = total/jeopardy.shape[0]
    high_exp = total_proportion * high_value_count
    low_exp = total_proportion * low_value_count
    
    observed = np.array([obs[0],obs[1]])
    expected = np.array([high_exp,low_exp])
    chi_squared.append(chisquare(observed,expected))
    
chi_squared

[Power_divergenceResult(statistic=2.0717157309190206, pvalue=0.15005263160953666),
 Power_divergenceResult(statistic=224.56406493500208, pvalue=9.138816144314009e-51),
 Power_divergenceResult(statistic=0.049041099860859569, pvalue=0.82474031108198065),
 Power_divergenceResult(statistic=0.34161186163819346, pvalue=0.55890029922965501)]

In [None]:
# clearly the significance levels are very high (higher than the 0.05 cutoff)

# This goes to say that there is higher possibility that the observation was by chance

# so studying just for the higher value questions might not work as expected