In [74]:
# Jeopardy Questions
# work with a dataset of Jeopardy questions to figure out some patterns in the questions that could help you win.
import pandas
jeopardy = pandas.read_csv("jeopardy.csv", encoding = "ISO-8859-1")
print(jeopardy.head())

   Show Number    Air Date      Round                         Category  Value  \
0         4680  12/31/2004  Jeopardy!                          HISTORY  $200    
1         4680  12/31/2004  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES  $200    
2         4680  12/31/2004  Jeopardy!      EVERYBODY TALKS ABOUT IT...  $200    
3         4680  12/31/2004  Jeopardy!                 THE COMPANY LINE  $200    
4         4680  12/31/2004  Jeopardy!              EPITAPHS & TRIBUTES  $200    

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  


In [75]:
# Print out the columns of jeopardy using jeopardy.columns.
print(jeopardy.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [76]:
# Remove the spaces in each item in jeopardy.columns
# Assign the result back to jeopardy.columns to fix the column names in jeopardy
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value','Question', 'Answer']

In [77]:
# Normalizing Text
# Write a function to normalize questions and answers
# Convert the string to lowercase and remove all punctuation in the string.
import re

def normalized_text(string):
    string = string.lower()
    # ^ matches the start of the string
    # \s matches Unicode whitespace characters (which includes [ \t\n\r\f\v], and also many other characters
    string = re.sub("[^A-Za-z0-9\s]","", string)
    return string

In [78]:
# Normalize the Question column
jeopardy["clean_question"] = jeopardy["Question"].apply(normalized_text)

# Normalize the Answer column
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalized_text)

In [79]:
# Normalizing Columns
# Write a function to normalize dollar values.
# Remove any punctuation in the string, convert the string to an integer.
# If the conversion has an error, assign 0 instead
def normalized_value(string):
    string = re.sub("[^A-Za-z0-9\s]","", string)
    try:
        string = int(string)
    except Exception:
        string = 0
    return string

In [80]:
# Normalize the Value column.
jeopardy["clean_value"] = jeopardy["Value"].apply(normalized_value)

In [81]:
# Use the pandas.to_datetime function to convert the Air Date column to a datetime column.
jeopardy["Air Date"] = pandas.to_datetime(jeopardy["Air Date"])

In [82]:
jeopardy

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant,in the title of an aesop fable this insect sha...,the ant,200
6,4680,2004-12-31,Jeopardy!,HISTORY,$400,Built in 312 B.C. to link Rome & the South of ...,the Appian Way,built in 312 bc to link rome the south of ita...,the appian way,400
7,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$400,"No. 8: 30 steals for the Birmingham Barons; 2,...",Michael Jordan,no 8 30 steals for the birmingham barons 2306 ...,michael jordan,400
8,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$400,"In the winter of 1971-72, a record 1,122 inche...",Washington,in the winter of 197172 a record 1122 inches o...,washington,400
9,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$400,This housewares store was named for the packag...,Crate & Barrel,this housewares store was named for the packag...,crate barrel,400


In [83]:
# check the datatype for each column
jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

In [84]:
# How often the answer is deducible from the question.
# We can answer the question by seeing how many times words in the answer also occur in the question. 

# Write a function that takes in a row in jeopardy, as a Series. It should:
def answer_deductible(row):
    # Split the clean_answer column on the space character (), and assign to the variable split_answer.
    split_answer = row["clean_answer"].split(" ")
    # Split the clean_question column on the space character (), and assign to the variable split_question.
    split_question = row["clean_question"].split(" ")
    # If "the" is in split_answer, remove it using the remove method on lists. "The" is commonly found in answers and questions, but doesn't have any meaningful use in finding the answer.
    if "the" in split_answer:
        split_answer.remove("the")
    # If the length of split_answer is 0, return 0. This prevents a division by zero error later.
    if len(split_answer) == 0:
        return 0
    # Create a variable called match_count, and set it to 0.
    match_count = 0
    # Loop through each item in split_answer, and see if it occurs in split_question. If it does, add 1 to match_count.
    for item in split_answer:
        if item in split_question:
            match_count += 1
    # Divide match_count by the length of split_answer, and return the result.
    result = match_count / len(split_answer)
    return result

In [85]:
# Count how many times terms in clean_answer occur in clean_question
jeopardy["answer_in_question"] = jeopardy.apply(answer_deductible, axis=1)
print(jeopardy.head())

   Show Number   Air Date      Round                         Category  Value  \
0         4680 2004-12-31  Jeopardy!                          HISTORY  $200    
1         4680 2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES  $200    
2         4680 2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...  $200    
3         4680 2004-12-31  Jeopardy!                 THE COMPANY LINE  $200    
4         4680 2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES  $200    

                                            Question      Answer  \
0  For the last 8 years of his life, Galileo was ...  Copernicus   
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe   
2  The city of Yuma in this state has a record av...     Arizona   
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's   
4  Signer of the Dec. of Indep., framer of the Co...  John Adams   

                                      clean_question clean_answer  \
0  for the last 8 years of his life galil

In [86]:
# Find the mean of the answer_in_question column using the mean method on Series.
mean_answer_inquestion = jeopardy["answer_in_question"].mean()
print(mean_answer_inquestion)

0.0593714165298


Answer terms in the question

The answer only appears in the question about 6% of the time. This isn't a huge number, and means that we probably can't just hope that hearing a question will enable us to figure out the answer. We'll probably have to study.

In [87]:
# Recycled Questions
# investigate how often new questions are repeats of older ones.

# Sort jeopardy in order of ascending air date.
jeopardy.sort_values("Air Date", ascending = True)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question
84523,1,1984-09-10,Jeopardy!,LAKES & RIVERS,$100,River mentioned most often in the Bible,the Jordan,river mentioned most often in the bible,the jordan,100,0.000000
84565,1,1984-09-10,Double Jeopardy!,THE BIBLE,"$1,000","According to 1st Timothy, it is the ""root of a...",the love of money,according to 1st timothy it is the root of all...,the love of money,1000,0.333333
84566,1,1984-09-10,Double Jeopardy!,'50'S TV,"$1,000",Name under which experimenter Don Herbert taug...,Mr. Wizard,name under which experimenter don herbert taug...,mr wizard,1000,0.000000
84567,1,1984-09-10,Double Jeopardy!,NATIONAL LANDMARKS,"$1,000",D.C. building shaken by November '83 bomb blast,the Capitol,dc building shaken by november 83 bomb blast,the capitol,1000,0.000000
84568,1,1984-09-10,Double Jeopardy!,NOTORIOUS,"$1,000","After the deed, he leaped to the stage shoutin...",John Wilkes Booth,after the deed he leaped to the stage shouting...,john wilkes booth,1000,0.000000
84569,1,1984-09-10,Double Jeopardy!,4-LETTER WORDS,"$1,000",The president takes one before stepping into o...,oath,the president takes one before stepping into o...,oath,1000,0.000000
84570,1,1984-09-10,Final Jeopardy!,HOLIDAYS,,The third Monday of January starting in 1986,Martin Luther King Day,the third monday of january starting in 1986,martin luther king day,0,0.000000
84538,1,1984-09-10,Jeopardy!,LAKES & RIVERS,$400,American river only 33 miles shorter than the ...,the Missouri,american river only 33 miles shorter than the ...,the missouri,400,0.000000
84537,1,1984-09-10,Jeopardy!,ACTORS & ROLES,$300,"He may ""Never Say Never Again"" when asked to b...",Sean Connery,he may never say never again when asked to be ...,sean connery,300,0.000000
84536,1,1984-09-10,Jeopardy!,FOREIGN CUISINE,$300,Jewish crepe filled with cheese,a blintz,jewish crepe filled with cheese,a blintz,300,0.000000


Below step enable you to check if the terms in questions have been used previously or not. 
Only looking at words greater than 6 characters enables you to filter out words like the and than, 
which are commonly used, but don't tell you a lot about a question.

In [88]:
# Maintain a set called terms_used that will be empty initially
terms_used = set()
# Create an empty list called question_overlap
question_overlap = []

# Iterate through each row of jeopardy.
# DataFrame.iterrows() -> Iterate over DataFrame rows as (index, Series) pairs.
for i, row in jeopardy.iterrows():
    # Split clean_question into words, remove any word shorter than 6 characters, and check if each word occurs in terms_used.
    split_question = row["clean_question"].split(" ")
    split_question = [i for i in split_question if len(i) > 5]
    # Set counter to 0
    morethan6charc_count = 0
    for word in split_question:
        # If it does, increment a counter.
        if word in terms_used:
            morethan6charc_count += 1
    # add each word to terms_used. (set.add() can add element to set data type)
    for word in split_question:
        terms_used.add(word)
    # If the length of split_question is greater than 0, divide match_count by the length of split_question.
    if len(split_question) > 0:
        morethan6charc_count = morethan6charc_count / len(split_question)
    # Append match_count to question_overlap.
    question_overlap.append(morethan6charc_count)

In [89]:
# Assign question_overlap to the question_overlap column of jeopardy
jeopardy["question_overlap"] = question_overlap

# Find the mean of the question_overlap column and print it.
mean_questionoverlap = jeopardy["question_overlap"].mean()
print(mean_questionoverlap)

0.873511019215


Question overlap

There is about 87% overlap between terms in new questions and terms in old questions. This only looks at a small set of questions, and it doesn't look at phrases, it looks at single terms. This makes it relatively insignificant, but it does mean that it's worth looking more into the recycling of questions.

In [90]:
# Low Value Vs High Value Questions
# you only want to study questions that pertain to high value questions instead of low value questions.
# We can figure out which terms correspond to high-value questions using a chi-squared test
# First, narrow down the questions into two categories
# Then loop through each of the terms from the last screen, terms_used
"""
1. Find the number of low value questions the word occurs in.
2. Find the number of high value questions the word occurs in.
3. Find the percentage of questions the word occurs in.
4. Based on the percentage of questions the word occurs in, find expected counts.
5. Compute the chi squared value based on the expected counts and the observed counts for high and low value questions.
"""

# Create a function that takes in a row from a Dataframe
def value_group(row):
    # If the clean_value column is greater than 800, assign 1 to value.
    if row["clean_value"] > 800:
        value = 1
    # Otherwise, assign 0 to value.
    else:
        value = 0
    # Return value.
    return value

jeopardy["high_value"] = jeopardy.apply(value_group, axis=1)

In [91]:
# Create a function that takes in a word
def group_word_count(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        # If the word is in the split question
        if word in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [92]:
observed_expected = []

# Convert terms_used into a list using the list function, and assign the first 5 elements to comparison_terms.
comparison_terms = list(terms_used)[:5]

In [93]:
# Loop through each term in comparison_terms, and:
# Run the function on the term to get the high value and low value counts.
# Append the result of running the function (which will be a list) to observed_expected.
for term in comparison_terms:
    observed_expected.append(group_word_count(term))
    
observed_expected

[(0, 1), (0, 1), (0, 1), (0, 1), (33, 83)]

In [94]:
# Applying The Chi-Squared Test

# Find the number of rows in jeopardy where high_value is 1
high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]

# Find the number of rows in jeopardy where high_value is 0
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

In [96]:
import numpy as np
from scipy.stats import chisquare

chi_squared = []

# Loop through each list in observed_expected
for lists in observed_expected:
    # Add up both items in the list (high and low counts) to get the total count, and assign to total.
    total = sum(lists)
    # Divide total by the number of rows in jeopardy to get the proportion across the dataset. Assign to total_prop.
    total_prop = total / jeopardy.shape[0]
    # Multiply total_prop by high_value_count to get the expected term count for high value rows.
    total_high_prop = total_prop * high_value_count
    # Multiply total_prop by low_value_count to get the expected term count for low value rows.
    total_low_prop = total_prop * low_value_count
    
    # Use the scipy.stats.chisquare function to compute the chi-squared value and p-value given the expected and observed counts.
    observed = np.array([lists[0], lists[1]])
    expected = np.array([total_high_prop, total_low_prop])
    chi_squared.append(chisquare(observed, expected))
    # Append the results to chi_squared.

chi_squared

[Power_divergenceResult(statistic=0.39497646423335131, pvalue=0.52969509124866954),
 Power_divergenceResult(statistic=0.39497646423335131, pvalue=0.52969509124866954),
 Power_divergenceResult(statistic=0.39497646423335131, pvalue=0.52969509124866954),
 Power_divergenceResult(statistic=0.39497646423335131, pvalue=0.52969509124866954),
 Power_divergenceResult(statistic=0.0010273158506472991, pvalue=0.97443076695468012)]

Chi-squared results

None of the terms had a significant difference in usage between high value and low value rows (pvalue all greater than 0.05). Additionally, the frequencies were lower than 5 for 4 of the terms, so the chi-squared test isn't as valid. It would be better to run this test with only terms that have higher frequencies.