In [71]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pandas as pd
import os
import re
from string import punctuation
from datetime import datetime

english_stopwords = set(stopwords.words('english') + list(punctuation) +
                        ['..','...','....','``',"''",'//n'])

First I import all of the necessary libraries to train a naive bayes classifier, and I defined english_stopwords to be the english stopwords from the nltk.corpus library as well as puncuation and a few extra characters.

In [26]:
df = pd.read_json('sample_data/jeopardy.json')

Then, I read in the JSON file to a pandas dataframe.

In [27]:
df.head()

Unnamed: 0,category,air_date,question,value,answer,round,show_number
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",$200,Copernicus,Jeopardy!,4680
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jim Thorpe,Jeopardy!,4680
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,$200,Arizona,Jeopardy!,4680
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,McDonald\'s,Jeopardy!,4680
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",$200,John Adams,Jeopardy!,4680


In [41]:
def high_low(value):
  if value == None:
    return 1
  value = int(value[1:].replace(',',''))
  if value <= 1000:
    return 0
  else:
    return 1

I defined a function to classify each question worth 1000 dollars or less as "low value" or "0", and questions worth greater than 1000 dollars as "high value" or "1". Final Jeopary questions (with value None) are defined as "high value" as well.

In [43]:
df['high_low'] = df['value'].apply(high_low)

Then, I defined a new column called "high_low" and applied the function to it.

In [45]:
df.tail()

Unnamed: 0,category,air_date,question,value,answer,round,show_number,high_low
216925,RIDDLE ME THIS,2006-05-11,'This Puccini opera turns on the solution to 3...,$2000,Turandot,Double Jeopardy!,4999,1
216926,"""T"" BIRDS",2006-05-11,'In North America this term is properly applie...,$2000,a titmouse,Double Jeopardy!,4999,1
216927,AUTHORS IN THEIR YOUTH,2006-05-11,"'In Penny Lane, where this ""Hellraiser"" grew u...",$2000,Clive Barker,Double Jeopardy!,4999,1
216928,QUOTATIONS,2006-05-11,"'From Ft. Sill, Okla. he made the plea, Arizon...",$2000,Geronimo,Double Jeopardy!,4999,1
216929,HISTORIC NAMES,2006-05-11,'A silent movie title includes the last name o...,,Grigori Alexandrovich Potemkin,Final Jeopardy!,4999,1


In [65]:
lemmatizer = WordNetLemmatizer()

def lem_text(text):
  t_list = word_tokenize(text)
  lem_list = [lemmatizer.lemmatize(word) for word in t_list if word not in english_stopwords]
  final_list = ' '.join(lem_list)
  return final_list

Next, I defined a function to tokenize the text from the question column. Then, I lemmatize the words in the word list created from tokenizing. Finally, I join the list of lemmatized words and return this list.

In [63]:
df['lemmatized_Qs'] = df['question'].apply(lem_text)

I defined a new column called "lemmatized_Qs" and applied my lemmatizor function to the question column to return a new column with lemmatized words instead of the questions.

In [66]:
df.head()

Unnamed: 0,category,air_date,question,value,answer,round,show_number,high_low,lemmatized_Qs
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",$200,Copernicus,Jeopardy!,4680,0,'For last 8 year life Galileo house arrest esp...
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jim Thorpe,Jeopardy!,4680,0,'No 2 1912 Olympian football star Carlisle Ind...
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,$200,Arizona,Jeopardy!,4680,0,"'The city Yuma state record average 4,055 hour..."
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,McDonald\'s,Jeopardy!,4680,0,'In 1963 live The Art Linkletter Show company ...
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",$200,John Adams,Jeopardy!,4680,0,'Signer Dec. Indep. framer Constitution Mass. ...


In [67]:
X_train, X_test, Y_train, Y_test = train_test_split(df.lemmatized_Qs, df.high_low, random_state=1)

In [68]:
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tf = tfidf_vectorizer.fit_transform(X_train)
X_test_tf = tfidf_vectorizer.transform(X_test)

In [69]:
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tf, Y_train)
predictions = naive_bayes.predict(X_test_tf)

In [70]:
print('Accuracy: ', accuracy_score(Y_test, predictions))

Accuracy:  0.7985912636217801


To finish creating the classifier, I passed the lemmatized_Qs column as the X_train and X_test variables, and I passed the high_low binary column as the Y_train and Y_test variables. Then, I used TfidfVectorizer to fit the vectorizer to the training data and transform the test data. Finally, I fit a Multinomial Naive Bayes classifier to the training data and predicted an accuracy score of about 0.798.