In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import spacy
#loading the english language small model of spacy
en = spacy.load('en_core_web_sm')
STOPWORDS = en.Defaults.stop_words
import pandas as pd
import numpy as np
import re
import string
import csv
import json

import seaborn as sns
import matplotlib.pyplot as plt

## Loading and viewing the Jeopardy Question CSV data

In [2]:
jq = pd.read_csv('JEOPARDY_CSV.csv', dtype='string')
jq.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Show Number  216930 non-null  string
 1    Air Date    216930 non-null  string
 2    Round       216930 non-null  string
 3    Category    216930 non-null  string
 4    Value       216930 non-null  string
 5    Question    216930 non-null  string
 6    Answer      216928 non-null  string
dtypes: string(7)
memory usage: 11.6 MB


In [4]:
jq.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [5]:
jq.shape

(216930, 7)

## Brief prototyping for dividing each question answer pair into a list of lists

In [6]:
df_slice = jq[[' Category', ' Value', ' Question', ' Answer']]
docs = [list(row) for row in df_slice.values]
docs

[['HISTORY',
  '$200',
  "For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory",
  'Copernicus'],
 ["ESPN's TOP 10 ALL-TIME ATHLETES",
  '$200',
  'No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves',
  'Jim Thorpe'],
 ['EVERYBODY TALKS ABOUT IT...',
  '$200',
  'The city of Yuma in this state has a record average of 4,055 hours of sunshine each year',
  'Arizona'],
 ['THE COMPANY LINE',
  '$200',
  'In 1963, live on "The Art Linkletter Show", this company served its billionth burger',
  "McDonald's"],
 ['EPITAPHS & TRIBUTES',
  '$200',
  'Signer of the Dec. of Indep., framer of the Constitution of Mass., second President of the United States',
  'John Adams'],
 ['3-LETTER WORDS',
  '$200',
  'In the title of an Aesop fable, this insect shared billing with a grasshopper',
  'the ant'],
 ['HISTORY',
  '$400',
  "Built in 312 B.C. to link Rome & the South of Italy, it's still in use today

In [7]:
wtf = []
tks = []
for doc in docs:
    ldf = re.sub('[%s]' % re.escape(string.punctuation), '', str(doc))
    hldf = re.sub('/<\/?[\w\s]*>|<.+[\W]>/g', ' ', ldf)
    wtf.append(hldf)
for txt in wtf:
    token = word_tokenize(txt)
    tks.append(token)
    

## Preprocessing the dataframe

Punctuation and html tags were removed from the dataset. Some more cleaning needs to be done. Docs with direct links to either audio or 
video is not interpretable by current NLP methods, so they need to be removed. Words with repeated letters or strings of numbers 
need to be removed., such as 'zz' or '000'. Common stop words and other words that are ubiquitous in jeopardy questions also need to be removed.

In [10]:
def saver_json(filename):
    '''Takes the output of the preprocessor function and saves it as a json file with the specified filename.'''
    if tks:
        with open(filename, "w") as outfile:
            json.dump(tks, outfile)
    return 

In [32]:
def preprocess(df, column_lst, filename, save = True):
    '''Processing the Jeopardy question data by selecting only the category, value, question, and answer. 
    Regular expression is used to remove all punctuation and leftover html tags. 
    Then the corpus was tokenized using nltk word_tokenizer and saved as a json file.'''
    #clean up data more by removing repeated 2 letters and repeated numbers
    global tks
    clean = []
    tks = []
    df_slice = df[column_lst]
    docs = [list(row) for row in df_slice.values]
    for doc in docs:
        #removes the punctuation
        ldf = re.sub('[%s]' % re.escape(string.punctuation), '', str(doc).lower())
        #removes any leftover html tags
        hldf = re.sub('/<\/?[\w\s]*>|<.+[\W]>/g', ' ', ldf)
        clean.append(hldf)
    for c in clean:
        if c not in STOPWORDS:
            tokens = word_tokenize(c)
            tks.append(tokens)
    if save and filename:
        saver_json(filename)
    return tks


In [33]:
preprocess(jq, column_lst = [' Category', ' Question', ' Answer'], filename='jq.json')

[['history',
  'for',
  'the',
  'last',
  '8',
  'years',
  'of',
  'his',
  'life',
  'galileo',
  'was',
  'under',
  'house',
  'arrest',
  'for',
  'espousing',
  'this',
  'mans',
  'theory',
  'copernicus'],
 ['espns',
  'top',
  '10',
  'alltime',
  'athletes',
  'no',
  '2',
  '1912',
  'olympian',
  'football',
  'star',
  'at',
  'carlisle',
  'indian',
  'school',
  '6',
  'mlb',
  'seasons',
  'with',
  'the',
  'reds',
  'giants',
  'braves',
  'jim',
  'thorpe'],
 ['everybody',
  'talks',
  'about',
  'it',
  'the',
  'city',
  'of',
  'yuma',
  'in',
  'this',
  'state',
  'has',
  'a',
  'record',
  'average',
  'of',
  '4055',
  'hours',
  'of',
  'sunshine',
  'each',
  'year',
  'arizona'],
 ['the',
  'company',
  'line',
  'in',
  '1963',
  'live',
  'on',
  'the',
  'art',
  'linkletter',
  'show',
  'this',
  'company',
  'served',
  'its',
  'billionth',
  'burger',
  'mcdonalds'],
 ['epitaphs',
  'tributes',
  'signer',
  'of',
  'the',
  'dec',
  'of',
  'inde

In [13]:
tks

[['HISTORY',
  'For',
  'the',
  'last',
  '8',
  'years',
  'of',
  'his',
  'life',
  'Galileo',
  'was',
  'under',
  'house',
  'arrest',
  'for',
  'espousing',
  'this',
  'mans',
  'theory',
  'Copernicus'],
 ['ESPNs',
  'TOP',
  '10',
  'ALLTIME',
  'ATHLETES',
  'No',
  '2',
  '1912',
  'Olympian',
  'football',
  'star',
  'at',
  'Carlisle',
  'Indian',
  'School',
  '6',
  'MLB',
  'seasons',
  'with',
  'the',
  'Reds',
  'Giants',
  'Braves',
  'Jim',
  'Thorpe'],
 ['EVERYBODY',
  'TALKS',
  'ABOUT',
  'IT',
  'The',
  'city',
  'of',
  'Yuma',
  'in',
  'this',
  'state',
  'has',
  'a',
  'record',
  'average',
  'of',
  '4055',
  'hours',
  'of',
  'sunshine',
  'each',
  'year',
  'Arizona'],
 ['THE',
  'COMPANY',
  'LINE',
  'In',
  '1963',
  'live',
  'on',
  'The',
  'Art',
  'Linkletter',
  'Show',
  'this',
  'company',
  'served',
  'its',
  'billionth',
  'burger',
  'McDonalds'],
 ['EPITAPHS',
  'TRIBUTES',
  'Signer',
  'of',
  'the',
  'Dec',
  'of',
  'Inde

## A topic modeling function
One can specify the type of vectorizer, model, and number of topics with the function below. The topics don't appear very strong, so more data cleaning may need to be done.

In [20]:
def topic_model(filename, model, num_topics=2, v = 'tfidf'):
    '''A topic modeling function where one can open and read the document containing the word corpus. 
    Then you can specify methods to vectorize the corpus, either by count vectorization and TFIDF. 
    The topic modeling algorithm can also be specified as either LSA, NMF, or LDA.'''
    with open(filename, 'r') as pjq:   
        if v == 'tfidf':
            vect = TfidfVectorizer(stop_words='english')
        if v == 'count':
            vect = CountVectorizer(stop_words='english')
        #make doc-term matrix
        tfidf = vect.fit_transform(pjq)
        if model == 'lsa':
            algo = TruncatedSVD(n_components=num_topics)
            topic_term = algo.fit_transform(tfidf)

        if model == 'nmf':
            algo = NMF(n_components=num_topics)
            topic_term = algo.fit_transform(tfidf)

        if model == 'lda':
            #lda is currently broken
            #need to apply standard scalar and preform a train test split
            algo = LDA(n_components=num_topics)
        topic_term = algo.fit_transform(tfidf)
        topic_word = pd.DataFrame(algo.components_.round(3),
                 columns = vect.get_feature_names())
        for topic in range(topic_word.shape[0]):
            t10 = topic_word.iloc[topic]
            print(f'For topic {topic+1} the words with the highest value are:')
            print(t10.nlargest(10))
            print('\n')
        return

In [22]:
for n in range(2, 11, 1):
    topic_model(filename='jq.json', model='nmf', num_topics=n)

  return np.sqrt(res * 2)


For topic 1 the words with the highest value are:
city        8.852
new         8.630
world       8.249
state       7.863
like        6.710
man         6.682
named       6.668
country     6.464
film        6.441
american    6.328
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
feels              0.062
saved              0.059
targetblankseen    0.058
ratio              0.056
kenny              0.054
kubrick            0.053
joel               0.051
amino              0.049
khartoum           0.049
racket             0.048
Name: 1, dtype: float64




  return np.sqrt(res * 2)


For topic 1 the words with the highest value are:
city        21.308
new         20.799
world       19.864
state       18.922
like        16.168
man         16.078
named       16.054
country     15.569
film        15.496
american    15.255
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
crowe          0.044
phd            0.042
sparrow        0.042
album          0.040
grapes         0.040
kathy          0.040
operetta       0.040
winona         0.040
impeachment    0.039
included       0.039
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
bell          0.052
place         0.050
returning     0.050
calvin        0.042
definition    0.042
holly         0.042
mole          0.042
felix         0.041
madam         0.041
nicaragua     0.041
Name: 2, dtype: float64




  return np.sqrt(res * 2)


For topic 1 the words with the highest value are:
city        12.098
new         11.802
world       11.286
state       10.752
like         9.163
man          9.132
named        9.105
country      8.852
film         8.803
american     8.661
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
run          0.041
showing      0.039
xv           0.039
acropolis    0.037
entire       0.037
verdi        0.037
difficult    0.036
hamilton     0.036
properly     0.036
digital      0.035
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
harpers        0.037
railroads      0.037
rijksmuseum    0.037
bobs           0.036
waterston      0.036
ee             0.035
stevens        0.035
captured       0.034
grandpa        0.034
jus            0.034
Name: 2, dtype: float64


For topic 4 the words with the highest value are:
em           0.036
grumpy       0.035
bowler       0.034
communist    0.034
goldsmith    0.034
grants       0.034
irons        0.034

  return np.sqrt(res * 2)


For topic 1 the words with the highest value are:
city        20.380
new         19.879
world       19.018
state       18.110
like        15.468
named       15.381
man         15.378
country     14.927
film        14.791
american    14.587
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
civil           0.033
espn            0.033
playoff         0.033
trombone        0.032
baptist         0.031
dark            0.031
hodge           0.031
kennel          0.031
measurements    0.030
oped            0.030
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
intermediate    0.033
modest          0.033
oscars          0.033
ss              0.032
accepting       0.031
calhoun         0.031
clash           0.031
creator         0.031
josh            0.031
nw              0.031
Name: 2, dtype: float64


For topic 4 the words with the highest value are:
bread        0.033
forms        0.033
1610         0.032
1853         0.031
nice         0.0



For topic 1 the words with the highest value are:
city        10.896
new         10.632
world       10.160
state        9.686
like         8.260
man          8.224
named        8.217
country      7.983
film         7.940
american     7.796
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
30th           0.036
fda            0.030
swaziland      0.030
wooden         0.030
7th            0.029
like           0.029
zion           0.029
bosworth       0.028
kilimanjaro    0.028
longest        0.028
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
buckle       0.038
mercedes     0.037
prevented    0.033
joke         0.032
baseman      0.031
judd         0.031
fat          0.030
gulf         0.030
stops        0.030
extremes     0.029
Name: 2, dtype: float64


For topic 4 the words with the highest value are:
connected        0.034
kong             0.033
fruitcake        0.031
physics          0.031
seine            0.031
policemen        

  return np.sqrt(res * 2)


For topic 1 the words with the highest value are:
city        6.869
new         6.701
world       6.409
state       6.101
like        5.211
man         5.186
named       5.180
country     5.032
film        5.004
american    4.922
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
junction      0.031
ahead         0.030
tendency      0.030
major         0.029
shoe          0.028
frying        0.026
topper        0.026
aisle         0.025
attempts      0.025
melancholy    0.025
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
seuss        0.031
verdi        0.031
19th         0.030
french       0.030
mythology    0.029
blossom      0.027
dem          0.027
mork         0.027
pepper       0.027
cut          0.026
Name: 2, dtype: float64


For topic 4 the words with the highest value are:
tragically    0.032
solution      0.030
turkish       0.030
caan          0.029
small         0.028
bought        0.027
indicated     0.027
sydney      

  return np.sqrt(res * 2)


For topic 1 the words with the highest value are:
city        5.837
new         5.692
world       5.439
state       5.184
like        4.428
man         4.406
named       4.399
country     4.271
film        4.245
american    4.178
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
1861          0.028
te            0.027
depth         0.025
exercises     0.025
groove        0.025
nanny         0.025
dramatized    0.024
easily        0.024
estate        0.024
walden        0.024
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
innovation    0.028
costello      0.027
migrations    0.027
colombian     0.026
discharge     0.026
emblem        0.026
shows         0.026
exercises     0.025
piano         0.025
1885          0.024
Name: 2, dtype: float64


For topic 4 the words with the highest value are:
ogden          0.032
meant          0.029
highland       0.027
treaty         0.027
beasts         0.025
brazils        0.025
cesare         0

  return np.sqrt(res * 2)


For topic 1 the words with the highest value are:
city        9.306
new         9.078
world       8.679
state       8.262
like        7.062
man         7.022
named       7.020
country     6.823
film        6.777
american    6.669
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
fireworks     0.025
oblong        0.025
sultry        0.025
antonyms      0.024
containing    0.024
facilities    0.024
eden          0.023
federal       0.023
gothic        0.023
nectar        0.023
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
rhythms       0.028
alcatraz      0.027
kidman        0.027
redford       0.027
fabled        0.026
hertz         0.026
larger        0.026
manifesto     0.026
kalahari      0.025
passengers    0.025
Name: 2, dtype: float64


For topic 4 the words with the highest value are:
clive         0.027
literature    0.026
destroyer     0.025
conductors    0.024
ederle        0.024
golfing       0.024
iceberg       0.024
be



For topic 1 the words with the highest value are:
city        19.760
new         19.256
world       18.418
state       17.558
like        14.977
named       14.905
man         14.894
country     14.472
film        14.382
american    14.143
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
indianapolis    0.027
consecutive     0.026
processing      0.024
crickets        0.023
rye             0.023
tubes           0.023
arkansas        0.022
bismarck        0.022
funnies         0.022
wynton          0.022
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
appeared         0.026
lech             0.025
suspicion        0.025
alda             0.024
antarcticaa      0.024
education        0.024
pins             0.024
speeches         0.024
congresswoman    0.023
duckling         0.022
Name: 2, dtype: float64


For topic 4 the words with the highest value are:
capone       0.027
wheels       0.027
lisa         0.026
greetings    0.025
sting 

In [23]:
for n in range(2, 11, 1):
    topic_model(filename='jq.json', model='lsa', num_topics=n)

  self.explained_variance_ratio_ = exp_var / full_var


For topic 1 the words with the highest value are:
city        0.170
new         0.166
world       0.159
state       0.151
like        0.129
man         0.129
named       0.129
country     0.125
film        0.124
american    0.122
Name: 0, dtype: float64


For topic 1 the words with the highest value are:
city        0.170
new         0.166
world       0.159
state       0.151
like        0.129
man         0.129
named       0.129
country     0.125
film        0.124
american    0.122
Name: 0, dtype: float64


For topic 1 the words with the highest value are:
city        0.170
new         0.166
world       0.159
state       0.151
like        0.129
man         0.129
named       0.129
country     0.125
film        0.124
american    0.122
Name: 0, dtype: float64


For topic 1 the words with the highest value are:
city        0.170
new         0.166
world       0.159
state       0.151
like        0.129
man         0.129
named       0.129
country     0.125
film        0.124
american    0.122
Na

In [24]:
for n in range(2, 11, 1):
    topic_model(filename='jq.json', model='lda', num_topics=n)

TypeError: fit() missing 1 required positional argument: 'y'