In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag

import pandas as pd
import numpy as np
import re
import string
import csv
import json

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
jq = pd.read_csv('JEOPARDY_CSV.csv', dtype='string')
jq.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jq['Q&A'] = jq[' Question'].str.cat(jq[' Answer'], sep=" ")

In [4]:
def cleaner(docs):
    '''Cleans the Jeopardy question data. It takes in a list form of a slice of the corpus's 
    dataframe. Regular expression is then used to remove all hyperlinks, punctuation, 
    and leftover html tags.'''
    #clean up data more by removing repeated 2 letters and repeated numbers
    global clean_txts
    clean_txts = []
    tks = []
    for doc in docs:
        #removes the a tags and the hyperlinks that are inbetween them 
        no_links = re.sub('<a[^>]*>(.*?)(</a>|)', '', str(doc).lower(), flags=re.MULTILINE)
        #removes the punctuation
        ldf = re.sub('[%s]' % re.escape(string.punctuation), '', no_links)
        #removes any leftover html tags
        hldf = re.sub('/<\/?[\w\s]*>|<.+[\W]>/g', ' ', ldf)
        clean_txts.append(hldf)
    return clean_txts   


In [5]:
def stemmer(tokens):
    '''Uses NLTK's PorterStemmer to stem the tokens made by the tokenizer function '''
    global stemz
    stemz = []
    ps = PorterStemmer()
    for tk in tokens:
        stemz.append(ps.stem(str(tk)))
    return stemz

In [6]:
def lemmatizer(tokens):
    '''Uses NLTK's WordNetLemmatizer to lemmatize the tokens made by the tokenizer function '''
    global lemmaz
    lemmaz = []
    wn_lm = WordNetLemmatizer()
    for tk in tokens:
        lemmaz.append(wn_lm.lemmatize(str(tk)))
    return lemmaz

In [7]:
def tokenizer(clean_txt):
    global tks
    '''Tokenizes the cleaned text corpus via NLTK's word tokenizer'''
    tks =[]
    for txt in clean_txts:
        tks.append(word_tokenize(txt))
    return tks

In [8]:
def stop_dese_wrds(tks, stopwords, add_stopwords = None):
    '''This function removes stop_words from the pre-tokenized corpus.
    It uses NLTK's stop_wods as a default, but you are given the option
    to add more stop words if needed via the add_stopwords variable. '''
    if add_stopwords != None:
        stopwords.extend(add_stopwords)
    for sent in tks:
        for word in sent:
            if word in stopwords:
                sent.remove(word)
    return tks

In [9]:
def saver_json(filename, data):
    '''Takes the output of the preprocessor function and saves it as a json file with the specified filename.'''
    if filename:
        with open(filename, "w") as outfile:
            json.dump(data, outfile)
    return

In [10]:
def processor(docs, filename, stopwords, add_stopwords = None, stem = False, lemma = False, save = True):
    '''Combines all the work of the smaller functions to create a pipeline of cleaning,
    tokenizing, removing stop words, and saving the newly processed corpus as a json.
    One is given the option to either stem or lemmatize the data, but the default is to
    only tokenize and remove stop words.'''
    cleaner(docs)
    tokenizer(clean_txts)
    stop_dese_wrds(tks, stopwords, add_stopwords=add_stopwords)
    if stem:
        stemmer(tks)
        data = stemz
    if lemma:
        lemmatizer(tks)
        data = lemmaz
    else:
        data = tks
    if filename and save:
        saver_json(filename, data)
    return

In [11]:
docs = jq['Q&A'].to_list()
f_og = 'jq.json'
f_lemma = 'jq_lemma.json'
f_stem = 'jq_stem.json'
stop_words = stopwords.words('english')
add = ['like', 'im', 'seen', 'named', 'called', 'title', 'home', 'little', 'comes', 'type', 'said', 'used', 'known', 'dont', 'hes', 'youre', 
      'alright', 'join','big', 'means', 'make', 'comes', 'look', 'use', 'come', 'small', 'clue', 'play', 'played', 'long', 'crew', 'wrote', 
       'dose', 'word', 'cleaned', 'know', 'felt', 'arent', 'cant', 'couldnt', 'didnt', 'doesnt',
      'dont', 'hadnt', 'hasnt', 'havent', 'hed', 'hes', 'heres', 'id', 'ive', 'isnt', 'lets', 'musnt',
      'shant', 'shouldnt', 'shes', 'theres', 'theyd', 'theyll', 'theyre', 'theyve', 'were', 'weve', 'werent'
      'whats', 'whens', 'wheres', 'whos', 'whys', 'wont', 'youd', 'youll', 'youre', 'youve']

In [12]:
stop_words.extend(add)

In [13]:
processor(docs, f_og, stop_words)

In [14]:
processor(docs, f_lemma, stop_words, lemma=True)

In [15]:
processor(docs, f_stem, stop_words, stem=True)

In [16]:
def topic_model(filename, model, num_topics=2, v = 'tfidf'):
    '''A topic modeling function where one can open and read the document containing the word corpus. 
    Then you can specify methods to vectorize the corpus, either by count vectorization and TFIDF. 
    The topic modeling algorithm can also be specified as either LSA, NMF, or LDA.'''
    with open(filename, 'r') as pjq:   
        if v == 'tfidf':
            doc_word = TfidfVectorizer(stop_words='english', max_df=1, min_df=0.15)
        if v == 'count':
            doc_word = CountVectorizer(stop_words='english')
        #make doc-term matrix
        vectors = doc_word.fit_transform(pjq)
        if model == 'lsa':
            algo = TruncatedSVD(n_components=num_topics)

        if model == 'nmf':
            algo = NMF(n_components=num_topics, init='random', random_state=4)

        #if model == 'lda':
            #lda is currently broken
            #need to apply standard scalar and preform a train test split
            #algo = LDA(n_components=num_topics)
        
        doc_topic = algo.fit_transform(vectors)
        topic_word = pd.DataFrame(algo.components_.round(3),
                 columns = doc_word.get_feature_names())
        #print(doc_word)
        #print(doc_topic)
        #print(topic_word)
        for topic in range(topic_word.shape[0]):
            t10 = topic_word.iloc[topic]
            print(f'For topic {topic+1} the words with the highest value are:')
            print(t10.nlargest(20))
            print('\n')
        return

In [17]:
def pos_filter(filename):
    with open(filename) as corpus:
        loaded_c = json.load(corpus)
        for doc in loaded_c:
            pos_corpus = pos_tag(doc)
        pos_filtered = [c for c in pos_corpus if c[1] == 'NN' or c[1] == 'JJ' ]
        return pos_filtered[:10]

In [28]:
topic_model(f_lemma, model = 'nmf')

For topic 1 the words with the highest value are:
city         8.882
new          7.751
man          6.800
state        6.271
country      6.207
film         5.990
john         5.606
crew         4.866
king         4.308
years        4.125
war          4.091
american     4.056
world        3.947
president    3.881
herea        3.772
novel        3.752
term         3.672
island       3.604
capital      3.509
french       3.386
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
cohan         0.069
cather        0.063
lower         0.061
brunswick     0.060
isotope       0.059
swede         0.058
akc           0.056
dwarfs        0.056
koran         0.056
shift         0.056
challenger    0.055
sammy         0.055
voltaire      0.055
continuous    0.054
dressing      0.054
envelope      0.054
wheels        0.054
ferrer        0.053
monterey      0.053
phosphorus    0.053
Name: 1, dtype: float64




In [18]:
topic_model(f_lemma, model = 'nmf', num_topics=10)

  return np.sqrt(res * 2)


For topic 1 the words with the highest value are:
city         17.413
new          15.189
man          13.310
state        12.278
country      12.146
film         11.726
john         10.990
crew          9.509
king          8.432
years         8.067
war           7.991
american      7.917
world         7.702
president     7.581
herea         7.362
novel         7.328
term          7.168
island        7.042
capital       6.863
french        6.618
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
cohan         0.031
cather        0.028
brunswick     0.027
lower         0.027
phosphorus    0.027
isotope       0.026
swede         0.026
challenger    0.025
dwarfs        0.025
koran         0.025
sammy         0.025
shift         0.025
voltaire      0.025
weeds         0.025
continuous    0.024
dressing      0.024
ferrer        0.024
monterey      0.024
wheels        0.024
charlotte     0.023
Name: 1, dtype: float64


For topic 3 the words with the highest value are

In [19]:
topic_model(f_lemma, model = 'nmf', num_topics=20)

For topic 1 the words with the highest value are:
city         8.979
new          7.831
man          6.868
state        6.336
country      6.273
film         6.057
john         5.664
crew         4.908
king         4.353
years        4.165
war          4.127
american     4.087
world        3.980
president    3.919
herea        3.802
novel        3.783
term         3.704
island       3.636
capital      3.541
french       3.415
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
cohan         0.022
cather        0.020
brunswick     0.019
isotope       0.019
lower         0.019
phosphorus    0.019
akc           0.018
dwarfs        0.018
koran         0.018
sammy         0.018
shift         0.018
swede         0.018
weeds         0.018
challenger    0.017
continuous    0.017
dressing      0.017
envelope      0.017
ferrer        0.017
monterey      0.017
voltaire      0.017
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
balcony           

In [22]:
topic_model(f_lemma, model = 'nmf', num_topics=30)

  return np.sqrt(res * 2)


For topic 1 the words with the highest value are:
city         3.908
new          3.408
man          2.989
state        2.758
country      2.731
film         2.636
john         2.467
crew         2.137
king         1.894
years        1.813
war          1.797
american     1.781
world        1.733
president    1.705
herea        1.656
novel        1.648
term         1.614
island       1.584
capital      1.543
french       1.486
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
cohan         0.018
cather        0.016
lower         0.016
brunswick     0.015
isotope       0.015
phosphorus    0.015
shift         0.015
swede         0.015
akc           0.014
challenger    0.014
continuous    0.014
dressing      0.014
dwarfs        0.014
envelope      0.014
ferrer        0.014
koran         0.014
monterey      0.014
sammy         0.014
voltaire      0.014
weeds         0.014
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
balcony           

In [23]:
topic_model(f_lemma, model = 'nmf', num_topics=40)

  return np.sqrt(res * 2)


For topic 1 the words with the highest value are:
city         3.990
new          3.481
man          3.052
state        2.815
country      2.788
film         2.692
john         2.518
crew         2.182
king         1.934
years        1.851
war          1.834
american     1.817
world        1.770
president    1.741
herea        1.690
novel        1.682
term         1.647
island       1.617
capital      1.575
french       1.517
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
cohan           0.015
cather          0.014
lower           0.014
phosphorus      0.014
akc             0.013
brunswick       0.013
isotope         0.013
shift           0.013
swede           0.013
weeds           0.013
challenger      0.012
continuous      0.012
conversation    0.012
courses         0.012
dressing        0.012
dwarfs          0.012
envelope        0.012
ferrer          0.012
jackie          0.012
koran           0.012
Name: 1, dtype: float64


For topic 3 the words with t

In [24]:
topic_model(f_lemma, model = 'nmf', num_topics=50)

  return np.sqrt(res * 2)


For topic 1 the words with the highest value are:
city         4.611
new          4.023
man          3.528
state        3.254
country      3.223
film         3.112
john         2.910
crew         2.522
king         2.235
years        2.139
war          2.120
american     2.101
world        2.045
president    2.011
herea        1.953
novel        1.943
term         1.904
island       1.868
capital      1.820
french       1.753
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
cohan         0.014
cather        0.013
brunswick     0.012
isotope       0.012
lower         0.012
swede         0.012
akc           0.011
challenger    0.011
continuous    0.011
dressing      0.011
dwarfs        0.011
envelope      0.011
ferrer        0.011
koran         0.011
monterey      0.011
phosphorus    0.011
sammy         0.011
shift         0.011
voltaire      0.011
weeds         0.011
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
balcony           

In [25]:
topic_model(f_lemma, model = 'nmf', num_topics=75)

For topic 1 the words with the highest value are:
city         3.269
new          2.852
man          2.502
state        2.307
country      2.285
film         2.206
john         2.063
crew         1.789
king         1.585
years        1.517
war          1.503
american     1.490
world        1.451
president    1.426
herea        1.385
novel        1.379
term         1.350
island       1.325
capital      1.291
french       1.243
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
cohan         0.011
brunswick     0.010
cather        0.010
isotope       0.010
lower         0.010
phosphorus    0.010
swede         0.010
akc           0.009
challenger    0.009
continuous    0.009
dressing      0.009
dwarfs        0.009
envelope      0.009
ferrer        0.009
koran         0.009
monterey      0.009
sammy         0.009
shift         0.009
tourism       0.009
voltaire      0.009
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
balcony           

In [26]:
topic_model(f_lemma, model = 'nmf', num_topics=100)

  return np.sqrt(res * 2)


For topic 1 the words with the highest value are:
city         3.112
new          2.715
man          2.381
state        2.197
country      2.176
film         2.100
john         1.964
crew         1.703
king         1.509
years        1.443
war          1.431
american     1.418
world        1.382
president    1.358
herea        1.319
novel        1.312
term         1.285
island       1.261
capital      1.228
french       1.184
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
cohan         0.010
cather        0.009
lower         0.009
phosphorus    0.009
akc           0.008
brunswick     0.008
challenger    0.008
continuous    0.008
dressing      0.008
dwarfs        0.008
envelope      0.008
ferrer        0.008
isotope       0.008
koran         0.008
sammy         0.008
shift         0.008
swede         0.008
voltaire      0.008
weeds         0.008
wheels        0.008
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
balcony           

In [27]:
topic_model(f_lemma, model = 'nmf', num_topics=250)

For topic 1 the words with the highest value are:
city         1.842
new          1.607
man          1.410
state        1.300
country      1.288
film         1.242
john         1.163
crew         1.008
king         0.893
years        0.854
war          0.847
american     0.840
world        0.817
president    0.803
herea        0.781
novel        0.777
term         0.760
island       0.747
capital      0.727
french       0.700
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
cather          0.006
cohan           0.006
akc             0.005
brunswick       0.005
challenger      0.005
charlotte       0.005
christopher     0.005
cleaner         0.005
cleveland       0.005
continuous      0.005
conversation    0.005
courses         0.005
cups            0.005
dressing        0.005
dwarfs          0.005
envelope        0.005
explores        0.005
ferrer          0.005
isotope         0.005
jackie          0.005
Name: 1, dtype: float64


For topic 3 the words with t

In [29]:
topic_model(f_lemma, model = 'nmf', num_topics=90)

  return np.sqrt(res * 2)


For topic 1 the words with the highest value are:
city         2.973
new          2.593
man          2.274
state        2.098
country      2.078
film         2.006
john         1.876
crew         1.626
king         1.441
years        1.379
war          1.367
american     1.355
world        1.319
president    1.297
herea        1.259
novel        1.254
term         1.228
island       1.205
capital      1.173
french       1.131
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
cohan           0.010
brunswick       0.009
cather          0.009
isotope         0.009
lower           0.009
phosphorus      0.009
swede           0.009
akc             0.008
challenger      0.008
charlotte       0.008
christopher     0.008
cleaner         0.008
cleveland       0.008
continuous      0.008
conversation    0.008
courses         0.008
cups            0.008
dressing        0.008
dwarfs          0.008
envelope        0.008
Name: 1, dtype: float64


For topic 3 the words with t

In [20]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [21]:
#display_topics(lsa1, vectorizer1.get_feature_names(), 30)

NameError: name 'lsa1' is not defined

In [None]:
#display_topics(nmf, vectorizer.get_feature_names(), 30)