# Load and Pre-process Data
Note: the flexibility in tuning or controlling scikit-learn's topic models is slightly limited compared to Gensim.

In [13]:
import os
import numpy as np
import pandas as pd

# Directory holding the raw paper text files; list its contents as a quick sanity check.
DATA_PATH = 'nipstxt1'
print(os.listdir(DATA_PATH))

['0009.txt', '0174.txt', '0184.txt', '0192.txt', '0211.txt', '0223.txt', '0249.txt', '0278.txt', '0290.txt', '0317.txt', '0387.txt', '0422.txt', '0457.txt', '0495.txt', '0642.txt', '0652.txt', '0683.txt', '0693.txt', '0715.txt', '0750.txt', '0830.txt', '0840.txt']


In [14]:
# Read all texts into a list.
# Each paper is in its own text file, hence we need to use file-reading functions from Python.

papers = []
# os.listdir() returns entries in arbitrary order; sorting keeps the document
# indices used later in the notebook reproducible across runs and platforms.
file_names = sorted(os.listdir(DATA_PATH))
for file_name in file_names:
    # Plain read mode is enough here ('r+' would also demand write permission).
    # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
    with open(os.path.join(DATA_PATH, file_name), encoding='utf-8', errors='ignore', mode='r') as f:
        data = f.read()
    papers.append(data)
len(papers)

22

In [15]:
# Preview the first 1,000 characters of the first paper.
print(papers[0][:1000])

9 
Stochastic Learning Networks and their Electronic Implementation 
Joshua Alspector*, Robert B. Allen, Victor Hut, and Srinagesh Satyanarayana 
Bell Communications Research, Morristown, NJ 07960 
We describe a family of learning algorithms that operate on a recurrent, symmetrically 
connected, neuromorphic network that, like the Boltzmann machine, settles in the 
presence of noise. These networks learn by modifying synaptic connection strengths on 
the basis of correlations seen locally by each synapse. We describe a version of the 
supervised learning algorilhm for a network with analog activation functions. We also 
demonstrate unsupervised competitive learning with this approach, where weight 
saturation and decay play an important role, and describe preliminary experiments in 
reinforcement !earning, where noise is used in the search procedure. We identify the 
above described phenomena as elements that can unify learning techniques at a physical 
microscopic level. 
These algor

## Basic Text Wrangling

In [16]:
%%time
import nltk

# Shared text-processing resources used by normalize_corpus() below.
stop_words = nltk.corpus.stopwords.words('english')  # English stop-word list
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')  # tokens are runs of word characters
wnl = nltk.stem.wordnet.WordNetLemmatizer()  # WordNet-based lemmatizer

def normalize_corpus(papers):
    """Turn raw paper texts into lists of normalized tokens.

    Each paper is lower-cased and tokenized with ``wtk``; purely numeric tokens
    are dropped, the remaining tokens are lemmatized with ``wnl``, and lemmas
    that are a single character long or appear in ``stop_words`` are removed.

    Parameters
    ----------
    papers : iterable of str
        Raw paper texts.

    Returns
    -------
    list of list of str
        One token list per paper that still has tokens after filtering.
        Papers left with no tokens are omitted, so the result can be shorter
        than ``papers`` and positions may then no longer line up with the input.
    """
    # Set membership is O(1); testing against the module-level list would scan
    # it linearly for every token.
    stop_word_set = set(stop_words)
    norm_papers = []
    for paper in papers:
        tokens = wtk.tokenize(paper.lower())
        lemmas = [wnl.lemmatize(token) for token in tokens if not token.isnumeric()]
        # Stop words are matched against the lemma, after lemmatization.
        paper_tokens = [lemma for lemma in lemmas
                        if len(lemma) > 1 and lemma not in stop_word_set]
        if paper_tokens:
            norm_papers.append(paper_tokens)

    return norm_papers
    
# Normalize every loaded paper and report how many token lists were produced.
norm_papers = normalize_corpus(papers)
print(len(norm_papers))

22
Wall time: 4.37 s


# Text Representation with Feature Engineering
We represent our text data in the form of a Bag of Words model with uni-grams and bi-grams.

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# The documents are already token lists, so the tokenizer and preprocessor are
# identity functions and the default token pattern is disabled.
# ngram_range=(1,2) builds uni-grams and bi-grams; min_df / max_df bound the
# document frequency of the terms that are kept.
cv = CountVectorizer(min_df=2, max_df=0.6, ngram_range=(1,2),
                     token_pattern=None, tokenizer=lambda doc: doc,
                     preprocessor=lambda doc: doc)
cv_features = cv.fit_transform(norm_papers)
cv_features.shape

(22, 4274)

In [18]:
# `get_feature_names()` is deprecated in newer scikit-learn releases and later
# removed; prefer `get_feature_names_out()` and fall back only when the
# installed version predates it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# Force a string dtype so the array matches what the list-based API produced.
vocabulary = np.array(feature_names, dtype=str)
print('Total Vocabulary Size:', len(vocabulary))

Total Vocabulary Size: 4274


# Topic Models with Latent Semantic Indexing (LSI)
LSI is based on singular value decomposition (SVD).

In [19]:
%%time
from sklearn.decomposition import TruncatedSVD

# Number of topics extracted by every model in this notebook.
TOTAL_TOPICS = 20

# Fit LSI via truncated SVD on the document-term counts and project each
# document into the topic space.
lsi_model = TruncatedSVD(n_components=TOTAL_TOPICS, n_iter=500, random_state=42)
document_topics = lsi_model.fit_transform(cv_features)

Wall time: 1.87 s


In [20]:
# Topic-term matrix: one row per topic, one column per vocabulary term.
topic_terms = lsi_model.components_
topic_terms.shape

(20, 4274)

In [21]:
# We can now generate the topics by reusing some of the code we implemented previously to display the topics and terms.
top_terms = 20
# Column indices of the `top_terms` largest-magnitude weights in every topic row.
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
# Signed weights that belong to those indices, gathered topic by topic.
topic_keyterm_weights = np.array([
    topic_terms[topic_row, term_cols]
    for topic_row, term_cols in zip(np.arange(TOTAL_TOPICS), topic_key_term_idxs)
])
topic_keyterms = vocabulary[topic_key_term_idxs]
topic_keyterms_weights = list(zip(topic_keyterms, topic_keyterm_weights))

for n in range(TOTAL_TOPICS):
    print('Topic #' + str(n + 1) + ':')
    print('=' * 50)
    terms, weights = topic_keyterms_weights[n]
    # Strongest terms first, ranked by absolute weight.
    term_weights = sorted(zip(terms, weights), key=lambda pair: -abs(pair[1]))
    # Split by sign: non-negative weights form one direction, the rest the other.
    d1 = [(term, round(wt, 3)) for term, wt in term_weights if wt >= 0]
    d2 = [(term, round(wt, 3)) for term, wt in term_weights if not wt >= 0]

    print('Direction 1:', d1)
    print('-' * 50)
    print('Direction 2:', d2)
    print('-' * 50)
    print()

Topic #1:
Direction 1: [('cell', 0.638), ('firing', 0.238), ('lot', 0.19), ('learning', 0.154), ('synapsis', 0.146), ('probability', 0.141), ('axon', 0.129), ('rule', 0.129), ('cell firing', 0.107), ('ii', 0.106), ('inhibitory', 0.104), ('category', 0.1), ('cortex', 0.1), ('activity', 0.099), ('active', 0.097), ('simulation', 0.092), ('burst', 0.091), ('pitiform', 0.088), ('spatial', 0.082), ('specific', 0.075)]
--------------------------------------------------
Direction 2: []
--------------------------------------------------

Topic #2:
Direction 1: [('classifier', 0.344), ('region', 0.216), ('fig', 0.216), ('training', 0.206), ('decision', 0.204), ('node', 0.195), ('vector', 0.17), ('learning', 0.153), ('class', 0.135), ('reinforcement', 0.132), ('error', 0.127), ('net', 0.118), ('feature', 0.113), ('nat', 0.111), ('two layer', 0.103), ('back', 0.095), ('hopfield', 0.094), ('code', 0.092)]
--------------------------------------------------
Direction 2: [('cell', -0.228), ('firing', 

In [22]:
# Document-topic scores rounded to 3 decimals, one column per topic (T1, T2, ...);
# transposed for display so topics become rows and documents become columns.
dt_df = pd.DataFrame(np.round(document_topics, 3), 
                     columns=['T'+str(i) for i in range(1, TOTAL_TOPICS+1)])
dt_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
T1,29.791,17.686,9.807,51.822,7.724,14.526,24.805,7.612,9.976,251.805,...,17.492,12.633,30.351,15.166,55.424,15.331,52.467,14.303,18.021,31.772
T2,43.696,61.084,25.675,-2.635,11.083,24.055,29.458,14.513,10.711,-50.073,...,0.259,34.657,43.218,19.044,-8.296,30.755,0.701,30.218,38.219,63.782
T3,-43.476,16.552,-12.048,3.366,-5.766,-17.867,-23.585,-5.073,-10.172,17.012,...,-0.13,-8.486,-55.902,-10.984,3.744,-8.736,0.651,3.686,-16.388,-82.838
T4,-12.433,53.639,43.674,-1.269,9.454,12.544,-0.695,11.263,-1.428,-3.616,...,1.928,51.688,-25.317,22.918,9.265,-5.565,2.018,-1.911,82.91,-52.155
T5,-40.689,12.299,8.781,4.134,-0.394,-13.443,-22.172,-2.673,-12.388,5.462,...,-0.43,10.616,-74.62,11.367,-0.162,-2.755,-4.685,-2.71,2.736,81.545
T6,-0.835,28.373,30.37,-0.542,6.804,34.193,-5.071,8.78,-0.804,-3.105,...,0.582,-6.504,0.561,69.399,6.388,-8.211,2.024,4.22,-68.081,-9.912
T7,21.931,-3.669,-29.226,6.422,8.731,66.567,1.444,2.008,-2.627,-16.384,...,5.958,-0.044,-37.765,-26.778,34.892,2.303,34.098,3.333,-3.915,-0.346
T8,-8.423,71.967,-18.137,-2.458,-2.587,0.823,-14.198,5.127,1.263,5.007,...,-4.764,14.084,22.545,-51.205,-1.375,-11.849,-6.465,-16.208,-27.622,7.151
T9,-27.11,1.332,26.188,1.382,4.158,-35.0,22.827,-0.774,11.798,-19.307,...,1.328,0.458,8.817,-12.647,26.404,6.237,69.702,12.726,-13.085,1.231
T10,18.167,-12.741,61.707,1.155,5.672,7.305,21.545,3.821,-7.362,7.417,...,0.218,8.023,-20.986,-45.134,-12.423,3.033,-24.413,14.887,-22.709,-4.182


In [24]:
# document_numbers = [13, 250, 500]
document_numbers = [3, 5, 19]

for document_number in document_numbers:
    # Rank this document's topics by absolute score and keep the three strongest.
    topic_scores = dt_df.iloc[document_number].values
    strongest = np.argsort(-np.absolute(topic_scores))[:3]
    top_topics = list(dt_df.columns[strongest])
    print('Document #{}:'.format(document_number))
    print('Dominant Topics (top 3):', top_topics)
    print('Paper Summary:')
    print(papers[document_number][:500])
    print()
# If you check out the terms in the topics we obtained in the preceding output, they
# actually make sense!

Document #3:
Dominant Topics (top 3): ['T19', 'T1', 'T20']
Paper Summary:
192 
PHASE TRANSITIONS IN NEURAL NETWORKS 
Joshua Chover 
University o Wisconsin, Madison, WI 53706 
Various simulations o cortical subnetworks have evidenced 
something like phase transitions with respect to key parameters. 
We demonstrate that. such transitions must. indeed exist_ in analogous 
ininite array models. For related inite array models classical 
phase transit.ions (which describe steady-state behavior) may not. 
exist., but. there can be distinct. qualitative changes in 
("metastab

Document #5:
Dominant Topics (top 3): ['T7', 'T11', 'T9']
Paper Summary:
223 
'Ensemble' Boltzmann Units 
have Collective Computational Properties 
like those of Hopfield and Tank Neurons 
Mark Derthick and Joe Tebelskis 
Department of Computer Science 
Carnegie-Mellon University 
1 Introduction 
There are three existing connection:,t models in which network states are assigned 
a computational energy. These models Hopf

# Topic Models with Latent Dirichlet Allocation (LDA)

In [25]:
%%time
from sklearn.decomposition import LatentDirichletAllocation

# Fit LDA with the online learning method and get the topic mixture of each document.
# NOTE(review): batch_size=1740 and n_jobs=16 are hard-coded — presumably carried
# over from a larger corpus / machine; confirm they suit this dataset.
lda_model = LatentDirichletAllocation(n_components =TOTAL_TOPICS, max_iter=500, max_doc_update_iter=50,
                                      learning_method='online', batch_size=1740, learning_offset=50., 
                                      random_state=42, n_jobs=16)
document_topics = lda_model.fit_transform(cv_features)

Wall time: 1min 8s


In [26]:
# We can then obtain the topic-term matrix (one row per topic) from the fitted
# LDA model; the next cell builds a dataframe from it to showcase the topics
# and terms in an easy-to-interpret format.
topic_terms = lda_model.components_

In [27]:
# Indices of the `top_terms` highest-weighted terms for every topic.
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
topic_keyterms = vocabulary[topic_key_term_idxs]
# One comma-separated string of key terms per topic.
topics = [', '.join(topic) for topic in topic_keyterms]
# None switches off column-width truncation. The old -1 sentinel is deprecated
# (it triggers a FutureWarning) and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
topics_df = pd.DataFrame(topics,
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, TOTAL_TOPICS+1)])
topics_df
# Generated topics from our LDA model

  after removing the cwd from sys.path.


Unnamed: 0,Terms per Topic
Topic1,"cell, learning, decision, classifier, fig, region, node, class, map, trial, hold long, aip conference, vector, error, top layer, time step, size, synapsis, complex, two layer"
Topic2,"classifier, region, node, decision, feature, training, class, performance, fig, velocity, cell, map, back propagation, error, two layer, grid, hyperplanes, code, vector, line"
Topic3,"learning, cell, rule, fig, firing, probability, tank, level, vector, active, experiment, energy, rate, temporal, net, node, line, recurrence, brain, element"
Topic4,"learning, reinforcement, symbol, noise, probability, synapse, algorithm, stochastic, search, procedure, trial, rule, activation, fig, control, global, hinton, performance, cell, adaptive"
Topic5,"learning, noise, synapse, fig, level, procedure, electronic, analog, reinforcement, gain, supervised, control, competitive, distribution, run, search, vector, technique, local, stochastic"
Topic6,"brain, fig, level, fiber, specifically, solution, changed, science, real number, chance, known, pattern figure, complicated, advanced research, trained, hippocampus pitiform, correct output, nucleus, classifier, association"
Topic7,"node, classifier, rate, vector, training, cell, temporal, hopfield, fig, synapsis, class, computed, field, region, learning, decision, representation, trial, feature, capacity"
Topic8,"cell, reinforcement, learning, potential, technique, associative, fig, rule, architecture, hopfield, synapsis, average, cortical, well defined, lisp program, synaptic, array, distribution used, optimal, fruitful discussion"
Topic9,"capacity, associative memory, associative, vector, address, kanerva, bound, sequence, error, exponential, rate, cell, element, training, adaptive, radius, location, code, distance, let"
Topic10,"cell, fig, net, vector, training, motion, cycle, classifier, decision, associative memory, feature, fiber, location, assume, decay, back, performance, mellon university, application, field"


In [28]:
# Show floats with 3 decimals, then lay out the document-topic matrix with one
# column per topic; transposed for display so topics become rows.
pd.options.display.float_format = '{:,.3f}'.format
dt_df = pd.DataFrame(document_topics, 
                     columns=['T'+str(i) for i in range(1, TOTAL_TOPICS+1)])
dt_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
T1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
pd.options.display.float_format = '{:,.5f}'.format
pd.set_option('display.max_colwidth', 200)

# Highest value each topic column reaches across all documents.
max_contrib_topics = dt_df.max(axis=0)
dominant_topics = max_contrib_topics.index
contrib_perc = max_contrib_topics.values

# For every topic, the first document (row label) that attains that maximum.
document_numbers = []
for topic_col in dominant_topics:
    at_peak = dt_df[topic_col] == max_contrib_topics.loc[topic_col]
    document_numbers.append(dt_df.loc[at_peak].index[0])
documents = [papers[doc_idx] for doc_idx in document_numbers]

results_df = pd.DataFrame({
    'Dominant Topic': dominant_topics,
    'Contribution %': contrib_perc,
    'Paper Num': document_numbers,
    'Topic': topics_df['Terms per Topic'],
    'Paper Name': documents,
})
results_df
# Viewing each topic and corresponding paper with its maximum contribution

Unnamed: 0,Dominant Topic,Contribution %,Paper Num,Topic,Paper Name
Topic1,T1,7e-05,12,"cell, learning, decision, classifier, fig, region, node, class, map, trial, hold long, aip conference, vector, error, top layer, time step, size, synapsis, complex, two layer","457 \nDISTRIBUTED NEURAL INFORMATION PROCESSING \nIN THE VESTIBULO-OCULAR SYSTEM \nClifford Lau \nOffice of Naval Research Detachment \nPasadena, CA 91106 \nVicente Honrubia* \nUCLA Division of He..."
Topic2,T2,7e-05,12,"classifier, region, node, decision, feature, training, class, performance, fig, velocity, cell, map, back propagation, error, two layer, grid, hyperplanes, code, vector, line","457 \nDISTRIBUTED NEURAL INFORMATION PROCESSING \nIN THE VESTIBULO-OCULAR SYSTEM \nClifford Lau \nOffice of Naval Research Detachment \nPasadena, CA 91106 \nVicente Honrubia* \nUCLA Division of He..."
Topic3,T3,7e-05,12,"learning, cell, rule, fig, firing, probability, tank, level, vector, active, experiment, energy, rate, temporal, net, node, line, recurrence, brain, element","457 \nDISTRIBUTED NEURAL INFORMATION PROCESSING \nIN THE VESTIBULO-OCULAR SYSTEM \nClifford Lau \nOffice of Naval Research Detachment \nPasadena, CA 91106 \nVicente Honrubia* \nUCLA Division of He..."
Topic4,T4,7e-05,12,"learning, reinforcement, symbol, noise, probability, synapse, algorithm, stochastic, search, procedure, trial, rule, activation, fig, control, global, hinton, performance, cell, adaptive","457 \nDISTRIBUTED NEURAL INFORMATION PROCESSING \nIN THE VESTIBULO-OCULAR SYSTEM \nClifford Lau \nOffice of Naval Research Detachment \nPasadena, CA 91106 \nVicente Honrubia* \nUCLA Division of He..."
Topic5,T5,7e-05,12,"learning, noise, synapse, fig, level, procedure, electronic, analog, reinforcement, gain, supervised, control, competitive, distribution, run, search, vector, technique, local, stochastic","457 \nDISTRIBUTED NEURAL INFORMATION PROCESSING \nIN THE VESTIBULO-OCULAR SYSTEM \nClifford Lau \nOffice of Naval Research Detachment \nPasadena, CA 91106 \nVicente Honrubia* \nUCLA Division of He..."
Topic6,T6,7e-05,12,"brain, fig, level, fiber, specifically, solution, changed, science, real number, chance, known, pattern figure, complicated, advanced research, trained, hippocampus pitiform, correct output, nucle...","457 \nDISTRIBUTED NEURAL INFORMATION PROCESSING \nIN THE VESTIBULO-OCULAR SYSTEM \nClifford Lau \nOffice of Naval Research Detachment \nPasadena, CA 91106 \nVicente Honrubia* \nUCLA Division of He..."
Topic7,T7,7e-05,12,"node, classifier, rate, vector, training, cell, temporal, hopfield, fig, synapsis, class, computed, field, region, learning, decision, representation, trial, feature, capacity","457 \nDISTRIBUTED NEURAL INFORMATION PROCESSING \nIN THE VESTIBULO-OCULAR SYSTEM \nClifford Lau \nOffice of Naval Research Detachment \nPasadena, CA 91106 \nVicente Honrubia* \nUCLA Division of He..."
Topic8,T8,7e-05,12,"cell, reinforcement, learning, potential, technique, associative, fig, rule, architecture, hopfield, synapsis, average, cortical, well defined, lisp program, synaptic, array, distribution used, op...","457 \nDISTRIBUTED NEURAL INFORMATION PROCESSING \nIN THE VESTIBULO-OCULAR SYSTEM \nClifford Lau \nOffice of Naval Research Detachment \nPasadena, CA 91106 \nVicente Honrubia* \nUCLA Division of He..."
Topic9,T9,7e-05,12,"capacity, associative memory, associative, vector, address, kanerva, bound, sequence, error, exponential, rate, cell, element, training, adaptive, radius, location, code, distance, let","457 \nDISTRIBUTED NEURAL INFORMATION PROCESSING \nIN THE VESTIBULO-OCULAR SYSTEM \nClifford Lau \nOffice of Naval Research Detachment \nPasadena, CA 91106 \nVicente Honrubia* \nUCLA Division of He..."
Topic10,T10,7e-05,12,"cell, fig, net, vector, training, motion, cycle, classifier, decision, associative memory, feature, fiber, location, assume, decay, back, performance, mellon university, application, field","457 \nDISTRIBUTED NEURAL INFORMATION PROCESSING \nIN THE VESTIBULO-OCULAR SYSTEM \nClifford Lau \nOffice of Naval Research Detachment \nPasadena, CA 91106 \nVicente Honrubia* \nUCLA Division of He..."


# Topic Models with Non-Negative Matrix Factorization (NMF)
NMF is another matrix decomposition technique, similar to SVD, but it operates on non-negative matrices and works well for multivariate data.

In [9]:
%%time
from sklearn.decomposition import NMF

# Settings shared by both the current and the legacy NMF signatures.
nmf_params = dict(n_components=TOTAL_TOPICS, solver='cd', max_iter=500,
                  random_state=42, l1_ratio=.85)
n_docs, n_terms = cv_features.shape
try:
    # Newer scikit-learn replaces `alpha` with `alpha_W` / `alpha_H` and scales
    # those penalties by n_features and n_samples respectively. Dividing by
    # the same factors is intended to keep the regularization strength of the
    # legacy `alpha=.1` setting.
    # NOTE(review): verify the scaling against the installed version's NMF docs.
    nmf_model = NMF(alpha_W=.1 / n_terms, alpha_H=.1 / n_docs, **nmf_params)
except TypeError:
    # Older scikit-learn: `alpha_W` / `alpha_H` are unknown keywords there.
    nmf_model = NMF(alpha=.1, **nmf_params)
document_topics = nmf_model.fit_transform(cv_features)

NameError: name 'cv_features' is not defined

In [22]:
# Now that we have our model trained, we can look at the generated topics using the following code
# Topic-term matrix of the fitted NMF model (one row per topic).
topic_terms = nmf_model.components_
# Indices of the `top_terms` highest-weighted terms for every topic.
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
topic_keyterms = vocabulary[topic_key_term_idxs]
# One comma-separated string of key terms per topic.
topics = [', '.join(topic) for topic in topic_keyterms]
# None switches off column-width truncation. The old -1 sentinel is deprecated
# (it triggers a FutureWarning) and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
topics_df = pd.DataFrame(topics,
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, TOTAL_TOPICS+1)])
topics_df
# Generated topics from our NMF model

Unnamed: 0,Terms per Topic
Topic1,"bound, generalization, size, let, optimal, solution, theorem, equation, approximation, class, gradient, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum"
Topic2,"neuron, synaptic, connection, potential, dynamic, synapsis, activity, excitatory, layer, synapse, simulation, inhibitory, delay, biological, equation, state, et, et al, activation, firing"
Topic3,"state, action, policy, step, optimal, reinforcement, transition, reinforcement learning, probability, reward, dynamic, value function, markov, machine, task, agent, finite, iteration, sequence, decision"
Topic4,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, region, visual, representation, transformation, surface"
Topic5,"hidden, layer, net, hidden unit, task, hidden layer, architecture, back, propagation, trained, connection, back propagation, activation, representation, generalization, output unit, neural net, training set, learn, test"
Topic6,"cell, firing, direction, head, rat, response, layer, synaptic, activity, spatial, inhibitory, synapsis, ii, cue, cortex, simulation, lot, active, complex, property"
Topic7,"word, recognition, speech, context, hmm, speaker, speech recognition, character, phoneme, probability, frame, sequence, rate, test, level, acoustic, experiment, letter, segmentation, state"
Topic8,"signal, noise, source, filter, component, frequency, channel, speech, matrix, independent, separation, sound, ica, phase, eeg, blind, auditory, dynamic, delay, fig"
Topic9,"control, controller, trajectory, motor, dynamic, movement, forward, task, feedback, arm, inverse, position, robot, architecture, hand, force, adaptive, change, command, plant"
Topic10,"circuit, chip, current, analog, voltage, vlsi, gate, threshold, transistor, pulse, design, implementation, synapse, bit, digital, device, analog vlsi, element, cmos, pp"


There are no major repetitions of topics and each topic talks about a clear and distinct theme.

The results from the NMF topic model are definitely better than what we obtained from LDA in Scikit-Learn!!!

In [25]:
# Show floats with 3 decimals, then lay out the document-topic matrix with one
# column per topic and preview the first 10 documents.
pd.options.display.float_format = '{:,.3f}'.format
dt_df = pd.DataFrame(document_topics, 
                     columns=['T'+str(i) for i in range(1, TOTAL_TOPICS+1)])
dt_df.head(10)
# Viewing topic dominance per document using the document-topic matrix

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16,T17,T18,T19,T20
0,0.444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004,0.263,0.0,0.0,0.0,0.0,0.0,3.437
1,0.394,0.595,0.463,0.019,0.187,0.037,0.0,0.228,0.13,0.029,0.0,0.254,0.0,0.0,0.0,0.0,0.0,0.106,0.0,0.21
2,0.032,0.619,0.003,0.067,0.016,0.378,0.029,0.027,0.448,0.0,0.075,0.036,0.024,0.184,0.1,0.0,0.126,0.0,0.656,0.277
3,0.0,0.274,0.0,0.102,0.265,1.019,0.0,0.0,0.0,0.0,0.0,0.0,0.218,0.011,0.0,0.004,1.299,0.291,1.268,0.295
4,0.06,0.188,0.682,0.257,0.167,1.402,0.0,0.093,0.0,0.001,0.0,0.02,1.749,0.037,0.0,0.344,0.0,0.0,0.164,0.121
5,0.0,0.383,0.0,0.0,0.679,7.51,0.016,0.0,0.0,0.326,1.146,1.923,0.098,0.0,0.0,0.202,0.0,0.426,0.646,0.641
6,0.0,1.415,0.02,0.0,0.046,0.044,0.0,0.114,0.333,0.04,0.0,0.032,0.124,0.0,0.041,0.041,0.075,0.03,0.0,0.615
7,0.147,0.029,0.0,0.0,0.274,0.008,0.042,0.0,0.045,0.08,0.008,0.025,0.022,0.009,0.0,0.023,0.0,0.007,0.0,0.096
8,0.084,1.76,0.013,0.012,0.0,1.592,0.0,0.0,0.257,0.068,0.273,0.055,0.122,0.0,0.119,0.0,0.0,0.027,0.514,0.353
9,0.395,0.0,0.04,1.258,0.127,0.0,0.0,0.37,0.075,0.076,0.0,0.042,0.0,0.017,0.0,0.053,0.041,0.133,0.427,0.0


In [34]:
# Leveraging the document-topic matrix, we can determine the most relevant paper
# for each topic based on the topic dominance scores by using the following code.
pd.options.display.float_format = '{:,.5f}'.format
pd.set_option('display.max_colwidth', 200)

# Highest score each topic column reaches across all documents.
max_score_topics = dt_df.max(axis=0)
dominant_topics = max_score_topics.index
term_score = max_score_topics.values

# For every topic, the first document (row label) that attains that maximum.
document_numbers = []
for topic_col in dominant_topics:
    at_peak = dt_df[topic_col] == max_score_topics.loc[topic_col]
    document_numbers.append(dt_df.loc[at_peak].index[0])
documents = [papers[doc_idx] for doc_idx in document_numbers]

results_df = pd.DataFrame({
    'Dominant Topic': dominant_topics,
    'Max Score': term_score,
    'Paper Num': document_numbers,
    'Topic': topics_df['Terms per Topic'],
    'Paper Name': documents,
})
results_df
# Viewing each topic and corresponding paper with its maximum contribution.

Unnamed: 0,Dominant Topic,Max Score,Paper Num,Topic,Paper Name
Topic1,T1,1.64138,991,"bound, generalization, size, let, optimal, solution, theorem, equation, approximation, class, gradient, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum","A Bound on the Error of Cross Validation Using \nthe Approximation and Estimation Rates, with \nConsequences for the Training-Test Split \nMichael Kearns \nAT&T Research \nABSTRACT\n1 INTRODUCTION..."
Topic2,T2,3.58149,383,"neuron, synaptic, connection, potential, dynamic, synapsis, activity, excitatory, layer, synapse, simulation, inhibitory, delay, biological, equation, state, et, et al, activation, firing","Signal Processing by Multiplexing and \nDemultiplexing in Neurons \nDavid C. Tam \nDivision of Neuroscience \nBaylor College of Medicine \nHouston, TX 77030 \ndtamCnext-cns.neusc.bcm.tmc.edu \nAb..."
Topic3,T3,5.83072,1167,"state, action, policy, step, optimal, reinforcement, transition, reinforcement learning, probability, reward, dynamic, value function, markov, machine, task, agent, finite, iteration, sequence, de...","Reinforcement Learning for Mixed \nOpen-loop and Closed-loop Control \nEric A. Hansen, Andrew G. Barto, and Shlomo Zilbersteln \nDepartment of Computer Science \nUniversity of Massachusetts \nAmhe..."
Topic4,T4,3.93349,1731,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, region, visual, representation, transformation, surface",Image representations for facial expression \ncoding \nMarian Stewart Bartlett* \nU.C. San Diego \nmarnisalk. edu \nJavier R. Movellan \nU.C. San Diego \nmovellancogsc. ucsd. edu \nPaul Ekman \n...
Topic5,T5,2.9875,33,"hidden, layer, net, hidden unit, task, hidden layer, architecture, back, propagation, trained, connection, back propagation, activation, representation, generalization, output unit, neural net, tr...","5O5 \nCONNECTING TO THE PAST \nBruce A. MacDonald, Assistant Professor \nKnowledge Sciences Laboratory, Computer Science Department \nThe University of Calgary, 2500 University Drive NW \nCalgary,..."
Topic6,T6,7.51003,5,"cell, firing, direction, head, rat, response, layer, synaptic, activity, spatial, inhibitory, synapsis, ii, cue, cortex, simulation, lot, active, complex, property","317 \nPARTITIONING OF SENSORY DATA BY A COPTICAI, NETWOPK  \nRichard Granger, Jos Ambros-Ingerson, Howard Henry, Gary Lynch \nCenter for the Neurobiology of Learning and Memory \nUniversity of..."
Topic7,T7,4.89525,1318,"word, recognition, speech, context, hmm, speaker, speech recognition, character, phoneme, probability, frame, sequence, rate, test, level, acoustic, experiment, letter, segmentation, state","Comparison of Human and Machine Word \nRecognition \nM. Schenkel \nDept of Electrical Eng. \nUniversity of Sydney \nSydney, NSW 2006, Australia \nschenkel@sedal.usyd.edu.au \nC. Latimer \nDept of ..."
Topic8,T8,3.67982,235,"signal, noise, source, filter, component, frequency, channel, speech, matrix, independent, separation, sound, ica, phase, eeg, blind, auditory, dynamic, delay, fig","232 Sejnowski, Yuhas, Goldstein and Jenkins \nCombining Visual and \nwith a Neural Network \nAcoustic Speech Signals \nImproves Intelligibility \nT.J. Sejnowski \nThe Salk Institute \nand \nDepart..."
Topic9,T9,4.88831,948,"control, controller, trajectory, motor, dynamic, movement, forward, task, feedback, arm, inverse, position, robot, architecture, hand, force, adaptive, change, command, plant","An Integrated Architecture of Adaptive Neural Network \nControl for Dynamic Systems \nLiu Ke '2 Robert L. Tokaf Brian D.McVey z \nCenter for Nonlinear Studies, 2Applied Theoretical Physics Divis..."
Topic10,T10,2.95973,1690,"circuit, chip, current, analog, voltage, vlsi, gate, threshold, transistor, pulse, design, implementation, synapse, bit, digital, device, analog vlsi, element, cmos, pp","Kirchoff Law Markov Fields for Analog \nCircuit Design \nRichard M. Golden * \nRMG Consulting Inc. \n2000 Fresno Road, Plano, Texas 75074 \nRMG CONS UL T@A OL. COM, \nwww. neural-network. corn \nA..."


The outputs depicted in Figure 6-18 clearly show that the NMF model is much better than the LDA model, with each topic being strongly correlated as the central theme of the research paper where it has maximum dominance.

What we have observed is that non-negative matrix factorization works best compared to the other methods, even on small corpora with few documents. But again, this depends on the type of data you are dealing with.

# Predicting Topics for New Research Papers

In [10]:
import glob
# papers manually downloaded from NIPS 16
# https://papers.nips.cc/book/advances-in-neural-information-processing-systems-29-2016

# glob() returns matches in arbitrary order; sorting keeps the paper numbering
# in the results below stable across runs and platforms.
new_paper_files = sorted(glob.glob('./test_data/nips16*.txt'))
new_papers = []
for fn in new_paper_files:
    # Plain read mode is enough here ('r+' would also demand write permission).
    # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
    with open(fn, encoding='utf-8', errors='ignore', mode='r') as f:
        data = f.read()
        new_papers.append(data)

print('Total New Papers:', len(new_papers))

Total New Papers: 4


In [11]:
# The next step in the pipeline is to preprocess these documents and extract features
# using the same sequence of steps we followed when building the topic models.
# Re-use the same normalizer and the vectorizer `cv` so the new papers are
# encoded against the existing vocabulary. `cv` must already be fitted in this
# session: transform() does not fit it.
norm_new_papers = normalize_corpus(new_papers)
cv_new_features = cv.transform(norm_new_papers)
cv_new_features.shape

NotFittedError: Vocabulary not fitted or provided

In [52]:
# We can now use our NMF topic model to predict the topics for these new research
# papers using the following code (we predict the top two topics for each paper).
topic_predictions = nmf_model.transform(cv_new_features)

# For each new paper keep its two highest-scoring topics as
# (zero-based topic index, score rounded to 3 decimals) pairs.
best_topics = []
for paper_scores in topic_predictions:
    ranked = sorted(enumerate(paper_scores), key=lambda pair: -pair[1])
    best_topics.append([(topic, round(sc, 3)) for topic, sc in ranked[:2]])
best_topics

[[(0, 1.312), (7, 0.966)],
 [(2, 4.121), (0, 0.864)],
 [(3, 2.154), (1, 1.335)],
 [(3, 3.074), (6, 2.19)]]

In [55]:
# Remember that we don’t get proportion of dominance of each topic here, like
# with the LDA model, but we get absolute scores. Let’s view the results in an easy-to-
# understand format.
# Start with one row per new paper (1-based paper number) holding the list of
# its 1-based dominant-topic numbers.
results_df = pd.DataFrame()
results_df['Papers'] = range(1, len(new_papers)+1)
results_df['Dominant Topics'] = [[topic_num+1 for topic_num, sc in item] for item in best_topics]
# Expand each list so every (paper, topic) pair gets its own row, indexed by paper number.
res = results_df.set_index(['Papers'])['Dominant Topics'].apply(pd.Series).stack().reset_index(level=1, drop=True)
results_df = pd.DataFrame({'Dominant Topics': res.values}, index=res.index)
# Flatten the per-paper scores in the same paper/topic order, expressed as score * 100.
results_df['Topic Score'] = [topic_sc for topic_list in 
                                        [[round(sc*100, 2) 
                                              for topic_num, sc in item] 
                                                 for item in best_topics] 
                                    for topic_sc in topic_list]

# Attach each topic's key terms and the first 200 characters of the matching paper.
results_df['Topic Desc'] = [topics_df.iloc[t-1]['Terms per Topic'] for t in results_df['Dominant Topics'].values]
results_df['Paper Desc'] = [new_papers[i-1][:200] for i in results_df.index.values]

results_df
# Predicting topics for new papers with our NMF model
# make sense and our NMF model is working quite well!

Unnamed: 0_level_0,Dominant Topics,Topic Score,Topic Desc,Paper Desc
Papers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,131.2,"bound, generalization, size, let, optimal, solution, theorem, equation, approximation, class, gradient, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum","Correlated-PCA: Principal Components’ Analysis\nwhen Data and Noise are Correlated\nNamrata Vaswani and Han Guo\nIowa State University, Ames, IA, USA\nEmail: {namrata,hanguo}@iastate.edu\nAbstract..."
1,8,96.6,"signal, noise, source, filter, component, frequency, channel, speech, matrix, independent, separation, sound, ica, phase, eeg, blind, auditory, dynamic, delay, fig","Correlated-PCA: Principal Components’ Analysis\nwhen Data and Noise are Correlated\nNamrata Vaswani and Han Guo\nIowa State University, Ames, IA, USA\nEmail: {namrata,hanguo}@iastate.edu\nAbstract..."
2,3,412.1,"state, action, policy, step, optimal, reinforcement, transition, reinforcement learning, probability, reward, dynamic, value function, markov, machine, task, agent, finite, iteration, sequence, de...","PAC Reinforcement Learning with Rich Observations\nAkshay Krishnamurthy\nUniversity of Massachusetts, Amherst\nAmherst, MA, 01003\nakshay@cs.umass.edu\nAlekh Agarwal\nMicrosoft Research\nNew York,..."
2,1,86.4,"bound, generalization, size, let, optimal, solution, theorem, equation, approximation, class, gradient, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum","PAC Reinforcement Learning with Rich Observations\nAkshay Krishnamurthy\nUniversity of Massachusetts, Amherst\nAmherst, MA, 01003\nakshay@cs.umass.edu\nAlekh Agarwal\nMicrosoft Research\nNew York,..."
3,4,215.4,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, region, visual, representation, transformation, surface","Automated scalable segmentation of neurons from\nmultispectral images\nUygar Sümbül\nGrossman Center for the Statistics of Mind\nand Dept. of Statistics, Columbia University\nDouglas Roossien Jr.\..."
3,2,133.5,"neuron, synaptic, connection, potential, dynamic, synapsis, activity, excitatory, layer, synapse, simulation, inhibitory, delay, biological, equation, state, et, et al, activation, firing","Automated scalable segmentation of neurons from\nmultispectral images\nUygar Sümbül\nGrossman Center for the Statistics of Mind\nand Dept. of Statistics, Columbia University\nDouglas Roossien Jr.\..."
4,4,307.4,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, region, visual, representation, transformation, surface","Unsupervised Learning of Spoken Language with\nVisual Context\nDavid Harwath, Antonio Torralba, and James R. Glass\nComputer Science and Artificial Intelligence Laboratory\nMassachusetts Institute..."
4,7,219.0,"word, recognition, speech, context, hmm, speaker, speech recognition, character, phoneme, probability, frame, sequence, rate, test, level, acoustic, experiment, letter, segmentation, state","Unsupervised Learning of Spoken Language with\nVisual Context\nDavid Harwath, Antonio Torralba, and James R. Glass\nComputer Science and Artificial Intelligence Laboratory\nMassachusetts Institute..."


# Persisting Model and Transformers

### This is just for visualizing the topics in the other notebook (since PyLDAViz expands the notebook size)

In [14]:
import dill

def _persist(obj, path):
    """Serialize `obj` to `path` with dill."""
    with open(path, 'wb') as f:
        dill.dump(obj, f)


# Save the fitted NMF model, the document-term features and the vectorizer.
_persist(nmf_model, 'nmf_model.pkl')
_persist(cv_features, 'cv_features.pkl')
_persist(cv, 'cv.pkl')