# Load and Pre-process Data

In [1]:
import os
import numpy as np
import pandas as pd

DATA_PATH = 'nipstxt/'
print(os.listdir(DATA_PATH))

['nips01', 'nips04', 'MATLAB_NOTES', 'nips10', 'nips02', 'idx', 'nips11', 'nips03', 'nips07', 'README_yann', 'nips05', 'nips12', 'nips06', 'RAW_DATA_NOTES', 'orig', 'nips00', 'nips08', 'nips09']


In [2]:
folders = ["nips{0:02}".format(i) for i in range(0,13)]
# Read all texts into a list.
papers = []
for folder in folders:
    file_names = os.listdir(DATA_PATH + folder)
    for file_name in file_names:
        with open(DATA_PATH + folder + '/' + file_name, encoding='utf-8', errors='ignore', mode='r+') as f:
            data = f.read()
        papers.append(data)
len(papers)

1740

In [3]:
print(papers[0][:1000])

652 
Scaling Properties of Coarse-Coded Symbol Memories 
Ronald Rosenfeld 
David S. Touretzky 
Computer Science Department 
Carnegie Mellon University 
Pittsburgh, Pennsylvania 15213 
Abstract
Coarse-coded symbol memories have appeared in several neural network 
symbol processing models. In order to determine how these models would scale, one 
must first have some understanding of the mathematics of coarse-coded representa- 
tions. We define the general structure of coarse-coded symbol memories and derive 
mathematical relationships among their essential parameters: memort size, slmbol-set 
size and capacitor. The computed capacity of one of the schemes agrees well with actual 
measurements of the coarse-coded working memory of DCPS, Touretzky and Hinton's 
distributed connectionist production system. 
1 Introduction 
A dstributed representation is a memory scheme in which each entity (concept, symbol) 
is represented by a pattern of activity over many units [3]. If each unit partic

## Basic Text Wrangling

In [39]:
%%time
import nltk

stop_words = nltk.corpus.stopwords.words('english')
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
wnl = nltk.stem.wordnet.WordNetLemmatizer()

def normalize_corpus(papers):
    norm_papers = []
    for paper in papers:
        paper = paper.lower()
        paper_tokens = [token.strip() for token in wtk.tokenize(paper)]
        paper_tokens = [wnl.lemmatize(token) for token in paper_tokens if not token.isnumeric()]
        paper_tokens = [token for token in paper_tokens if len(token) > 1]
        paper_tokens = [token for token in paper_tokens if token not in stop_words]
        paper_tokens = list(filter(None, paper_tokens))
        if paper_tokens:
            norm_papers.append(paper_tokens)
            
    return norm_papers
    
norm_papers = normalize_corpus(papers)
print(len(norm_papers))

1740
CPU times: user 39.8 s, sys: 159 ms, total: 40 s
Wall time: 40 s


# Text Representation with Feature Engineering

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=20, max_df=0.6, ngram_range=(1,2),
                     token_pattern=None, tokenizer=lambda doc: doc,
                     preprocessor=lambda doc: doc)
cv_features = cv.fit_transform(norm_papers)
cv_features.shape

(1740, 14408)

In [9]:
vocabulary = np.array(cv.get_feature_names())
print('Total Vocabulary Size:', len(vocabulary))

Total Vocabulary Size: 14408


# Topic Models with Latent Semantic Indexing (LSI)

In [10]:
%%time
from sklearn.decomposition import TruncatedSVD

TOTAL_TOPICS = 20

lsi_model = TruncatedSVD(n_components=TOTAL_TOPICS, n_iter=500, random_state=42)
document_topics = lsi_model.fit_transform(cv_features)

CPU times: user 15min 25s, sys: 1min 3s, total: 16min 28s
Wall time: 1min 1s


In [11]:
topic_terms = lsi_model.components_
topic_terms.shape

(20, 14408)

In [12]:
top_terms = 20
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
topic_keyterm_weights = np.array([topic_terms[row, columns] 
                             for row, columns in list(zip(np.arange(TOTAL_TOPICS), topic_key_term_idxs))])
topic_keyterms = vocabulary[topic_key_term_idxs]
topic_keyterms_weights = list(zip(topic_keyterms, topic_keyterm_weights))
for n in range(TOTAL_TOPICS):
    print('Topic #'+str(n+1)+':')
    print('='*50)
    d1 = []
    d2 = []
    terms, weights = topic_keyterms_weights[n]
    term_weights = sorted([(t, w) for t, w in zip(terms, weights)], 
                          key=lambda row: -abs(row[1]))
    for term, wt in term_weights:
        if wt >= 0:
            d1.append((term, round(wt, 3)))
        else:
            d2.append((term, round(wt, 3)))

    print('Direction 1:', d1)
    print('-'*50)
    print('Direction 2:', d2)
    print('-'*50)
    print()

Topic #1:
Direction 1: [('state', 0.221), ('neuron', 0.169), ('image', 0.138), ('cell', 0.13), ('layer', 0.13), ('feature', 0.127), ('probability', 0.121), ('hidden', 0.114), ('distribution', 0.105), ('rate', 0.098), ('signal', 0.095), ('task', 0.093), ('class', 0.092), ('noise', 0.09), ('net', 0.089), ('recognition', 0.089), ('representation', 0.088), ('field', 0.082), ('rule', 0.082), ('step', 0.08)]
--------------------------------------------------
Direction 2: []
--------------------------------------------------

Topic #2:
Direction 1: [('cell', 0.417), ('neuron', 0.39), ('response', 0.175), ('stimulus', 0.155), ('visual', 0.131), ('spike', 0.13), ('firing', 0.117), ('synaptic', 0.11), ('activity', 0.104), ('cortex', 0.097), ('field', 0.085), ('frequency', 0.085), ('direction', 0.082), ('circuit', 0.082), ('motion', 0.082)]
--------------------------------------------------
Direction 2: [('state', -0.289), ('probability', -0.109), ('hidden', -0.098), ('class', -0.091), ('policy',

In [13]:
dt_df = pd.DataFrame(np.round(document_topics, 3), 
                     columns=['T'+str(i) for i in range(1, TOTAL_TOPICS+1)])
dt_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1730,1731,1732,1733,1734,1735,1736,1737,1738,1739
T1,34.982,28.542,25.907,34.163,44.592,108.127,24.209,9.863,43.867,30.974,...,26.53,51.697,22.319,39.656,37.7,42.738,22.219,26.262,32.831,44.796
T2,-5.852,-1.013,18.406,34.295,9.328,124.267,16.7,-1.871,49.251,1.37,...,-13.144,-4.074,-7.387,-17.564,-11.749,41.314,-5.562,-6.212,1.312,41.708
T3,1.41,10.889,3.165,-5.642,10.17,20.19,11.138,-1.272,17.976,-15.393,...,-8.551,-51.893,-3.767,-4.917,-0.491,-5.136,-5.486,-18.248,6.881,7.501
T4,-6.324,-4.124,2.862,12.258,-0.979,-6.059,-6.44,-3.087,-9.035,12.518,...,-9.464,49.427,-6.691,-20.647,-14.674,11.011,-2.055,1.509,2.634,-12.566
T5,3.741,2.986,0.767,-6.996,-7.441,-28.592,11.726,3.455,0.471,-12.691,...,-5.5,-21.342,-7.65,-26.803,-16.937,-20.949,-1.934,-0.972,-19.416,1.184
T6,-3.479,-8.386,2.634,20.382,20.87,124.431,-14.895,0.672,7.873,-13.965,...,3.976,-33.519,-0.643,-4.786,-6.352,16.011,-2.638,1.392,15.578,-18.402
T7,-9.397,-6.295,-3.28,-11.858,-1.963,11.538,-1.289,-3.861,6.792,-5.678,...,12.962,14.565,-2.714,1.883,5.691,-0.636,1.308,2.477,-1.715,29.722
T8,-14.575,-3.0,-1.043,-1.868,-19.29,-35.256,-6.677,0.189,-14.835,3.065,...,-1.531,-11.503,-4.121,22.629,28.148,-17.673,5.222,-11.032,-0.818,32.775
T9,-1.051,-3.557,0.694,-6.498,-2.187,-6.04,-3.979,0.098,-6.426,2.538,...,1.861,-9.045,7.977,-9.376,-6.113,-5.274,2.863,-1.993,-1.686,22.51
T10,1.758,1.996,-8.663,10.108,27.037,28.986,-4.839,1.068,-4.521,4.474,...,-2.462,4.377,-5.835,-3.005,5.621,7.271,-0.363,-15.318,4.018,-1.234


In [14]:
document_numbers = [13, 250, 500]

for document_number in document_numbers:
    top_topics = list(dt_df.columns[np.argsort(-np.absolute(dt_df.iloc[document_number].values))[:3]])
    print('Document #'+str(document_number)+':')
    print('Dominant Topics (top 3):', top_topics)
    print('Paper Summary:')
    print(papers[document_number][:500])
    print()

Document #13:
Dominant Topics (top 3): ['T1', 'T6', 'T4']
Paper Summary:
9 
Stochastic Learning Networks and their Electronic Implementation 
Joshua Alspector*, Robert B. Allen, Victor Hut, and Srinagesh Satyanarayana 
Bell Communications Research, Morristown, NJ 07960 
ABSTRACT
We describe a family of learning algorithms that operate on a recurrent, symmetrically 
connected, neuromorphic network that, like the Boltzmann machine, settles in the 
presence of noise. These networks learn by modifying synaptic connection strengths on 
the basis of correlations seen loca

Document #250:
Dominant Topics (top 3): ['T3', 'T18', 'T4']
Paper Summary:
266 Zemel, Mozer and Hinton 
TRAFFIC: Recognizing Objects Using 
Hierarchical Reference Frame Transformations 
Richard S. Zemel 
Computer Science Dept. 
University of Toronto 
Toronto, ONT M5S 1A4 
Michael C. Mozer 
Computer Science Dept. 
University of Colorado 
Boulder, CO 80309-0430 
Geoffrey E. Hinton 
Computer Science Dept. 
University of Toro

# Topic Models with Latent Dirichlet Allocation (LDA)

In [15]:
%%time
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components =TOTAL_TOPICS, max_iter=500, max_doc_update_iter=50,
                                      learning_method='online', batch_size=1740, learning_offset=50., 
                                      random_state=42, n_jobs=16)
document_topics = lda_model.fit_transform(cv_features)

CPU times: user 13min 14s, sys: 1min 41s, total: 14min 56s
Wall time: 55min 32s


In [16]:
topic_terms = lda_model.components_

In [17]:
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
topic_keyterms = vocabulary[topic_key_term_idxs]
topics = [', '.join(topic) for topic in topic_keyterms]
pd.set_option('display.max_colwidth', -1)
topics_df = pd.DataFrame(topics,
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, TOTAL_TOPICS+1)])
topics_df

Unnamed: 0,Terms per Topic
Topic1,"neuron, circuit, analog, chip, current, voltage, signal, threshold, bit, noise, vlsi, implementation, channel, gate, pulse, processor, element, synapse, parallel, fig"
Topic2,"image, feature, structure, state, layer, neuron, distribution, local, cell, recognition, node, motion, matrix, net, sequence, object, gaussian, hidden, size, line"
Topic3,"neuron, cell, image, class, state, response, rule, feature, rate, probability, representation, hidden, dynamic, et al, frequency, spike, distribution, component, level, recognition"
Topic4,"cell, neuron, response, visual, stimulus, activity, field, spike, motion, synaptic, direction, frequency, signal, cortex, firing, orientation, spatial, eye, rate, map"
Topic5,"image, feature, recognition, layer, hidden, task, object, speech, trained, representation, test, net, classification, classifier, class, level, architecture, experiment, node, rule"
Topic6,"state, dynamic, rule, matrix, recurrent, equation, gradient, net, signal, fixed, sequence, node, source, attractor, hidden, structure, step, fixed point, component, activation"
Topic7,"sequence, chain, region, structure, markov, protein, prediction, hmms, markov model, hidden markov, site, hidden, gene, class, receptor, length, human, distance, mouse, bengio"
Topic8,"memory, word, context, similarity, item, recall, probability, phoneme, short, representation, association, activation, list, serial, short term, address, term memory, store, proximity, phone"
Topic9,"activation, motor, behavior, winner, take, winner take, competitive, active, command, connection, movement, sensory, feedback, wta, net, sensor, body, activation function, level, self"
Topic10,"state, cell, distribution, neuron, probability, control, response, signal, task, rate, layer, architecture, random, hidden, test, image, change, fig, generalization, field"


In [18]:
pd.options.display.float_format = '{:,.3f}'.format
dt_df = pd.DataFrame(document_topics, 
                     columns=['T'+str(i) for i in range(1, TOTAL_TOPICS+1)])
dt_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1730,1731,1732,1733,1734,1735,1736,1737,1738,1739
T1,0.011,0.137,0.017,0.0,0.219,0.034,0.477,0.218,0.12,0.063,...,0.0,0.0,0.0,0.0,0.0,0.0,0.115,0.0,0.0,0.482
T2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T4,0.028,0.0,0.663,0.773,0.262,0.802,0.03,0.0,0.786,0.269,...,0.0,0.107,0.0,0.0,0.0,0.999,0.0,0.0,0.13,0.518
T5,0.227,0.035,0.045,0.212,0.024,0.086,0.086,0.37,0.089,0.402,...,0.21,0.591,0.043,0.007,0.019,0.0,0.431,0.466,0.0,0.0
T6,0.446,0.579,0.0,0.0,0.238,0.022,0.102,0.022,0.0,0.036,...,0.0,0.094,0.0,0.0,0.33,0.0,0.162,0.0,0.0,0.0
T7,0.0,0.0,0.026,0.015,0.008,0.0,0.0,0.0,0.005,0.0,...,0.0,0.0,0.0,0.003,0.002,0.0,0.0,0.0,0.003,0.0
T8,0.062,0.0,0.0,0.0,0.0,0.017,0.041,0.013,0.0,0.0,...,0.04,0.0,0.0,0.0,0.0,0.0,0.014,0.0,0.0,0.0
T9,0.0,0.025,0.113,0.0,0.0,0.021,0.116,0.0,0.0,0.0,...,0.003,0.0,0.0,0.0,0.0,0.0,0.015,0.0,0.0,0.0
T10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
pd.options.display.float_format = '{:,.5f}'.format
pd.set_option('display.max_colwidth', 200)

max_contrib_topics = dt_df.max(axis=0)
dominant_topics = max_contrib_topics.index
contrib_perc = max_contrib_topics.values
document_numbers = [dt_df[dt_df[t] == max_contrib_topics.loc[t]].index[0]
                       for t in dominant_topics]
documents = [papers[i] for i in document_numbers]

results_df = pd.DataFrame({'Dominant Topic': dominant_topics, 'Contribution %': contrib_perc,
                          'Paper Num': document_numbers, 'Topic': topics_df['Terms per Topic'], 
                          'Paper Name': documents})
results_df

Unnamed: 0,Dominant Topic,Contribution %,Paper Num,Topic,Paper Name
Topic1,T1,0.99938,1122,"neuron, circuit, analog, chip, current, voltage, signal, threshold, bit, noise, vlsi, implementation, channel, gate, pulse, processor, element, synapse, parallel, fig","Improved Silicon Cochlea \nusing \nCompatible Lateral Bipolar Transistors \nAndr6 van Schaik, Eric Fragnire, Eric Vittoz \nMANTRA Center for Neuromimetic Systems \nSwiss Federal Institute of Tech..."
Topic2,T2,0.00033,151,"image, feature, structure, state, layer, neuron, distribution, local, cell, recognition, node, motion, matrix, net, sequence, object, gaussian, hidden, size, line",794 \nNEURAL ARCHITECTURE \nValentino Braitenberg \nMax Planck Institute \nFederal Republic of Germany \nABSTRACT\nWhile we are waiting for the ultimate biophysics of cell membranes and synapses \...
Topic3,T3,0.00033,151,"neuron, cell, image, class, state, response, rule, feature, rate, probability, representation, hidden, dynamic, et al, frequency, spike, distribution, component, level, recognition",794 \nNEURAL ARCHITECTURE \nValentino Braitenberg \nMax Planck Institute \nFederal Republic of Germany \nABSTRACT\nWhile we are waiting for the ultimate biophysics of cell membranes and synapses \...
Topic4,T4,0.99947,1735,"cell, neuron, response, visual, stimulus, activity, field, spike, motion, synaptic, direction, frequency, signal, cortex, firing, orientation, spatial, eye, rate, map",Can V1 mechanisms account for \nfigure-ground and medial axis effects? \nZhaoping Li \nGatsby Computational Neuroscience Unit \nUniversity College London \nzhaopinggat shy. ucl. ac. uk \nAbstract...
Topic5,T5,0.99949,177,"image, feature, recognition, layer, hidden, task, object, speech, trained, representation, test, net, classification, classifier, class, level, architecture, experiment, node, rule","215 \nConsonant Recognition by Modular Construction of \nLarge Phonemic Time-Delay Neural Networks \nAlex Waibel \nCarnegie-Mellon University \nPittsburgh, PA 15213, \nATR Interpreting Telephony R..."
Topic6,T6,0.99684,1128,"state, dynamic, rule, matrix, recurrent, equation, gradient, net, signal, fixed, sequence, node, source, attractor, hidden, structure, step, fixed point, component, activation","Finite State Automata that Recurrent \nCascade-Correlation Cannot Represent \nStefan C. Kremer \nDepartment of Computing Science \nUniversity of Alberta \nEdmonton, Alberta, CANADA T6H 5B5 \nAbstr..."
Topic7,T7,0.99956,283,"sequence, chain, region, structure, markov, protein, prediction, hmms, markov model, hidden markov, site, hidden, gene, class, receptor, length, human, distance, mouse, bengio","A Neural Network to Detect \nHomologies in Proteins \nYoshua Bengio \nSchool of Computer Science \nMcGill University \nMontreal, Canada H3A 2A7 \nSamy Bengio \nDepartement d'Informatique \nUnivers..."
Topic8,T8,0.98167,892,"memory, word, context, similarity, item, recall, probability, phoneme, short, representation, association, activation, list, serial, short term, address, term memory, store, proximity, phone","A solvable connectionist model of \nimmediate recall of ordered lists \nNell Burgess \nDepartment of Anatomy, University College London \nLondon WCiE 6BT, England \n(e-mail: n .burgessucl. ac. uk..."
Topic9,T9,0.99929,227,"activation, motor, behavior, winner, take, winner take, competitive, active, command, connection, movement, sensory, feedback, wta, net, sensor, body, activation function, level, self","44 Beer and Chiei \nNeural \nImplementation of Motivated Behavior: \nFeeding in an Artificial Insect \nRandall D. Beer t,2 and Hillel J. Chiel 2 \nDepartments of t Computer Engineering and Science..."
Topic10,T10,0.00033,151,"state, cell, distribution, neuron, probability, control, response, signal, task, rate, layer, architecture, random, hidden, test, image, change, fig, generalization, field",794 \nNEURAL ARCHITECTURE \nValentino Braitenberg \nMax Planck Institute \nFederal Republic of Germany \nABSTRACT\nWhile we are waiting for the ultimate biophysics of cell membranes and synapses \...


# Topic Models with Non-Negative Matrix Factorization (NMF)

In [20]:
%%time
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=TOTAL_TOPICS, solver='cd', max_iter=500,
                random_state=42, alpha=.1, l1_ratio=.85)
document_topics = nmf_model.fit_transform(cv_features)

CPU times: user 11min 39s, sys: 47.5 s, total: 12min 26s
Wall time: 46.7 s


In [22]:
topic_terms = nmf_model.components_
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
topic_keyterms = vocabulary[topic_key_term_idxs]
topics = [', '.join(topic) for topic in topic_keyterms]
pd.set_option('display.max_colwidth', -1)
topics_df = pd.DataFrame(topics,
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, TOTAL_TOPICS+1)])
topics_df

Unnamed: 0,Terms per Topic
Topic1,"bound, generalization, size, let, optimal, solution, theorem, equation, approximation, class, gradient, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum"
Topic2,"neuron, synaptic, connection, potential, dynamic, synapsis, activity, excitatory, layer, synapse, simulation, inhibitory, delay, biological, equation, state, et, et al, activation, firing"
Topic3,"state, action, policy, step, optimal, reinforcement, transition, reinforcement learning, probability, reward, dynamic, value function, markov, machine, task, agent, finite, iteration, sequence, decision"
Topic4,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, region, visual, representation, transformation, surface"
Topic5,"hidden, layer, net, hidden unit, task, hidden layer, architecture, back, propagation, trained, connection, back propagation, activation, representation, generalization, output unit, neural net, training set, learn, test"
Topic6,"cell, firing, direction, head, rat, response, layer, synaptic, activity, spatial, inhibitory, synapsis, ii, cue, cortex, simulation, lot, active, complex, property"
Topic7,"word, recognition, speech, context, hmm, speaker, speech recognition, character, phoneme, probability, frame, sequence, rate, test, level, acoustic, experiment, letter, segmentation, state"
Topic8,"signal, noise, source, filter, component, frequency, channel, speech, matrix, independent, separation, sound, ica, phase, eeg, blind, auditory, dynamic, delay, fig"
Topic9,"control, controller, trajectory, motor, dynamic, movement, forward, task, feedback, arm, inverse, position, robot, architecture, hand, force, adaptive, change, command, plant"
Topic10,"circuit, chip, current, analog, voltage, vlsi, gate, threshold, transistor, pulse, design, implementation, synapse, bit, digital, device, analog vlsi, element, cmos, pp"


In [25]:
pd.options.display.float_format = '{:,.3f}'.format
dt_df = pd.DataFrame(document_topics, 
                     columns=['T'+str(i) for i in range(1, TOTAL_TOPICS+1)])
dt_df.head(10)

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16,T17,T18,T19,T20
0,0.444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004,0.263,0.0,0.0,0.0,0.0,0.0,3.437
1,0.394,0.595,0.463,0.019,0.187,0.037,0.0,0.228,0.13,0.029,0.0,0.254,0.0,0.0,0.0,0.0,0.0,0.106,0.0,0.21
2,0.032,0.619,0.003,0.067,0.016,0.378,0.029,0.027,0.448,0.0,0.075,0.036,0.024,0.184,0.1,0.0,0.126,0.0,0.656,0.277
3,0.0,0.274,0.0,0.102,0.265,1.019,0.0,0.0,0.0,0.0,0.0,0.0,0.218,0.011,0.0,0.004,1.299,0.291,1.268,0.295
4,0.06,0.188,0.682,0.257,0.167,1.402,0.0,0.093,0.0,0.001,0.0,0.02,1.749,0.037,0.0,0.344,0.0,0.0,0.164,0.121
5,0.0,0.383,0.0,0.0,0.679,7.51,0.016,0.0,0.0,0.326,1.146,1.923,0.098,0.0,0.0,0.202,0.0,0.426,0.646,0.641
6,0.0,1.415,0.02,0.0,0.046,0.044,0.0,0.114,0.333,0.04,0.0,0.032,0.124,0.0,0.041,0.041,0.075,0.03,0.0,0.615
7,0.147,0.029,0.0,0.0,0.274,0.008,0.042,0.0,0.045,0.08,0.008,0.025,0.022,0.009,0.0,0.023,0.0,0.007,0.0,0.096
8,0.084,1.76,0.013,0.012,0.0,1.592,0.0,0.0,0.257,0.068,0.273,0.055,0.122,0.0,0.119,0.0,0.0,0.027,0.514,0.353
9,0.395,0.0,0.04,1.258,0.127,0.0,0.0,0.37,0.075,0.076,0.0,0.042,0.0,0.017,0.0,0.053,0.041,0.133,0.427,0.0


In [34]:
pd.options.display.float_format = '{:,.5f}'.format
pd.set_option('display.max_colwidth', 200)

max_score_topics = dt_df.max(axis=0)
dominant_topics = max_score_topics.index
term_score = max_score_topics.values
document_numbers = [dt_df[dt_df[t] == max_score_topics.loc[t]].index[0]
                       for t in dominant_topics]
documents = [papers[i] for i in document_numbers]

results_df = pd.DataFrame({'Dominant Topic': dominant_topics, 'Max Score': term_score,
                          'Paper Num': document_numbers, 'Topic': topics_df['Terms per Topic'], 
                          'Paper Name': documents})
results_df

Unnamed: 0,Dominant Topic,Max Score,Paper Num,Topic,Paper Name
Topic1,T1,1.64138,991,"bound, generalization, size, let, optimal, solution, theorem, equation, approximation, class, gradient, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum","A Bound on the Error of Cross Validation Using \nthe Approximation and Estimation Rates, with \nConsequences for the Training-Test Split \nMichael Kearns \nAT&T Research \nABSTRACT\n1 INTRODUCTION..."
Topic2,T2,3.58149,383,"neuron, synaptic, connection, potential, dynamic, synapsis, activity, excitatory, layer, synapse, simulation, inhibitory, delay, biological, equation, state, et, et al, activation, firing","Signal Processing by Multiplexing and \nDemultiplexing in Neurons \nDavid C. Tam \nDivision of Neuroscience \nBaylor College of Medicine \nHouston, TX 77030 \ndtamCnext-cns.neusc.bcm.tmc.edu \nAb..."
Topic3,T3,5.83072,1167,"state, action, policy, step, optimal, reinforcement, transition, reinforcement learning, probability, reward, dynamic, value function, markov, machine, task, agent, finite, iteration, sequence, de...","Reinforcement Learning for Mixed \nOpen-loop and Closed-loop Control \nEric A. Hansen, Andrew G. Barto, and Shlomo Zilbersteln \nDepartment of Computer Science \nUniversity of Massachusetts \nAmhe..."
Topic4,T4,3.93349,1731,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, region, visual, representation, transformation, surface",Image representations for facial expression \ncoding \nMarian Stewart Bartlett* \nU.C. San Diego \nmarnisalk. edu \nJavier R. Movellan \nU.C. San Diego \nmovellancogsc. ucsd. edu \nPaul Ekman \n...
Topic5,T5,2.9875,33,"hidden, layer, net, hidden unit, task, hidden layer, architecture, back, propagation, trained, connection, back propagation, activation, representation, generalization, output unit, neural net, tr...","5O5 \nCONNECTING TO THE PAST \nBruce A. MacDonald, Assistant Professor \nKnowledge Sciences Laboratory, Computer Science Department \nThe University of Calgary, 2500 University Drive NW \nCalgary,..."
Topic6,T6,7.51003,5,"cell, firing, direction, head, rat, response, layer, synaptic, activity, spatial, inhibitory, synapsis, ii, cue, cortex, simulation, lot, active, complex, property","317 \nPARTITIONING OF SENSORY DATA BY A COPTICAI, NETWOPK  \nRichard Granger, Jos Ambros-Ingerson, Howard Henry, Gary Lynch \nCenter for the Neurobiology of Learning and Memory \nUniversity of..."
Topic7,T7,4.89525,1318,"word, recognition, speech, context, hmm, speaker, speech recognition, character, phoneme, probability, frame, sequence, rate, test, level, acoustic, experiment, letter, segmentation, state","Comparison of Human and Machine Word \nRecognition \nM. Schenkel \nDept of Electrical Eng. \nUniversity of Sydney \nSydney, NSW 2006, Australia \nschenkel@sedal.usyd.edu.au \nC. Latimer \nDept of ..."
Topic8,T8,3.67982,235,"signal, noise, source, filter, component, frequency, channel, speech, matrix, independent, separation, sound, ica, phase, eeg, blind, auditory, dynamic, delay, fig","232 Sejnowski, Yuhas, Goldstein and Jenkins \nCombining Visual and \nwith a Neural Network \nAcoustic Speech Signals \nImproves Intelligibility \nT.J. Sejnowski \nThe Salk Institute \nand \nDepart..."
Topic9,T9,4.88831,948,"control, controller, trajectory, motor, dynamic, movement, forward, task, feedback, arm, inverse, position, robot, architecture, hand, force, adaptive, change, command, plant","An Integrated Architecture of Adaptive Neural Network \nControl for Dynamic Systems \nLiu Ke '2 Robert L. Tokaf Brian D.McVey z \nCenter for Nonlinear Studies, 2Applied Theoretical Physics Divis..."
Topic10,T10,2.95973,1690,"circuit, chip, current, analog, voltage, vlsi, gate, threshold, transistor, pulse, design, implementation, synapse, bit, digital, device, analog vlsi, element, cmos, pp","Kirchoff Law Markov Fields for Analog \nCircuit Design \nRichard M. Golden * \nRMG Consulting Inc. \n2000 Fresno Road, Plano, Texas 75074 \nRMG CONS UL T@A OL. COM, \nwww. neural-network. corn \nA..."


# Predicting Topics for New Research Papers

In [37]:
import glob
# papers manually downloaded from NIPS 16
# https://papers.nips.cc/book/advances-in-neural-information-processing-systems-29-2016

new_paper_files = glob.glob('./test_data/nips16*.txt')
new_papers = []
for fn in new_paper_files:
    with open(fn, encoding='utf-8', errors='ignore', mode='r+') as f:
        data = f.read()
        new_papers.append(data)
              
print('Total New Papers:', len(new_papers))

Total New Papers: 4


In [40]:
norm_new_papers = normalize_corpus(new_papers)
cv_new_features = cv.transform(norm_new_papers)
cv_new_features.shape

(4, 14408)

In [52]:
topic_predictions = nmf_model.transform(cv_new_features)
best_topics = [[(topic, round(sc, 3)) 
                    for topic, sc in sorted(enumerate(topic_predictions[i]), 
                                            key=lambda row: -row[1])[:2]] 
                        for i in range(len(topic_predictions))]
best_topics

[[(0, 1.312), (7, 0.966)],
 [(2, 4.121), (0, 0.864)],
 [(3, 2.154), (1, 1.335)],
 [(3, 3.074), (6, 2.19)]]

In [55]:
results_df = pd.DataFrame()
results_df['Papers'] = range(1, len(new_papers)+1)
results_df['Dominant Topics'] = [[topic_num+1 for topic_num, sc in item] for item in best_topics]
res = results_df.set_index(['Papers'])['Dominant Topics'].apply(pd.Series).stack().reset_index(level=1, drop=True)
results_df = pd.DataFrame({'Dominant Topics': res.values}, index=res.index)
results_df['Topic Score'] = [topic_sc for topic_list in 
                                        [[round(sc*100, 2) 
                                              for topic_num, sc in item] 
                                                 for item in best_topics] 
                                    for topic_sc in topic_list]

results_df['Topic Desc'] = [topics_df.iloc[t-1]['Terms per Topic'] for t in results_df['Dominant Topics'].values]
results_df['Paper Desc'] = [new_papers[i-1][:200] for i in results_df.index.values]

results_df

Unnamed: 0_level_0,Dominant Topics,Topic Score,Topic Desc,Paper Desc
Papers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,131.2,"bound, generalization, size, let, optimal, solution, theorem, equation, approximation, class, gradient, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum","Correlated-PCA: Principal Components’ Analysis\nwhen Data and Noise are Correlated\nNamrata Vaswani and Han Guo\nIowa State University, Ames, IA, USA\nEmail: {namrata,hanguo}@iastate.edu\nAbstract..."
1,8,96.6,"signal, noise, source, filter, component, frequency, channel, speech, matrix, independent, separation, sound, ica, phase, eeg, blind, auditory, dynamic, delay, fig","Correlated-PCA: Principal Components’ Analysis\nwhen Data and Noise are Correlated\nNamrata Vaswani and Han Guo\nIowa State University, Ames, IA, USA\nEmail: {namrata,hanguo}@iastate.edu\nAbstract..."
2,3,412.1,"state, action, policy, step, optimal, reinforcement, transition, reinforcement learning, probability, reward, dynamic, value function, markov, machine, task, agent, finite, iteration, sequence, de...","PAC Reinforcement Learning with Rich Observations\nAkshay Krishnamurthy\nUniversity of Massachusetts, Amherst\nAmherst, MA, 01003\nakshay@cs.umass.edu\nAlekh Agarwal\nMicrosoft Research\nNew York,..."
2,1,86.4,"bound, generalization, size, let, optimal, solution, theorem, equation, approximation, class, gradient, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum","PAC Reinforcement Learning with Rich Observations\nAkshay Krishnamurthy\nUniversity of Massachusetts, Amherst\nAmherst, MA, 01003\nakshay@cs.umass.edu\nAlekh Agarwal\nMicrosoft Research\nNew York,..."
3,4,215.4,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, region, visual, representation, transformation, surface","Automated scalable segmentation of neurons from\nmultispectral images\nUygar Sümbül\nGrossman Center for the Statistics of Mind\nand Dept. of Statistics, Columbia University\nDouglas Roossien Jr.\..."
3,2,133.5,"neuron, synaptic, connection, potential, dynamic, synapsis, activity, excitatory, layer, synapse, simulation, inhibitory, delay, biological, equation, state, et, et al, activation, firing","Automated scalable segmentation of neurons from\nmultispectral images\nUygar Sümbül\nGrossman Center for the Statistics of Mind\nand Dept. of Statistics, Columbia University\nDouglas Roossien Jr.\..."
4,4,307.4,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, region, visual, representation, transformation, surface","Unsupervised Learning of Spoken Language with\nVisual Context\nDavid Harwath, Antonio Torralba, and James R. Glass\nComputer Science and Artificial Intelligence Laboratory\nMassachusetts Institute..."
4,7,219.0,"word, recognition, speech, context, hmm, speaker, speech recognition, character, phoneme, probability, frame, sequence, rate, test, level, acoustic, experiment, letter, segmentation, state","Unsupervised Learning of Spoken Language with\nVisual Context\nDavid Harwath, Antonio Torralba, and James R. Glass\nComputer Science and Artificial Intelligence Laboratory\nMassachusetts Institute..."


# Persisting Model and Transformers

### This is just for visualizing the topics in the other notebook (since PyLDAViz expands the notebook size)

In [67]:
import dill

with open('nmf_model.pkl', 'wb') as f:
    dill.dump(nmf_model, f)
with open('cv_features.pkl', 'wb') as f:
    dill.dump(cv_features, f)
with open('cv.pkl', 'wb') as f:
    dill.dump(cv, f)