# Load and Pre-process Data

In [1]:
# Fetch the NIPS raw-text archive and unpack it in the current working directory.
# Both lines are IPython shell escapes, so they are executed each time the cell runs.
!wget https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz
!tar -xzf nips12raw_str602.tgz

--2020-08-22 07:35:02--  https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz
Resolving cs.nyu.edu (cs.nyu.edu)... 128.122.49.30
Connecting to cs.nyu.edu (cs.nyu.edu)|128.122.49.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12851423 (12M) [application/x-gzip]
Saving to: ‘nips12raw_str602.tgz’


2020-08-22 07:35:03 (14.6 MB/s) - ‘nips12raw_str602.tgz’ saved [12851423/12851423]



In [2]:
import os
import numpy as np
import pandas as pd

# Root folder of the NIPS text corpus.
DATA_PATH = 'nipstxt/'
# Sanity check: list the entries found directly under the corpus root.
print(os.listdir(DATA_PATH))

['nips08', 'idx', 'nips09', 'MATLAB_NOTES', 'nips04', 'nips07', 'nips11', 'nips03', 'nips05', 'orig', 'nips06', 'nips02', 'README_yann', 'nips00', 'nips01', 'RAW_DATA_NOTES', 'nips12', 'nips10']


In [3]:
# Yearly sub-folders of the corpus: 'nips00' through 'nips12'.
folders = ["nips{0:02}".format(i) for i in range(0, 13)]

# Read every paper into memory as one raw string per file.
# NOTE: os.listdir() returns entries in arbitrary order, so the position of a
# paper in `papers` can differ between machines.
papers = []
for folder in folders:
    folder_path = os.path.join(DATA_PATH, folder)
    for file_name in os.listdir(folder_path):
        # Open read-only: nothing is written here, and 'r+' would fail on
        # read-only files. Bytes that are not valid UTF-8 are silently dropped.
        with open(os.path.join(folder_path, file_name),
                  encoding='utf-8', errors='ignore', mode='r') as f:
            papers.append(f.read())
len(papers)

1740

In [4]:
print(papers[0][:1000])

573 
BIT - SERIAL NEURAL NETWORKS 
Alan F. Murray, Anthony V. W. Smith and Zoe F. Buffer. 
Department of Electrical Engineering, University of Edinburgh, 
The King's Buildings, Mayfield Road, Edinburgh, 
Scoff and, EH9 3JL. 
ABSTRACT 
A bit - serial VLSI neural network is described from an initial architecture for a 
synapse array through to silicon layout and board design. The issues surrounding bit 
- serial computation, and analog/digital arithmetic are discussed and the parallel 
development of a hybrid analog/digital neural network is outlined. Learning and 
recall capabilities are reported for the bit - serial network along with a projected 
specification for a 64 - neuron, bit - serial board operating at 20 MHz. This tech- 
nique is extended to a 256 (2562 synapses) network with an update time of 3ms, 
using a "paging" technique to time - multiplex calculations through the synapse 
array. 
1. INTRODUCTION 
The functions a synthetic neural network may aspire to mimic are the abil

## Basic Text Wrangling

In [6]:
%%time
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
  

# English stop-word list from the NLTK 'stopwords' corpus downloaded above.
stop_words = nltk.corpus.stopwords.words('english')
# Tokenizer that keeps runs of word characters (\w+) and discards everything else.
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
# WordNet lemmatizer used to normalize tokens.
wnl = nltk.stem.wordnet.WordNetLemmatizer()

def normalize_corpus(papers):
    """Turn raw paper texts into lists of normalized tokens.

    Each paper is lower-cased and split with the module-level tokenizer ``wtk``.
    Purely numeric tokens are dropped, the remaining tokens are lemmatized with
    ``wnl``, and lemmas that are a single character long or that appear in
    ``stop_words`` are removed.

    Parameters
    ----------
    papers : iterable of str
        Raw document texts.

    Returns
    -------
    list of list of str
        One token list per paper, in input order. Papers that end up with no
        tokens are omitted, so the result can be shorter than the input.
    """
    # Set membership is O(1); scanning the stop-word list for every token is not.
    stop_word_set = set(stop_words)

    norm_papers = []
    for paper in papers:
        paper_tokens = []
        for token in wtk.tokenize(paper.lower()):
            if token.isnumeric():
                continue
            lemma = wnl.lemmatize(token)
            # Length and stop-word checks are applied to the lemma, not the raw token.
            if len(lemma) > 1 and lemma not in stop_word_set:
                paper_tokens.append(lemma)
        if paper_tokens:
            norm_papers.append(paper_tokens)

    return norm_papers
    
# Normalize the whole corpus; the printed count is the number of papers that
# kept at least one token after normalization.
norm_papers = normalize_corpus(papers)
print(len(norm_papers))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
1740
CPU times: user 28.7 s, sys: 341 ms, total: 29.1 s
Wall time: 29.4 s


In [12]:
print(norm_papers[0])

['bit', 'serial', 'neural', 'network', 'alan', 'murray', 'anthony', 'smith', 'zoe', 'buffer', 'department', 'electrical', 'engineering', 'university', 'edinburgh', 'king', 'building', 'mayfield', 'road', 'edinburgh', 'scoff', 'eh9', '3jl', 'abstract', 'bit', 'serial', 'vlsi', 'neural', 'network', 'described', 'initial', 'architecture', 'synapse', 'array', 'silicon', 'layout', 'board', 'design', 'issue', 'surrounding', 'bit', 'serial', 'computation', 'analog', 'digital', 'arithmetic', 'discussed', 'parallel', 'development', 'hybrid', 'analog', 'digital', 'neural', 'network', 'outlined', 'learning', 'recall', 'capability', 'reported', 'bit', 'serial', 'network', 'along', 'projected', 'specification', 'neuron', 'bit', 'serial', 'board', 'operating', 'mhz', 'tech', 'nique', 'extended', 'synapsis', 'network', 'update', 'time', '3ms', 'using', 'paging', 'technique', 'time', 'multiplex', 'calculation', 'synapse', 'array', 'introduction', 'function', 'synthetic', 'neural', 'network', 'may', 'a

# Text Representation with Feature Engineering

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

def _passthrough(doc):
    """Return the document unchanged; papers are already lists of tokens."""
    return doc


# The input is pre-tokenized, so preprocessing and tokenizing are both identity
# steps and the regex token pattern is switched off. Unigrams and bigrams are
# counted; terms in fewer than 20 papers or in more than 60% of them are dropped.
cv = CountVectorizer(
    ngram_range=(1, 2),
    min_df=20,
    max_df=0.6,
    preprocessor=_passthrough,
    tokenizer=_passthrough,
    token_pattern=None,
)
cv_features = cv.fit_transform(norm_papers)
cv_features.shape

(1740, 14408)

In [8]:
# Vocabulary learned by the vectorizer, as an array that can be indexed by
# feature column. scikit-learn 1.0 introduced get_feature_names_out() and 1.2
# removed get_feature_names(), so prefer the new accessor and fall back to the
# old one on older releases.
if hasattr(cv, 'get_feature_names_out'):
    _feature_names = cv.get_feature_names_out()
else:
    _feature_names = cv.get_feature_names()
# Going through list() keeps a fixed-width string dtype in both cases.
vocabulary = np.array(list(_feature_names))
print('Total Vocabulary Size:', len(vocabulary))

Total Vocabulary Size: 14408


In [14]:
vocabulary

array(['0i', '0j', '0o', ..., 'zt', 'zx', 'zz'], dtype='<U28')

# Topic Models with Latent Semantic Indexing (LSI)

In [15]:
%%time
from sklearn.decomposition import TruncatedSVD

# Number of topics (latent dimensions) used by every topic model in this notebook.
TOTAL_TOPICS = 20

# LSI: truncated SVD of the document-term count matrix.
# NOTE(review): n_iter=500 is far above scikit-learn's default and presumably
# accounts for most of this cell's runtime — confirm it is actually needed.
lsi_model = TruncatedSVD(n_components=TOTAL_TOPICS, n_iter=500, random_state=42)
# Document-topic matrix: one row per paper, one column per topic.
document_topics = lsi_model.fit_transform(cv_features)

CPU times: user 1min 27s, sys: 1min 3s, total: 2min 30s
Wall time: 1min 16s


In [24]:
document_topics.shape

(1740, 20)

In [16]:
# Topic-term weight matrix of the fitted LSI model: one row per topic, one
# column per vocabulary entry.
topic_terms = lsi_model.components_
topic_terms.shape

(20, 14408)

In [17]:
topic_terms

array([[ 1.12927439e-03,  6.09288836e-04,  1.51008884e-03, ...,
         4.34334087e-03,  1.01937083e-03,  5.07621376e-04],
       [-3.00632887e-04, -4.76607711e-04,  1.12743502e-03, ...,
        -6.32378629e-03, -1.05511388e-03, -2.69528891e-04],
       [-1.74476092e-04, -8.63521835e-05,  7.45203760e-04, ...,
         4.88534437e-03, -2.09449111e-04, -6.79315314e-05],
       ...,
       [-1.00841993e-03,  2.29490496e-04,  2.33954453e-04, ...,
         3.84500572e-03, -4.62599365e-04,  1.14622887e-03],
       [ 5.36057499e-05,  6.68163037e-04,  9.11247662e-04, ...,
        -3.91911452e-03,  2.77760569e-04, -7.74824840e-04],
       [-1.80776507e-03,  5.75130384e-04,  1.94317046e-04, ...,
        -1.05594410e-03, -6.62353527e-05, -4.32904953e-04]])

In [18]:
# Number of highest-magnitude terms reported per topic.
top_terms = 20

# Column indices of each topic's strongest terms, ordered by decreasing |weight|.
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
# Signed weights found at those positions, topic by topic.
topic_keyterm_weights = np.array(
    [topic_terms[row, columns]
     for row, columns in enumerate(topic_key_term_idxs[:TOTAL_TOPICS])]
)
# Vocabulary entries at the same positions, paired with their weights.
topic_keyterms = vocabulary[topic_key_term_idxs]
topic_keyterms_weights = list(zip(topic_keyterms, topic_keyterm_weights))

for topic_no in range(1, TOTAL_TOPICS + 1):
    print('Topic #' + str(topic_no) + ':')
    print('=' * 50)
    terms, weights = topic_keyterms_weights[topic_no - 1]
    ranked = sorted(zip(terms, weights), key=lambda pair: -abs(pair[1]))
    # An LSI topic is a signed axis: report its positive and negative ends separately.
    positive_end = [(term, round(wt, 3)) for term, wt in ranked if wt >= 0]
    negative_end = [(term, round(wt, 3)) for term, wt in ranked if not wt >= 0]

    print('Direction 1:', positive_end)
    print('-' * 50)
    print('Direction 2:', negative_end)
    print('-' * 50)
    print()

Topic #1:
Direction 1: [('state', 0.221), ('neuron', 0.169), ('image', 0.138), ('cell', 0.13), ('layer', 0.13), ('feature', 0.127), ('probability', 0.121), ('hidden', 0.114), ('distribution', 0.105), ('rate', 0.098), ('signal', 0.095), ('task', 0.093), ('class', 0.092), ('noise', 0.09), ('net', 0.089), ('recognition', 0.089), ('representation', 0.088), ('field', 0.082), ('rule', 0.082), ('step', 0.08)]
--------------------------------------------------
Direction 2: []
--------------------------------------------------

Topic #2:
Direction 1: [('cell', 0.417), ('neuron', 0.39), ('response', 0.175), ('stimulus', 0.155), ('visual', 0.131), ('spike', 0.13), ('firing', 0.117), ('synaptic', 0.11), ('activity', 0.104), ('cortex', 0.097), ('field', 0.085), ('frequency', 0.085), ('direction', 0.082), ('circuit', 0.082), ('motion', 0.082)]
--------------------------------------------------
Direction 2: [('state', -0.289), ('probability', -0.109), ('hidden', -0.098), ('class', -0.091), ('policy',

In [22]:
document_topics.shape

(1740, 20)

In [19]:
# Label the topic columns T1..T<TOTAL_TOPICS> and round the scores for display.
topic_labels = ['T{}'.format(i + 1) for i in range(TOTAL_TOPICS)]
dt_df = pd.DataFrame(np.round(document_topics, 3), columns=topic_labels)
# Transposed view: topics as rows, documents as columns.
dt_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1700,1701,1702,1703,1704,1705,1706,1707,1708,1709,1710,1711,1712,1713,1714,1715,1716,1717,1718,1719,1720,1721,1722,1723,1724,1725,1726,1727,1728,1729,1730,1731,1732,1733,1734,1735,1736,1737,1738,1739
T1,49.207,19.875,32.48,41.207,39.148,14.773,17.331,39.717,38.008,30.219,22.803,40.1,28.272,44.592,28.231,62.958,56.191,20.943,22.532,38.178,46.168,26.435,25.845,30.974,24.209,16.437,51.298,21.56,57.723,39.235,42.714,29.331,33.197,32.52,25.488,37.202,21.368,44.822,26.408,46.477,...,34.768,28.628,40.438,26.53,34.684,26.262,27.079,48.741,24.224,22.219,25.199,34.131,25.437,25.679,26.435,25.253,31.16,28.026,27.053,23.603,37.321,26.683,37.965,29.3,36.184,35.729,34.049,25.148,28.648,29.186,24.986,32.216,37.454,42.738,48.313,39.854,46.058,25.953,43.85,26.103
T2,5.732,6.621,-3.491,28.411,-7.606,1.322,-6.047,15.803,0.648,30.913,-1.964,16.541,24.387,9.328,-0.721,-14.629,-19.533,-5.107,3.671,12.68,-5.992,25.82,12.87,1.37,16.7,2.358,5.869,6.716,-0.892,-10.072,17.77,-2.075,8.359,30.041,-5.832,-2.052,2.552,6.649,-3.343,-8.457,...,-11.836,-10.822,-21.47,-13.144,1.289,-6.212,-6.517,-2.79,7.12,-5.562,-11.998,-11.416,-13.793,-7.744,-16.619,-14.658,-17.637,-10.626,-13.162,-9.169,39.284,3.296,-14.914,-16.829,-21.41,-4.67,-8.576,-4.536,-12.314,-9.614,16.211,-13.841,38.621,41.314,36.324,-25.001,-11.195,1.214,-31.794,-6.085
T3,42.819,6.901,-12.635,7.661,-13.323,5.769,-5.044,4.083,0.832,4.965,3.363,15.912,11.841,10.17,16.053,-37.653,14.272,-4.495,2.896,-14.701,-16.382,5.257,15.013,-15.393,11.138,5.168,24.591,7.3,-5.349,18.533,8.193,0.064,-6.2,15.089,3.284,-2.749,6.702,18.312,-10.793,12.215,...,-2.838,3.483,33.258,-8.551,-9.955,-18.248,-9.752,6.439,-7.312,-5.486,-2.793,-18.425,-9.863,16.584,-4.86,-6.139,17.812,-8.474,5.11,0.062,22.271,20.439,-11.746,-13.059,34.469,-24.388,-23.708,-0.678,-8.209,-18.183,20.085,-14.064,13.244,-5.136,24.935,-18.122,-45.472,-3.542,28.409,4.66
T4,5.16,-12.437,8.834,-28.256,-16.42,-4.901,-6.236,-13.368,-1.92,-2.253,-8.412,-11.555,-1.929,-0.979,15.456,-17.904,0.086,-9.058,-3.36,25.229,5.795,-4.165,-10.029,12.518,-6.44,-5.587,-13.522,1.774,-20.905,-5.095,-9.513,-7.14,0.36,12.833,-6.972,-2.12,-3.574,-13.303,-2.24,-1.709,...,-15.593,-8.2,22.846,-9.464,5.659,1.509,-3.635,-9.064,5.33,-2.055,-12.988,-5.781,-14.134,8.284,-20.882,-21.136,-4.458,-12.279,-8.108,-12.926,-19.32,-5.588,-16.644,-16.424,16.593,7.786,2.464,-9.012,-16.533,-1.926,-5.853,-13.535,-20.935,11.011,-3.209,-27.714,23.541,-5.38,2.059,-8.33
T5,22.56,0.486,-2.55,7.78,8.055,0.574,7.831,7.242,13.015,4.269,-6.386,14.478,-13.875,-7.441,-4.269,49.664,50.193,7.615,-3.325,-2.389,15.193,-7.91,6.769,-12.691,11.726,-1.485,25.487,7.323,40.715,-0.443,22.028,4.695,17.794,-8.494,6.618,15.4,1.33,8.986,22.351,-10.586,...,-15.64,-12.866,-15.625,-5.5,5.527,-0.972,-14.202,1.559,-4.47,-1.934,-7.743,-12.738,-12.728,-1.335,-24.384,-20.838,-8.551,-19.305,-15.171,-10.492,7.84,0.768,-8.073,-16.38,-1.06,-7.153,-28.593,-6.433,-7.078,-7.298,9.197,-11.809,11.182,-20.949,11.896,2.009,-20.47,-7.368,-24.891,-4.438
T6,-25.837,-12.345,-5.989,-24.211,-2.699,-10.863,4.683,-20.493,-5.612,27.1,-7.55,-28.463,28.047,20.87,3.513,35.128,12.312,3.771,-0.05,-35.468,8.457,18.177,-20.484,-13.965,-14.895,-5.476,-22.503,7.397,0.16,-12.373,-16.731,-8.585,-14.401,25.439,1.415,4.82,-8.309,-20.833,6.13,-8.04,...,4.404,-3.721,-0.151,3.976,8.203,1.392,-7.294,-8.859,-3.545,-2.638,0.812,-4.954,0.285,-2.257,-2.801,-0.415,-5.297,-2.089,-2.584,-4.244,-18.001,-15.709,9.262,3.473,6.211,5.102,-15.132,-4.033,1.994,0.723,-21.229,-1.106,-12.885,16.011,-13.509,13.687,-19.396,0.502,-0.492,-1.667
T7,5.353,2.338,1.101,32.674,7.759,-2.904,-6.652,6.068,-18.856,-4.608,-6.982,-7.744,7.428,-1.963,1.173,-6.262,-28.829,-8.912,-4.967,2.072,-15.607,6.338,-0.847,-5.678,-1.289,-1.637,-4.643,0.041,-33.546,-14.848,-13.66,-5.385,0.655,-1.665,-13.522,5.655,-4.756,2.279,11.448,-12.092,...,-11.088,2.631,0.971,12.962,-12.879,2.477,0.048,-2.994,3.806,1.308,-2.004,4.965,6.087,-6.449,4.637,-1.775,8.229,-3.489,1.311,-8.116,10.088,8.288,-0.301,15.72,2.442,20.456,7.893,-3.427,0.013,12.992,5.444,7.125,1.488,-0.636,10.004,9.551,15.697,0.39,6.574,-9.65
T8,-11.635,-9.726,-5.436,-6.19,-22.946,-0.08,-8.796,8.278,-7.554,-8.91,-4.243,-7.879,-9.524,-19.29,1.066,-53.335,2.247,-3.682,2.3,-6.602,0.191,-2.577,-8.675,3.065,-6.677,0.022,-10.776,0.758,1.034,3.379,-18.427,-1.433,-10.861,4.501,2.267,11.19,-4.402,-9.052,5.588,4.081,...,-4.322,12.862,-1.173,-1.531,0.676,-11.032,9.829,3.015,20.049,5.222,-8.105,-8.353,-2.722,8.033,4.473,8.488,-2.626,3.616,6.71,-1.275,-12.096,3.797,-13.161,-4.67,0.823,-14.25,5.101,10.635,-2.377,-7.423,-5.078,-7.722,-21.138,-17.673,2.019,-46.964,-16.684,0.343,-4.835,6.52
T9,3.665,-4.675,7.998,-6.82,9.314,0.54,1.673,12.494,13.34,-3.41,2.459,-3.752,-2.255,-2.187,-5.479,21.302,-18.139,-1.345,1.787,4.216,0.149,1.979,-7.986,2.538,-3.979,2.997,-24.903,-4.313,-18.733,-2.2,1.199,6.733,25.506,19.309,-3.886,12.227,1.095,-5.04,1.319,6.762,...,-5.32,-4.695,0.036,1.861,-3.797,-1.993,-3.041,-36.797,5.211,2.863,12.749,4.534,3.244,15.654,-6.723,-6.717,-5.246,5.495,-7.811,6.19,-18.549,-8.67,-3.197,-3.732,-6.278,-2.541,-12.837,8.427,8.174,-4.812,2.686,8.063,-14.983,-5.274,-14.095,28.005,-20.269,5.66,-8.082,1.863
T10,20.994,-0.788,0.395,-28.475,-3.678,4.218,4.589,-4.172,8.1,12.502,-0.122,1.614,8.033,27.037,-0.377,7.63,18.847,6.169,6.055,12.835,-14.281,3.387,-1.169,4.474,-4.839,-6.458,6.013,2.454,9.663,1.798,5.723,11.722,20.09,-0.569,-1.268,5.995,0.573,7.673,0.274,2.857,...,19.941,0.123,0.008,-2.462,-1.399,-15.318,2.555,6.862,-0.506,-0.363,-7.511,-10.342,-7.186,-6.867,7.517,-5.568,12.569,-7.532,2.295,-5.251,-10.455,-1.788,16.048,-3.061,3.458,5.946,14.441,-2.919,-9.138,-11.09,5.685,-7.892,-5.628,7.271,-20.156,5.385,8.319,8.089,0.428,-6.606


In [20]:
# Positional indices (into `papers` / `dt_df`) of the documents to inspect.
document_numbers = [1, 4, 10]

for document_number in document_numbers:
    # Rank topics by absolute score and keep the three strongest.
    scores = dt_df.iloc[document_number].values
    strongest = np.argsort(-np.absolute(scores))[:3]
    top_topics = dt_df.columns[strongest].tolist()

    print('Document #{}:'.format(document_number))
    print('Dominant Topics (top 3):', top_topics)
    print('Paper Summary:')
    print(papers[document_number][:500])
    print()

Document #1:
Dominant Topics (top 3): ['T1', 'T4', 'T6']
Paper Summary:
1 
CONNECTIVITY VERSUS ENTROPY 
Yaser S. Abu-Mostafa 
California Institute of Technology 
Pasadena, CA 91125 
ABSTRACT 
How does the connectivity of a neural network (number of synapses per 
neuron) relate to the complexity of the problems it can handle (measured by 
the entropy)? Switching theory would suggest no relation at all, since all Boolean 
functions can be implemented using a circuit with very low connectivity (e.g., 
using two-input NAND gates). However, for a network that learns a pr

Document #4:
Dominant Topics (top 3): ['T1', 'T8', 'T19']
Paper Summary:
174 
A Neural Network C1A-sifier Based on Coding Theory 
Tzi-Dar Chiueh and Rodney Goodman 
California Institute of Technology, Pasadena, California 91125 
ABSTRACT
The new neural network classifier we propose transforms the 
classification problem into the coding theory problem of decoding a noisy 
codeword. An input vector in the feature space is tr

# Topic Models with Latent Dirichlet Allocation (LDA)

In [26]:
%%time
from sklearn.decomposition import LatentDirichletAllocation

# Fit LDA on the same count features used for LSI and get the document-topic matrix.
# NOTE(review): batch_size=1740 equals the number of documents in cv_features, so
# each 'online' update sees the whole corpus — confirm this is intended.
# NOTE(review): n_jobs=16 is hard-coded; the recorded wall time is far larger than
# the CPU time, which may indicate more workers than available cores — confirm.
lda_model = LatentDirichletAllocation(n_components =TOTAL_TOPICS, max_iter=500, max_doc_update_iter=50,
                                      learning_method='online', batch_size=1740, learning_offset=50., 
                                      random_state=42, n_jobs=16)
document_topics = lda_model.fit_transform(cv_features)

CPU times: user 1min 19s, sys: 15.9 s, total: 1min 35s
Wall time: 20min 41s


CPU times: user 13min 14s, sys: 1min 41s, total: 14min 56s
Wall time: 55min 32s

In [None]:
# Topic-term weight matrix learned by the LDA model.
topic_terms = lda_model.components_

In [None]:
# Indices of the `top_terms` highest-weighted vocabulary entries for each topic
# (`top_terms` is set in the LSI section of this notebook).
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms] #20
topic_keyterms = vocabulary[topic_key_term_idxs]
# One comma-separated string of key terms per topic.
topics = [', '.join(topic) for topic in topic_keyterms]
# None means "never truncate cell text"; the former sentinel -1 has been
# deprecated since pandas 1.0 in favour of None.
pd.set_option('display.max_colwidth', None)
topics_df = pd.DataFrame(topics,
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, TOTAL_TOPICS+1)])
topics_df

Unnamed: 0,Terms per Topic
Topic1,"neuron, circuit, analog, chip, current, voltage, signal, threshold, bit, noise, vlsi, implementation, channel, gate, pulse, processor, element, synapse, parallel, fig"
Topic2,"image, feature, structure, state, layer, neuron, distribution, local, cell, recognition, node, motion, matrix, net, sequence, object, gaussian, hidden, size, line"
Topic3,"neuron, cell, image, class, state, response, rule, feature, rate, probability, representation, hidden, dynamic, et al, frequency, spike, distribution, component, level, recognition"
Topic4,"cell, neuron, response, visual, stimulus, activity, field, spike, motion, synaptic, direction, frequency, signal, cortex, firing, orientation, spatial, eye, rate, map"
Topic5,"image, feature, recognition, layer, hidden, task, object, speech, trained, representation, test, net, classification, classifier, class, level, architecture, experiment, node, rule"
Topic6,"state, dynamic, rule, matrix, recurrent, equation, gradient, net, signal, fixed, sequence, node, source, attractor, hidden, structure, step, fixed point, component, activation"
Topic7,"sequence, chain, region, structure, markov, protein, prediction, hmms, markov model, hidden markov, site, hidden, gene, class, receptor, length, human, distance, mouse, bengio"
Topic8,"memory, word, context, similarity, item, recall, probability, phoneme, short, representation, association, activation, list, serial, short term, address, term memory, store, proximity, phone"
Topic9,"activation, motor, behavior, winner, take, winner take, competitive, active, command, connection, movement, sensory, feedback, wta, net, sensor, body, activation function, level, self"
Topic10,"state, cell, distribution, neuron, probability, control, response, signal, task, rate, layer, architecture, random, hidden, test, image, change, fig, generalization, field"


In [None]:
# Show floats with three decimals and thousands separators.
pd.set_option('display.float_format', '{:,.3f}'.format)
# Document-topic matrix as a frame with columns T1..T<TOTAL_TOPICS>.
topic_labels = ['T{}'.format(i + 1) for i in range(TOTAL_TOPICS)]
dt_df = pd.DataFrame(document_topics, columns=topic_labels)
# Transposed view: topics as rows, documents as columns.
dt_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1730,1731,1732,1733,1734,1735,1736,1737,1738,1739
T1,0.011,0.137,0.017,0.0,0.219,0.034,0.477,0.218,0.12,0.063,...,0.0,0.0,0.0,0.0,0.0,0.0,0.115,0.0,0.0,0.482
T2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T4,0.028,0.0,0.663,0.773,0.262,0.802,0.03,0.0,0.786,0.269,...,0.0,0.107,0.0,0.0,0.0,0.999,0.0,0.0,0.13,0.518
T5,0.227,0.035,0.045,0.212,0.024,0.086,0.086,0.37,0.089,0.402,...,0.21,0.591,0.043,0.007,0.019,0.0,0.431,0.466,0.0,0.0
T6,0.446,0.579,0.0,0.0,0.238,0.022,0.102,0.022,0.0,0.036,...,0.0,0.094,0.0,0.0,0.33,0.0,0.162,0.0,0.0,0.0
T7,0.0,0.0,0.026,0.015,0.008,0.0,0.0,0.0,0.005,0.0,...,0.0,0.0,0.0,0.003,0.002,0.0,0.0,0.0,0.003,0.0
T8,0.062,0.0,0.0,0.0,0.0,0.017,0.041,0.013,0.0,0.0,...,0.04,0.0,0.0,0.0,0.0,0.0,0.014,0.0,0.0,0.0
T9,0.0,0.025,0.113,0.0,0.0,0.021,0.116,0.0,0.0,0.0,...,0.003,0.0,0.0,0.0,0.0,0.0,0.015,0.0,0.0,0.0
T10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
pd.set_option('display.float_format', '{:,.5f}'.format)
pd.set_option('display.max_colwidth', 200)

# For every topic: the highest score it reaches in any document ...
max_contrib_topics = dt_df.max(axis=0)
dominant_topics = max_contrib_topics.index
contrib_perc = max_contrib_topics.values
# ... and the first document in which that score occurs.
document_numbers = dt_df.idxmax(axis=0).tolist()
documents = [papers[doc_id] for doc_id in document_numbers]

# Rows are indexed by the topic labels of topics_df ('Topic1', 'Topic2', ...).
summary_columns = {
    'Dominant Topic': dominant_topics,
    'Contribution %': contrib_perc,
    'Paper Num': document_numbers,
    'Topic': topics_df['Terms per Topic'],
    'Paper Name': documents,
}
results_df = pd.DataFrame(summary_columns)
results_df

Unnamed: 0,Dominant Topic,Contribution %,Paper Num,Topic,Paper Name
Topic1,T1,0.99938,1122,"neuron, circuit, analog, chip, current, voltage, signal, threshold, bit, noise, vlsi, implementation, channel, gate, pulse, processor, element, synapse, parallel, fig","Improved Silicon Cochlea \nusing \nCompatible Lateral Bipolar Transistors \nAndr6 van Schaik, Eric Fragnire, Eric Vittoz \nMANTRA Center for Neuromimetic Systems \nSwiss Federal Institute of Tech..."
Topic2,T2,0.00033,151,"image, feature, structure, state, layer, neuron, distribution, local, cell, recognition, node, motion, matrix, net, sequence, object, gaussian, hidden, size, line",794 \nNEURAL ARCHITECTURE \nValentino Braitenberg \nMax Planck Institute \nFederal Republic of Germany \nABSTRACT\nWhile we are waiting for the ultimate biophysics of cell membranes and synapses \...
Topic3,T3,0.00033,151,"neuron, cell, image, class, state, response, rule, feature, rate, probability, representation, hidden, dynamic, et al, frequency, spike, distribution, component, level, recognition",794 \nNEURAL ARCHITECTURE \nValentino Braitenberg \nMax Planck Institute \nFederal Republic of Germany \nABSTRACT\nWhile we are waiting for the ultimate biophysics of cell membranes and synapses \...
Topic4,T4,0.99947,1735,"cell, neuron, response, visual, stimulus, activity, field, spike, motion, synaptic, direction, frequency, signal, cortex, firing, orientation, spatial, eye, rate, map",Can V1 mechanisms account for \nfigure-ground and medial axis effects? \nZhaoping Li \nGatsby Computational Neuroscience Unit \nUniversity College London \nzhaopinggat shy. ucl. ac. uk \nAbstract...
Topic5,T5,0.99949,177,"image, feature, recognition, layer, hidden, task, object, speech, trained, representation, test, net, classification, classifier, class, level, architecture, experiment, node, rule","215 \nConsonant Recognition by Modular Construction of \nLarge Phonemic Time-Delay Neural Networks \nAlex Waibel \nCarnegie-Mellon University \nPittsburgh, PA 15213, \nATR Interpreting Telephony R..."
Topic6,T6,0.99684,1128,"state, dynamic, rule, matrix, recurrent, equation, gradient, net, signal, fixed, sequence, node, source, attractor, hidden, structure, step, fixed point, component, activation","Finite State Automata that Recurrent \nCascade-Correlation Cannot Represent \nStefan C. Kremer \nDepartment of Computing Science \nUniversity of Alberta \nEdmonton, Alberta, CANADA T6H 5B5 \nAbstr..."
Topic7,T7,0.99956,283,"sequence, chain, region, structure, markov, protein, prediction, hmms, markov model, hidden markov, site, hidden, gene, class, receptor, length, human, distance, mouse, bengio","A Neural Network to Detect \nHomologies in Proteins \nYoshua Bengio \nSchool of Computer Science \nMcGill University \nMontreal, Canada H3A 2A7 \nSamy Bengio \nDepartement d'Informatique \nUnivers..."
Topic8,T8,0.98167,892,"memory, word, context, similarity, item, recall, probability, phoneme, short, representation, association, activation, list, serial, short term, address, term memory, store, proximity, phone","A solvable connectionist model of \nimmediate recall of ordered lists \nNell Burgess \nDepartment of Anatomy, University College London \nLondon WCiE 6BT, England \n(e-mail: n .burgessucl. ac. uk..."
Topic9,T9,0.99929,227,"activation, motor, behavior, winner, take, winner take, competitive, active, command, connection, movement, sensory, feedback, wta, net, sensor, body, activation function, level, self","44 Beer and Chiei \nNeural \nImplementation of Motivated Behavior: \nFeeding in an Artificial Insect \nRandall D. Beer t,2 and Hillel J. Chiel 2 \nDepartments of t Computer Engineering and Science..."
Topic10,T10,0.00033,151,"state, cell, distribution, neuron, probability, control, response, signal, task, rate, layer, architecture, random, hidden, test, image, change, fig, generalization, field",794 \nNEURAL ARCHITECTURE \nValentino Braitenberg \nMax Planck Institute \nFederal Republic of Germany \nABSTRACT\nWhile we are waiting for the ultimate biophysics of cell membranes and synapses \...


# Topic Models with Non-Negative Matrix Factorization (NMF)

In [None]:
%%time
from sklearn.decomposition import NMF

# Factorize the count matrix with NMF using the coordinate-descent solver and a
# regularization term that is mostly L1 (l1_ratio=.85).
# NOTE(review): the `alpha` keyword was deprecated in scikit-learn 1.0 (replaced by
# alpha_W / alpha_H, which are scaled differently) — confirm the installed version
# still accepts it before re-running.
nmf_model = NMF(n_components=TOTAL_TOPICS, solver='cd', max_iter=500,
                random_state=42, alpha=.1, l1_ratio=.85)
# Document-topic matrix of the fitted factorization.
document_topics = nmf_model.fit_transform(cv_features)

CPU times: user 11min 39s, sys: 47.5 s, total: 12min 26s
Wall time: 46.7 s


In [None]:
# Topic-term weight matrix learned by the NMF model.
topic_terms = nmf_model.components_
# Indices of the `top_terms` highest-weighted vocabulary entries for each topic
# (`top_terms` is set in the LSI section of this notebook).
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
topic_keyterms = vocabulary[topic_key_term_idxs]
# One comma-separated string of key terms per topic.
topics = [', '.join(topic) for topic in topic_keyterms]
# None means "never truncate cell text"; the former sentinel -1 has been
# deprecated since pandas 1.0 in favour of None.
pd.set_option('display.max_colwidth', None)
topics_df = pd.DataFrame(topics,
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, TOTAL_TOPICS+1)])
topics_df

Unnamed: 0,Terms per Topic
Topic1,"bound, generalization, size, let, optimal, solution, theorem, equation, approximation, class, gradient, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum"
Topic2,"neuron, synaptic, connection, potential, dynamic, synapsis, activity, excitatory, layer, synapse, simulation, inhibitory, delay, biological, equation, state, et, et al, activation, firing"
Topic3,"state, action, policy, step, optimal, reinforcement, transition, reinforcement learning, probability, reward, dynamic, value function, markov, machine, task, agent, finite, iteration, sequence, decision"
Topic4,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, region, visual, representation, transformation, surface"
Topic5,"hidden, layer, net, hidden unit, task, hidden layer, architecture, back, propagation, trained, connection, back propagation, activation, representation, generalization, output unit, neural net, training set, learn, test"
Topic6,"cell, firing, direction, head, rat, response, layer, synaptic, activity, spatial, inhibitory, synapsis, ii, cue, cortex, simulation, lot, active, complex, property"
Topic7,"word, recognition, speech, context, hmm, speaker, speech recognition, character, phoneme, probability, frame, sequence, rate, test, level, acoustic, experiment, letter, segmentation, state"
Topic8,"signal, noise, source, filter, component, frequency, channel, speech, matrix, independent, separation, sound, ica, phase, eeg, blind, auditory, dynamic, delay, fig"
Topic9,"control, controller, trajectory, motor, dynamic, movement, forward, task, feedback, arm, inverse, position, robot, architecture, hand, force, adaptive, change, command, plant"
Topic10,"circuit, chip, current, analog, voltage, vlsi, gate, threshold, transistor, pulse, design, implementation, synapse, bit, digital, device, analog vlsi, element, cmos, pp"


In [None]:
# Show floats with three decimals and thousands separators.
pd.set_option('display.float_format', '{:,.3f}'.format)
# Document-topic matrix as a frame with columns T1..T<TOTAL_TOPICS>.
topic_labels = ['T{}'.format(i + 1) for i in range(TOTAL_TOPICS)]
dt_df = pd.DataFrame(document_topics, columns=topic_labels)
dt_df.head(10)

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16,T17,T18,T19,T20
0,0.444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004,0.263,0.0,0.0,0.0,0.0,0.0,3.437
1,0.394,0.595,0.463,0.019,0.187,0.037,0.0,0.228,0.13,0.029,0.0,0.254,0.0,0.0,0.0,0.0,0.0,0.106,0.0,0.21
2,0.032,0.619,0.003,0.067,0.016,0.378,0.029,0.027,0.448,0.0,0.075,0.036,0.024,0.184,0.1,0.0,0.126,0.0,0.656,0.277
3,0.0,0.274,0.0,0.102,0.265,1.019,0.0,0.0,0.0,0.0,0.0,0.0,0.218,0.011,0.0,0.004,1.299,0.291,1.268,0.295
4,0.06,0.188,0.682,0.257,0.167,1.402,0.0,0.093,0.0,0.001,0.0,0.02,1.749,0.037,0.0,0.344,0.0,0.0,0.164,0.121
5,0.0,0.383,0.0,0.0,0.679,7.51,0.016,0.0,0.0,0.326,1.146,1.923,0.098,0.0,0.0,0.202,0.0,0.426,0.646,0.641
6,0.0,1.415,0.02,0.0,0.046,0.044,0.0,0.114,0.333,0.04,0.0,0.032,0.124,0.0,0.041,0.041,0.075,0.03,0.0,0.615
7,0.147,0.029,0.0,0.0,0.274,0.008,0.042,0.0,0.045,0.08,0.008,0.025,0.022,0.009,0.0,0.023,0.0,0.007,0.0,0.096
8,0.084,1.76,0.013,0.012,0.0,1.592,0.0,0.0,0.257,0.068,0.273,0.055,0.122,0.0,0.119,0.0,0.0,0.027,0.514,0.353
9,0.395,0.0,0.04,1.258,0.127,0.0,0.0,0.37,0.075,0.076,0.0,0.042,0.0,0.017,0.0,0.053,0.041,0.133,0.427,0.0


In [None]:
pd.set_option('display.float_format', '{:,.5f}'.format)
pd.set_option('display.max_colwidth', 200)

# For every topic: the highest score it reaches in any document ...
max_score_topics = dt_df.max(axis=0)
dominant_topics = max_score_topics.index
term_score = max_score_topics.values
# ... and the first document in which that score occurs.
document_numbers = dt_df.idxmax(axis=0).tolist()
documents = [papers[doc_id] for doc_id in document_numbers]

# Rows are indexed by the topic labels of topics_df ('Topic1', 'Topic2', ...).
summary_columns = {
    'Dominant Topic': dominant_topics,
    'Max Score': term_score,
    'Paper Num': document_numbers,
    'Topic': topics_df['Terms per Topic'],
    'Paper Name': documents,
}
results_df = pd.DataFrame(summary_columns)
results_df

Unnamed: 0,Dominant Topic,Max Score,Paper Num,Topic,Paper Name
Topic1,T1,1.64138,991,"bound, generalization, size, let, optimal, solution, theorem, equation, approximation, class, gradient, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum","A Bound on the Error of Cross Validation Using \nthe Approximation and Estimation Rates, with \nConsequences for the Training-Test Split \nMichael Kearns \nAT&T Research \nABSTRACT\n1 INTRODUCTION..."
Topic2,T2,3.58149,383,"neuron, synaptic, connection, potential, dynamic, synapsis, activity, excitatory, layer, synapse, simulation, inhibitory, delay, biological, equation, state, et, et al, activation, firing","Signal Processing by Multiplexing and \nDemultiplexing in Neurons \nDavid C. Tam \nDivision of Neuroscience \nBaylor College of Medicine \nHouston, TX 77030 \ndtamCnext-cns.neusc.bcm.tmc.edu \nAb..."
Topic3,T3,5.83072,1167,"state, action, policy, step, optimal, reinforcement, transition, reinforcement learning, probability, reward, dynamic, value function, markov, machine, task, agent, finite, iteration, sequence, de...","Reinforcement Learning for Mixed \nOpen-loop and Closed-loop Control \nEric A. Hansen, Andrew G. Barto, and Shlomo Zilbersteln \nDepartment of Computer Science \nUniversity of Massachusetts \nAmhe..."
Topic4,T4,3.93349,1731,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, region, visual, representation, transformation, surface",Image representations for facial expression \ncoding \nMarian Stewart Bartlett* \nU.C. San Diego \nmarnisalk. edu \nJavier R. Movellan \nU.C. San Diego \nmovellancogsc. ucsd. edu \nPaul Ekman \n...
Topic5,T5,2.9875,33,"hidden, layer, net, hidden unit, task, hidden layer, architecture, back, propagation, trained, connection, back propagation, activation, representation, generalization, output unit, neural net, tr...","5O5 \nCONNECTING TO THE PAST \nBruce A. MacDonald, Assistant Professor \nKnowledge Sciences Laboratory, Computer Science Department \nThe University of Calgary, 2500 University Drive NW \nCalgary,..."
Topic6,T6,7.51003,5,"cell, firing, direction, head, rat, response, layer, synaptic, activity, spatial, inhibitory, synapsis, ii, cue, cortex, simulation, lot, active, complex, property","317 \nPARTITIONING OF SENSORY DATA BY A COPTICAI, NETWOPK  \nRichard Granger, Jos Ambros-Ingerson, Howard Henry, Gary Lynch \nCenter for the Neurobiology of Learning and Memory \nUniversity of..."
Topic7,T7,4.89525,1318,"word, recognition, speech, context, hmm, speaker, speech recognition, character, phoneme, probability, frame, sequence, rate, test, level, acoustic, experiment, letter, segmentation, state","Comparison of Human and Machine Word \nRecognition \nM. Schenkel \nDept of Electrical Eng. \nUniversity of Sydney \nSydney, NSW 2006, Australia \nschenkel@sedal.usyd.edu.au \nC. Latimer \nDept of ..."
Topic8,T8,3.67982,235,"signal, noise, source, filter, component, frequency, channel, speech, matrix, independent, separation, sound, ica, phase, eeg, blind, auditory, dynamic, delay, fig","232 Sejnowski, Yuhas, Goldstein and Jenkins \nCombining Visual and \nwith a Neural Network \nAcoustic Speech Signals \nImproves Intelligibility \nT.J. Sejnowski \nThe Salk Institute \nand \nDepart..."
Topic9,T9,4.88831,948,"control, controller, trajectory, motor, dynamic, movement, forward, task, feedback, arm, inverse, position, robot, architecture, hand, force, adaptive, change, command, plant","An Integrated Architecture of Adaptive Neural Network \nControl for Dynamic Systems \nLiu Ke '2 Robert L. Tokaf Brian D.McVey z \nCenter for Nonlinear Studies, 2Applied Theoretical Physics Divis..."
Topic10,T10,2.95973,1690,"circuit, chip, current, analog, voltage, vlsi, gate, threshold, transistor, pulse, design, implementation, synapse, bit, digital, device, analog vlsi, element, cmos, pp","Kirchoff Law Markov Fields for Analog \nCircuit Design \nRichard M. Golden * \nRMG Consulting Inc. \n2000 Fresno Road, Plano, Texas 75074 \nRMG CONS UL T@A OL. COM, \nwww. neural-network. corn \nA..."


# Predicting Topics for New Research Papers

In [None]:
import glob

# Papers manually downloaded from NIPS 2016 (NIPS 29):
# https://papers.nips.cc/book/advances-in-neural-information-processing-systems-29-2016

# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps the
# paper numbering used by the following cells stable across machines and re-runs.
new_paper_files = sorted(glob.glob('./test_data/nips16*.txt'))
new_papers = []
for fn in new_paper_files:
    # The files are only read, so open them read-only ('r+' would also demand write
    # permission). Bytes that are not valid UTF-8 are silently dropped.
    with open(fn, encoding='utf-8', errors='ignore', mode='r') as f:
        data = f.read()
    new_papers.append(data)

print('Total New Papers:', len(new_papers))

Total New Papers: 4


In [None]:
# Clean the raw paper texts with normalize_corpus (defined outside this section).
# NOTE(review): presumably the same normalization applied to the training corpus - confirm.
norm_new_papers = normalize_corpus(new_papers)
# Only transform() is called, so `cv` must already be fitted; the new papers are
# encoded against its existing vocabulary rather than a newly learned one.
cv_new_features = cv.transform(norm_new_papers)
# Last expression of the cell: displays the shape of the resulting feature matrix.
cv_new_features.shape

(4, 14408)

In [None]:
# Score every new paper against the topics via nmf_model.transform.
topic_predictions = nmf_model.transform(cv_new_features)

# For each paper keep its two highest-scoring topics as (0-based topic index, score)
# pairs, strongest first, with scores rounded to three decimals.
best_topics = []
for paper_scores in topic_predictions:
    ranked = sorted(enumerate(paper_scores), key=lambda pair: -pair[1])
    best_topics.append([(topic_idx, round(score, 3))
                        for topic_idx, score in ranked[:2]])
best_topics

[[(0, 1.312), (7, 0.966)],
 [(2, 4.121), (0, 0.864)],
 [(3, 2.154), (1, 1.335)],
 [(3, 3.074), (6, 2.19)]]

In [None]:
# Flatten best_topics into one row per (paper, dominant topic) pair.
paper_numbers = []
dominant_topics = []
topic_scores = []
for paper_no, ranked_topics in enumerate(best_topics, start=1):
    for topic_idx, score in ranked_topics:
        paper_numbers.append(paper_no)
        dominant_topics.append(topic_idx + 1)        # report topics 1-based
        topic_scores.append(round(score * 100, 2))   # score scaled by 100

# Rows are indexed by 1-based paper number; a paper appears once per dominant topic.
results_df = pd.DataFrame(
    {'Dominant Topics': dominant_topics, 'Topic Score': topic_scores},
    index=pd.Index(paper_numbers, name='Papers'),
)

# topic_no - 1 maps the 1-based topic number back to its row position in topics_df.
results_df['Topic Desc'] = [
    topics_df.iloc[topic_no - 1]['Terms per Topic']
    for topic_no in results_df['Dominant Topics'].values
]
# First 200 characters of the paper text serve as a short description.
results_df['Paper Desc'] = [
    new_papers[paper_no - 1][:200]
    for paper_no in results_df.index.values
]

results_df

Unnamed: 0_level_0,Dominant Topics,Topic Score,Topic Desc,Paper Desc
Papers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,131.2,"bound, generalization, size, let, optimal, solution, theorem, equation, approximation, class, gradient, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum","Correlated-PCA: Principal Components’ Analysis\nwhen Data and Noise are Correlated\nNamrata Vaswani and Han Guo\nIowa State University, Ames, IA, USA\nEmail: {namrata,hanguo}@iastate.edu\nAbstract..."
1,8,96.6,"signal, noise, source, filter, component, frequency, channel, speech, matrix, independent, separation, sound, ica, phase, eeg, blind, auditory, dynamic, delay, fig","Correlated-PCA: Principal Components’ Analysis\nwhen Data and Noise are Correlated\nNamrata Vaswani and Han Guo\nIowa State University, Ames, IA, USA\nEmail: {namrata,hanguo}@iastate.edu\nAbstract..."
2,3,412.1,"state, action, policy, step, optimal, reinforcement, transition, reinforcement learning, probability, reward, dynamic, value function, markov, machine, task, agent, finite, iteration, sequence, de...","PAC Reinforcement Learning with Rich Observations\nAkshay Krishnamurthy\nUniversity of Massachusetts, Amherst\nAmherst, MA, 01003\nakshay@cs.umass.edu\nAlekh Agarwal\nMicrosoft Research\nNew York,..."
2,1,86.4,"bound, generalization, size, let, optimal, solution, theorem, equation, approximation, class, gradient, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum","PAC Reinforcement Learning with Rich Observations\nAkshay Krishnamurthy\nUniversity of Massachusetts, Amherst\nAmherst, MA, 01003\nakshay@cs.umass.edu\nAlekh Agarwal\nMicrosoft Research\nNew York,..."
3,4,215.4,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, region, visual, representation, transformation, surface","Automated scalable segmentation of neurons from\nmultispectral images\nUygar Sümbül\nGrossman Center for the Statistics of Mind\nand Dept. of Statistics, Columbia University\nDouglas Roossien Jr.\..."
3,2,133.5,"neuron, synaptic, connection, potential, dynamic, synapsis, activity, excitatory, layer, synapse, simulation, inhibitory, delay, biological, equation, state, et, et al, activation, firing","Automated scalable segmentation of neurons from\nmultispectral images\nUygar Sümbül\nGrossman Center for the Statistics of Mind\nand Dept. of Statistics, Columbia University\nDouglas Roossien Jr.\..."
4,4,307.4,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, region, visual, representation, transformation, surface","Unsupervised Learning of Spoken Language with\nVisual Context\nDavid Harwath, Antonio Torralba, and James R. Glass\nComputer Science and Artificial Intelligence Laboratory\nMassachusetts Institute..."
4,7,219.0,"word, recognition, speech, context, hmm, speaker, speech recognition, character, phoneme, probability, frame, sequence, rate, test, level, acoustic, experiment, letter, segmentation, state","Unsupervised Learning of Spoken Language with\nVisual Context\nDavid Harwath, Antonio Torralba, and James R. Glass\nComputer Science and Artificial Intelligence Laboratory\nMassachusetts Institute..."


# Persisting Model and Transformers

### This is only needed to visualize the topics in a separate notebook (pyLDAvis output greatly increases the notebook size)

In [None]:
import dill

# Serialize the topic model, the training feature matrix and the vectorizer,
# each to its own pickle file in the current working directory.
artifacts = (
    ('nmf_model.pkl', nmf_model),
    ('cv_features.pkl', cv_features),
    ('cv.pkl', cv),
)
for pkl_name, artifact in artifacts:
    with open(pkl_name, 'wb') as f:
        dill.dump(artifact, f)