In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/nips-papers-1987-2019-updated/papers.csv
/kaggle/input/nips-papers-1987-2019-updated/authors.csv


Import all the necessary libraries.

In [80]:
#import packages
import re
import numpy as np
import pandas as pd
from pprint import pprint

#import nltk for stopwords and english words
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
words = set(nltk.corpus.words.words())

# Gensim packages
import gensim
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

Let's load the data file and have a loot at the structure of it.

In [17]:
nips_papers=pd.read_csv('../input/nips-papers-1987-2019-updated/papers.csv')
nips_papers.head(5)

Unnamed: 0,source_id,year,title,abstract,full_text
0,27,1987,Bit-Serial Neural Networks,,573 \n\nBIT - SERIAL NEURAL NETWORKS \n\nAlan...
1,63,1987,Connectivity Versus Entropy,,1 \n\nCONNECTIVITY VERSUS ENTROPY \n\nYaser S...
2,60,1987,The Hopfield Model with Multi-Level Neurons,,278 \n\nTHE HOPFIELD MODEL WITH MUL TI-LEVEL N...
3,59,1987,How Neural Nets Work,,442 \n\nAlan Lapedes \nRobert Farber \n\nThe...
4,69,1987,Spatial Organization of Neural Networks: A Pro...,,740 \n\nSPATIAL ORGANIZATION OF NEURAL NEn...


We want to see how many papers are in the data...

In [21]:
nips_papers.shape

(9680, 5)

and how the documents look.

In [22]:
text=nips_papers.full_text.values.tolist()
text[1]

'1 \n\nCONNECTIVITY VERSUS ENTROPY \n\nYaser  S.  Abu-Mostafa \n\nCalifornia  Institute  of Technology \n\nPasadena, CA 91125 \n\nABSTRACT \n\nHow  does  the  connectivity  of a  neural  network  (number  of synapses  per \nneuron)  relate  to  the complexity  of the  problems  it  can  handle  (measured  by \nthe entropy)?  Switching theory would suggest no relation at all, since all Boolean \nfunctions  can be  implemented  using  a  circuit  with very  low  connectivity  (e.g., \nusing  two-input  NAND  gates).  However,  for  a  network  that  learns  a  problem \nfrom  examples  using  a  local  learning  rule,  we  prove  that  the  entropy  of  the \nproblem becomes  a  lower  bound for  the connectivity of the network. \n\nINTRODUCTION \n\nThe most  distinguishing feature of neural networks  is  their  ability to spon(cid:173)\n\ntaneously  learn  the  desired  function  from  \'training\' samples,  i.e.,  their  ability \nto  program themselves.  Clearly,  a  given  neural  ne

Cleaning the documents is necessary as they contain a lot of symbols, mathematical language, abreviations and other words that will not give any useful information to the model.

In [23]:
#clean the text using 'simple_process()'. tokenization, pancuation removal, remove unnecessary characters
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

text_words = list(sent_to_words(text))

Bigrams are a good method to make the terms more compact and give meaning to terms that can't stand on their own.

In [24]:
#Build the bigram
bigram = gensim.models.Phrases(text_words, min_count=5, threshold=100) # higher threshold fewer phrases.

#Faster way to get a sentence clubbed as a bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)

Let's define the final text cleaning functions to use...

In [28]:
# Define functions for stopwords, bigrams, lemmatization and remove non english words
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'VERB', 'ADV', 'ADJ']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def remove_non_english(texts):
        return [[word for word in doc if word.lower() in words] for doc in texts]

and run them!

In [30]:
%%time

# Remove Stop Words
data_words_nostops = remove_stopwords(text_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'VERB', 'ADJ'])

#Remove non english words
english_text=remove_non_english(data_lemmatized)

# Form Bigrams
data_words_bigrams = make_bigrams(english_text)

CPU times: user 14min 45s, sys: 4.23 s, total: 14min 49s
Wall time: 14min 49s


This is the final form of a document inside our corpus. Looks more neat, without strange symbols and meaningful words.

In [31]:
print(data_words_bigrams[1])

['abstract', 'connectivity', 'neural', 'network', 'number', 'synapsis', 'neuron', 'relate', 'complexity', 'problem', 'handle', 'measure', 'entropy', 'switching', 'theory', 'would', 'suggest', 'relation', 'function', 'implement', 'use', 'circuit', 'low', 'connectivity', 'use', 'input', 'gate', 'network', 'learn', 'problem', 'example', 'use', 'local', 'learning', 'rule', 'prove', 'entropy', 'problem', 'become', 'low', 'bind', 'connectivity', 'network', 'introduction', 'distinguish', 'feature', 'neural', 'network', 'ability', 'learn', 'desire', 'function', 'training', 'sample', 'ability', 'program', 'give', 'neural', 'network', 'can', 'learn', 'function', 'must', 'restriction', 'network', 'learn', 'function', 'obvious', 'restriction', 'independent', 'learning', 'aspect', 'network', 'must', 'big', 'accommodate', 'circuit', 'complex', 'function', 'simulate', 'restriction', 'arise', 'fact', 'network', 'expect', 'learn', 'function', 'design', 'function', 'paper', 'report', 'restriction', 'kin

We need to have an idea of how many unique words are included in the corpus and how frequent each one is.

In [32]:
#Build term frequency dictionary
freq_dict={}
for text in data_words_bigrams:
    for word in text:
        if word in freq_dict:
            freq_dict[word]+=1
        else:
            freq_dict[word]=1
            
print("There are",len(freq_dict),"unique words used in the whole set of papers")

#Frequency of each word
for key, value in sorted(freq_dict.items(), key=lambda item: item[1]):
    print("%s: %s" % (key, value))

There are 23094 unique words used in the whole set of papers
derate: 1
buildable: 1
ouster: 1
udal: 1
conglomeration: 1
bawn: 1
wort: 1
coffin: 1
winder: 1
peter: 1
slake: 1
batrachian: 1
anamorphic: 1
unscored: 1
tume: 1
overhaul: 1
bothersome: 1
vacate: 1
olid: 1
foundry: 1
serially: 1
neutralization: 1
operable: 1
sciatic: 1
pathobiology: 1
ballast: 1
cermet: 1
semiconducting: 1
lumination: 1
gustation: 1
hypaxial: 1
pharyngeal: 1
asor: 1
seel: 1
thermoplastic: 1
auspice: 1
recordable: 1
figurer: 1
tige: 1
cask: 1
apperception: 1
yep: 1
rotatory: 1
tonus: 1
wiser: 1
ampere: 1
thermionic: 1
indium: 1
coon: 1
schule: 1
mown: 1
shallop: 1
laryngoscope: 1
almighty: 1
slitting: 1
flourishingly: 1
handcart: 1
admittable: 1
hasten: 1
concomitance: 1
architectonic: 1
yeat: 1
oblong: 1
mudd: 1
hoise: 1
miracle: 1
seriality: 1
meshwork: 1
centrifugal: 1
lynch: 1
amygdaloid: 1
steepen: 1
naturally: 1
nonessential: 1
abed: 1
colleen: 1
toff: 1
undiminished: 1
postpositional: 1
irritate: 1
ologi

We decide to remove the least frequent ones and keep about 25% of the total terms. That percentage should give the most important information.

In [67]:
#Build function to remove low frequency terms
def remove_low_freq_terms(texts, f):
    return [[word for word in doc if freq_dict[word]>f] for doc in texts]

texts_final=remove_low_freq_terms(data_words_bigrams, 25)

The next step is to finalize the corpus, build the dictionary of the corpus and convert it to Bag Of Words format.

In [70]:
%%time

# Create Corpus
texts=texts_final

# Create Dictionary
id2word = Dictionary(texts)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

CPU times: user 15.9 s, sys: 431 ms, total: 16.3 s
Wall time: 16.3 s


Now we're ready to implement the LDA model through a function that will test the performance of the model for different number of topics.

In [75]:
def compute_coherence_values(dictionary, corpus, texts, topic_count):
    """
    Compute perplexity and c_v coherence scores for various LDA models

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    topic_count : Number of topics

    Returns:
    -------
    model_list : List of LDA topic models
    perplexity_values : Perplexity scores corresponding to the LDA model with respective number of topics and passes
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics and passes
    """
    perplexity_values = []
    coherence_values = []
    model_list = []
    for num_topics in topic_count:            
                model=gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                      num_topics=num_topics,
                                                      id2word=id2word,
                                                      chunksize=300,
                                                      random_state=100,
                                                      update_every=1,
                                                      alpha='auto',
                                                      eta='auto',
                                                      passes=40,
                                                      per_word_topics=True)
                model_list.append(model)
                # Compute Perplexity
                perplexity_score = model.log_perplexity(corpus)
                perplexity_values.append(perplexity_score)  # a measure of how good the model is. lower the better.
                coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
                coherence_score=coherencemodel.get_coherence()
                coherence_values.append(coherence_score)
                print('num_topics:', num_topics, 'Perplexity:', perplexity_score, 'Coherence:', coherence_score )

    return model_list, perplexity_values, coherence_values

In [76]:
%%time

model_list, perplexity_values, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=texts, topic_count = [10,16,20])

num_topics: 10 Perplexity: -6.924524340374783 Coherence: 0.4112421429986789
num_topics: 16 Perplexity: -7.201013788618129 Coherence: 0.4216861221771744
num_topics: 20 Perplexity: -7.306132457672986 Coherence: 0.4312601859918617
CPU times: user 1h 34min 38s, sys: 1h 21min 11s, total: 2h 55min 50s
Wall time: 1h 13min 27s


The model with the highest scores is the one with 20 topics.
These are the top 10 keywords for each of the 20 topics.

In [81]:
# Print the Keyword in the 10 topics
pprint(model_list[-1].print_topics())
doc_lda = model_list[-1][corpus]

[(0,
  '0.026*"model" + 0.024*"response" + 0.022*"cell" + 0.016*"brain" + '
  '0.012*"stimulus" + 0.012*"subject" + 0.012*"population" + 0.011*"trial" + '
  '0.010*"correlation" + 0.010*"show"'),
 (1,
  '0.051*"policy" + 0.032*"action" + 0.029*"agent" + 0.026*"reward" + '
  '0.025*"state" + 0.015*"value" + 0.014*"use" + 0.013*"learn" + '
  '0.013*"reinforcement" + 0.013*"game"'),
 (2,
  '0.042*"gradient" + 0.037*"method" + 0.025*"step" + 0.024*"rate" + '
  '0.023*"optimization" + 0.022*"convergence" + 0.022*"update" + '
  '0.022*"iteration" + 0.019*"use" + 0.019*"stochastic"'),
 (3,
  '0.027*"state" + 0.025*"function" + 0.023*"dynamic" + 0.020*"time" + '
  '0.020*"system" + 0.014*"point" + 0.013*"value" + 0.013*"equation" + '
  '0.012*"process" + 0.011*"transition"'),
 (4,
  '0.044*"matrix" + 0.024*"kernel" + 0.017*"vector" + 0.015*"use" + '
  '0.012*"linear" + 0.011*"norm" + 0.011*"method" + 0.011*"sparse" + '
  '0.010*"dimensional" + 0.009*"rank"'),
 (5,
  '0.088*"image" + 0.029*"obj

Finally we visualize the topics using pyLDAvis tool.

In [82]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(model_list[-1], corpus, id2word)
vis