In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
import gensim
import pprint

from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
class config:
    """Notebook-wide settings, read as class attributes (never instantiated here)."""
    # When True, the notebook works on a random subset of the papers.
    DEBUG = False
    # Seed passed as random_state to the sampling and LDA training steps.
    SEED = 3407
    # Number of rows drawn when DEBUG is enabled.
    SAMPLE_SIZE = 1000

In [3]:
# Read the papers CSV from the relative input directory into a DataFrame.
df = pd.read_csv("../input/nips-papers/papers.csv")

# Print the (rows, columns) shape, then preview the first rows as the cell output.
print(df.shape)
df.head()

(7241, 7)


Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [4]:
# In DEBUG mode, continue with a seeded random subset of the papers so the
# rest of the notebook runs on less data.
if config.DEBUG:
    # Cap the request at len(df): DataFrame.sample raises ValueError when asked
    # for more rows than exist (sampling without replacement).
    df = df.sample(min(config.SAMPLE_SIZE, len(df)), random_state=config.SEED)

In [5]:
def _clean_text_resources():
    """Return the shared ``(lemmatizer, stopwords)`` pair, building it on first use.

    The pair is memoised on the function object so that the NLTK stopword list
    is loaded and the lemmatizer constructed once, rather than once per
    document passed to ``clean_text``.
    """
    cached = getattr(_clean_text_resources, "_cached", None)
    if cached is None:
        lemmatizer = nltk.stem.WordNetLemmatizer()
        stopwords_list = set(nltk.corpus.stopwords.words("english"))
        # Extra tokens to drop (presumably HTML remnants -- confirm against the data).
        stopwords_list.update(['href', 'br'])
        cached = (lemmatizer, frozenset(stopwords_list))
        _clean_text_resources._cached = cached
    return cached


def clean_text(text):
    """Normalise a raw document into a space-separated string of verb lemmas.

    Steps: lower-case the text, replace every character outside ``a-z`` with a
    space, tokenise with NLTK, drop English stopwords and tokens of three
    characters or fewer, then lemmatize the remaining tokens as verbs.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (e.g. a NaN from a missing
        DataFrame cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` when nothing survives.
    """
    # Missing values reach here as float NaN / None when applied over a column.
    if not isinstance(text, str):
        return ""

    lemmatizer, stopwords_list = _clean_text_resources()

    # Lower-case first so the character class below only needs a-z;
    # digits and punctuation become token separators.
    text_alpha = re.sub("[^a-z]", " ", text.lower())

    word_list = nltk.tokenize.word_tokenize(text_alpha)

    # Stopword and length filters are applied to the surface form, before
    # lemmatization (pos='v' treats every token as a verb).
    words = [
        lemmatizer.lemmatize(y, pos='v')
        for y in word_list
        if y not in stopwords_list and len(y) > 3
    ]

    return " ".join(words)

In [6]:
# Clean every paper body with clean_text and store the result in a new column.
# progress_apply is the tqdm-enabled variant of Series.apply, made available
# by the tqdm.pandas() call in the imports cell.
df['paper_text_clean'] = df['paper_text'].progress_apply(clean_text)

  0%|          | 0/7241 [00:00<?, ?it/s]

In [7]:
# Preview the frame, which now includes the paper_text_clean column.
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text,paper_text_clean
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,self organization associative database applica...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...,mean field theory layer visual cortex applicat...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...,store covariance associative long term potenti...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...,bayesian query construction neural network mod...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a...",neural network ensembles cross validation acti...


In [8]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7241 entries, 0 to 7240
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                7241 non-null   int64 
 1   year              7241 non-null   int64 
 2   title             7241 non-null   object
 3   event_type        2422 non-null   object
 4   pdf_name          7241 non-null   object
 5   abstract          7241 non-null   object
 6   paper_text        7241 non-null   object
 7   paper_text_clean  7241 non-null   object
dtypes: int64(2), object(6)
memory usage: 452.7+ KB


In [9]:
# Tokenise each cleaned document into a list of words (one list per paper).
# Iterating the column directly avoids the per-row Series construction of
# DataFrame.iterrows, and split() with no argument returns [] for an empty
# document where split(' ') would return [''] and add a bogus empty token.
clean_corpus = [
    doc.split()
    for doc in tqdm(df['paper_text_clean'], total=len(df))
]

# First 20 tokens of the first document, as a sanity check.
clean_corpus[0][:20]

  0%|          | 0/7241 [00:00<?, ?it/s]

['self',
 'organization',
 'associative',
 'database',
 'applications',
 'hisashi',
 'suzuki',
 'suguru',
 'arimoto',
 'osaka',
 'university',
 'toyonaka',
 'osaka',
 'japan',
 'abstract',
 'efficient',
 'method',
 'self',
 'organize',
 'associative']

In [10]:
# Build the token <-> integer-id mapping from the tokenised papers.
id2word = gensim.corpora.Dictionary(clean_corpus)

# Prune the vocabulary: keep tokens that occur in at least 15 documents and
# in no more than 10% of all documents, then cap it at the 100000 most
# frequent remaining tokens.
id2word.filter_extremes(
    no_below=15,
    no_above=0.1,
    keep_n=100000,
)

# The tokenised documents, under the name used for the corpus below.
texts = clean_corpus

# Bag-of-words: each document becomes a list of (token_id, count) pairs.
bow_corpus = list(map(id2word.doc2bow, texts))

# Show the first document's bag-of-words representation.
print(bow_corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 6), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 13), (12, 1), (13, 4), (14, 5), (15, 1), (16, 1), (17, 1), (18, 1), (19, 11), (20, 3), (21, 1), (22, 1), (23, 1), (24, 3), (25, 1), (26, 1), (27, 1), (28, 2), (29, 2), (30, 1), (31, 1), (32, 3), (33, 1), (34, 2), (35, 1), (36, 1), (37, 1), (38, 3), (39, 2), (40, 1), (41, 1), (42, 7), (43, 2), (44, 2), (45, 2), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 2), (53, 1), (54, 3), (55, 1), (56, 2), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 2), (68, 1), (69, 1), (70, 2), (71, 1), (72, 3), (73, 2), (74, 2), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 2), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 3), (96, 1), (97, 1), (98, 1), (99, 1), (100, 2), (101, 1), (102, 1), (103, 1), (104, 2), (105, 1), (106, 1), (107, 1), (108, 5), (109, 2), (110, 

In [11]:
# Fit a 10-topic LDA model on the bag-of-words counts, using 4 worker
# processes and 2 passes over the corpus; the seed comes from config.SEED.
lda_model_bow = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=id2word,
    num_topics=10,
    passes=2,
    workers=4,
    random_state=config.SEED,
)

In [12]:
# List every topic of the bag-of-words model (-1 requests all topics),
# one per line followed by a blank separator line.
for topic_id, topic_terms in lda_model_bow.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_terms), end='\n\n')

Topic: 0 Word: 0.012*"regret" + 0.006*"bandit" + 0.005*"arm" + 0.005*"hash" + 0.004*"motor" + 0.003*"vertices" + 0.003*"expert" + 0.003*"topics" + 0.003*"player" + 0.003*"vertex"

Topic: 1 Word: 0.004*"lasso" + 0.004*"tensor" + 0.003*"stream" + 0.003*"regret" + 0.003*"speaker" + 0.003*"parse" + 0.002*"sentence" + 0.002*"utility" + 0.002*"moments" + 0.002*"submodular"

Topic: 2 Word: 0.004*"policies" + 0.003*"fire" + 0.003*"trajectories" + 0.003*"oracle" + 0.002*"poisson" + 0.002*"agent" + 0.002*"analog" + 0.002*"motor" + 0.002*"wavelet" + 0.002*"receptive"

Topic: 3 Word: 0.007*"causal" + 0.004*"agent" + 0.004*"parent" + 0.003*"message" + 0.003*"dirichlet" + 0.003*"fire" + 0.003*"head" + 0.002*"sensor" + 0.002*"character" + 0.002*"vertices"

Topic: 4 Word: 0.010*"regret" + 0.006*"patch" + 0.005*"agent" + 0.005*"contour" + 0.003*"agents" + 0.003*"chip" + 0.003*"price" + 0.002*"auction" + 0.002*"mdps" + 0.002*"arm"

Topic: 5 Word: 0.008*"tensor" + 0.005*"synaptic" + 0.004*"fire" + 0.004*

In [13]:
# Fit a TF-IDF model on the bag-of-words corpus and use it to re-weight
# every document's term counts.
tfidf = gensim.models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Train a second LDA model with the same settings as the bag-of-words one,
# this time on the TF-IDF-weighted corpus.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus=corpus_tfidf,
    id2word=id2word,
    num_topics=10,
    passes=2,
    workers=4,
    random_state=config.SEED,
)

In [14]:
# List every topic of the TF-IDF model (-1 requests all topics),
# one per line followed by a blank separator line.
for topic_id, topic_terms in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_terms), end='\n\n')

Topic: 0 Word: 0.002*"regret" + 0.002*"motor" + 0.001*"expert" + 0.001*"robot" + 0.001*"recovery" + 0.001*"topics" + 0.001*"lasso" + 0.001*"hash" + 0.001*"trajectories" + 0.001*"anneal"

Topic: 1 Word: 0.002*"tensor" + 0.001*"regret" + 0.001*"message" + 0.001*"fire" + 0.001*"video" + 0.001*"lasso" + 0.001*"synaptic" + 0.001*"marginals" + 0.001*"bethe" + 0.001*"moments"

Topic: 2 Word: 0.002*"fire" + 0.001*"poisson" + 0.001*"wavelet" + 0.001*"agent" + 0.001*"oracle" + 0.001*"mutual" + 0.001*"warp" + 0.001*"receptive" + 0.001*"policies" + 0.001*"robot"

Topic: 3 Word: 0.002*"causal" + 0.002*"message" + 0.001*"parent" + 0.001*"agent" + 0.001*"vertex" + 0.001*"dirichlet" + 0.001*"manifold" + 0.001*"fire" + 0.001*"sampler" + 0.001*"mcmc"

Topic: 4 Word: 0.004*"regret" + 0.002*"lasso" + 0.002*"bandit" + 0.002*"arm" + 0.002*"contour" + 0.001*"patch" + 0.001*"learner" + 0.001*"bandits" + 0.001*"agent" + 0.001*"unlabeled"

Topic: 5 Word: 0.002*"tensor" + 0.001*"unlabeled" + 0.001*"completion" +

# Test on one paper

In [15]:
# First 20 tokens of the document at position 43, the example paper that the
# following cells score against both LDA models.
clean_corpus[43][:20]

['quadratic',
 'type',
 'lyapunov',
 'function',
 'competitive',
 'neural',
 'network',
 'different',
 'time',
 'scale',
 'anke',
 'meyer',
 'base',
 'institute',
 'technical',
 'informatics',
 'technical',
 'university',
 'darmstadt',
 'darmstadt']

In [16]:
# Topic mixture of document 43 under the bag-of-words model, highest score
# first; each topic is shown with its 10 top-weighted words.
for topic_id, weight in sorted(
    lda_model_bow[bow_corpus[43]],
    key=lambda pair: pair[1],
    reverse=True,
):
    print("\nScore: {}\t \nTopic: {}".format(
        weight,
        lda_model_bow.print_topic(topic_id, 10),
    ))


Score: 0.8058199286460876	 
Topic: 0.008*"tensor" + 0.005*"synaptic" + 0.004*"fire" + 0.004*"unlabeled" + 0.004*"completion" + 0.003*"cortical" + 0.003*"recovery" + 0.002*"poisson" + 0.002*"privacy" + 0.002*"excitatory"

Score: 0.18537653982639313	 
Topic: 0.005*"regret" + 0.003*"saliency" + 0.003*"ensemble" + 0.002*"convolutional" + 0.002*"analog" + 0.002*"chip" + 0.002*"teacher" + 0.002*"mask" + 0.002*"receptive" + 0.002*"prune"


In [17]:
# Topic mixture of document 43 under the TF-IDF model, highest score first;
# each topic is shown with its 10 top-weighted words.
for topic_id, weight in sorted(
    lda_model_tfidf[corpus_tfidf[43]],
    key=lambda pair: pair[1],
    reverse=True,
):
    print("\nScore: {}\t \nTopic: {}".format(
        weight,
        lda_model_tfidf.print_topic(topic_id, 10),
    ))


Score: 0.7367550134658813	 
Topic: 0.004*"regret" + 0.002*"lasso" + 0.002*"bandit" + 0.002*"arm" + 0.002*"contour" + 0.001*"patch" + 0.001*"learner" + 0.001*"bandits" + 0.001*"agent" + 0.001*"unlabeled"

Score: 0.1184585839509964	 
Topic: 0.003*"synaptic" + 0.002*"fire" + 0.002*"synapses" + 0.002*"gate" + 0.001*"synapse" + 0.001*"membrane" + 0.001*"regret" + 0.001*"postsynaptic" + 0.001*"submodular" + 0.001*"policies"

Score: 0.018099350854754448	 
Topic: 0.002*"causal" + 0.002*"message" + 0.001*"parent" + 0.001*"agent" + 0.001*"vertex" + 0.001*"dirichlet" + 0.001*"manifold" + 0.001*"fire" + 0.001*"sampler" + 0.001*"mcmc"

Score: 0.01809931918978691	 
Topic: 0.002*"tensor" + 0.001*"unlabeled" + 0.001*"completion" + 0.001*"manifold" + 0.001*"proximal" + 0.001*"lasso" + 0.001*"synaptic" + 0.001*"fire" + 0.001*"privacy" + 0.001*"admm"

Score: 0.018099239096045494	 
Topic: 0.003*"regret" + 0.002*"chip" + 0.002*"agent" + 0.001*"learner" + 0.001*"adaboost" + 0.001*"analog" + 0.001*"convolut