In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import timeit
import re             # regular expression library
from wordcloud import WordCloud
from wordcloud import WordCloud,STOPWORDS
from nltk.corpus import stopwords
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize
from collections import Counter
from gensim.models.phrases import Phrases, Phraser
#from gensim.corpora.dictionary import Dictionary
from gensim.models import TfidfModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
pwd

'/Users/Robert/DSI/Capstone_3'

In [3]:
# Read datasets/papers.csv into papers
papers = pd.read_csv('nips-papers/papers.csv')


In [4]:
papers.head(3)

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...


In [5]:
papers.tail(3)

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
7238,997,1994,A Real Time Clustering CMOS Neural Engine,,997-a-real-time-clustering-cmos-neural-engine.pdf,Abstract Missing,A Real Time Clustering CMOS\nNeural Engine\nT....
7239,998,1994,Learning direction in global motion: two class...,,998-learning-direction-in-global-motion-two-cl...,Abstract Missing,Learning direction in global motion: two\nclas...
7240,999,1994,Correlation and Interpolation Networks for Rea...,,999-correlation-and-interpolation-networks-for...,Abstract Missing,Correlation and Interpolation Networks for\nRe...


In [6]:
papers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7241 entries, 0 to 7240
Data columns (total 7 columns):
id            7241 non-null int64
year          7241 non-null int64
title         7241 non-null object
event_type    2422 non-null object
pdf_name      7241 non-null object
abstract      7241 non-null object
paper_text    7241 non-null object
dtypes: int64(2), object(5)
memory usage: 396.1+ KB


In [7]:
drop_cols = ['id', 'event_type', 'pdf_name']
papers.drop(columns=drop_cols, axis=1, inplace=True)

In [8]:
def make_df_LDA_period(low, high):
    '''make a dataframe given upper and lower year bounds'''
 
    df = papers[(papers.year >= low) & (papers.year <= high)]
    df.reset_index(inplace=True)
    print('{0:4d} thru {1:4d} has{2:4d} rows'.format(low, high, len(df)))
          
    return df

In [9]:
papers_87_88 = make_df_LDA_period(1987, 1988)

papers_89_90 = make_df_LDA_period(1989, 1990)

papers_91_92 = make_df_LDA_period(1991, 1992)

papers_93_94 = make_df_LDA_period(1993, 1994)

papers_95_96 = make_df_LDA_period(1995, 1996)

papers_97_98 = make_df_LDA_period(1997, 1998)

papers_99_00 = make_df_LDA_period(1999, 2000)

papers_01_02 = make_df_LDA_period(2001, 2002)

papers_03_04 = make_df_LDA_period(2003, 2004)

papers_05_06 = make_df_LDA_period(2005, 2006)
papers_07_08 = make_df_LDA_period(2007, 2008)
papers_09_10 = make_df_LDA_period(2009, 2010)
papers_11_12 = make_df_LDA_period(2011, 2012)
papers_13_14 = make_df_LDA_period(2013, 2014)
papers_15_16 = make_df_LDA_period(2015, 2016)
papers_17 = make_df_LDA_period(2017, 2017)

1987 thru 1988 has 184 rows
1989 thru 1990 has 244 rows
1991 thru 1992 has 271 rows
1993 thru 1994 has 298 rows
1995 thru 1996 has 304 rows
1997 thru 1998 has 301 rows
1999 thru 2000 has 302 rows
2001 thru 2002 has 404 rows
2003 thru 2004 has 405 rows
2005 thru 2006 has 411 rows
2007 thru 2008 has 467 rows
2009 thru 2010 has 554 rows
2011 thru 2012 has 674 rows
2013 thru 2014 has 771 rows
2015 thru 2016 has 972 rows
2017 thru 2017 has 679 rows


In [11]:
all_df = [papers_87_88, 
papers_89_90, 
papers_91_92,
papers_93_94,
papers_95_96, 
papers_97_98, 
papers_99_00, 
papers_01_02, 
papers_03_04, 
papers_05_06, 
papers_07_08,
papers_09_10, 
papers_11_12, 
papers_13_14, 
papers_15_16, 
papers_17] 

In [33]:
count = 0
for d in all_df:
    count += len(d)
print(count)
len(all_df)

7241


16

In [13]:
papers_09_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554 entries, 0 to 553
Data columns (total 5 columns):
index         554 non-null int64
year          554 non-null int64
title         554 non-null object
abstract      554 non-null object
paper_text    554 non-null object
dtypes: int64(2), object(3)
memory usage: 21.7+ KB


# Preprocess text

In [15]:
# Use stopwords from "nltk.corpus"
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) # sets are efficient
stop_words.update(["this", "that", "thus", "from", "does", "example", "however", "since", "given", 
                    "et", "al", "hence", "therefore", "use", "used", "note", "corresponding", 
                    "setting", "well", "one", "form", "using", "second", "even", "known", "either",
                    "consider", "particular", "general", "represent", "case", "output", "defined",
                    "rather", "though", "although", "set", "problem", "function", "figure", "given",
                    "results", "number", "time", "data", "results", "given", "information", "different",
                    "models", "method", "n", "j", "r", "l", "mj", "a", "cj", "zero", "iv", "is", "elsewhere",
                    "large", "may", "finally", "c", "describing", "l", "cells", "single", "field",
                    "approximation", "activity", "mean", "input"])


In [16]:
%%time
# This function removes process the stopwords. Takes time, 3.5 mins
def rem_stopwords(txt):
    words = txt.split()
    ret_word = words.copy()
    for w in words: 
        if w.lower() in stop_words:
            ret_word.remove(w)
    return (" ".join(ret_word))

for df in all_df:
    df['text_processed'] = df['paper_text'].map(rem_stopwords)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


CPU times: user 3min 33s, sys: 201 ms, total: 3min 33s
Wall time: 3min 33s


In [17]:
%%time
# This cell takes a long time, longer than 3 mins(seems slower)
# #Print the text of the first 3 rows 
# print(papers_thru_2000['paper_text'].head(3))

# Remove punctuation
for df in all_df:
    df['text_processed'] = df['text_processed'].map(lambda text: [w for w in word_tokenize(text.lower()) 
                      if w.isalpha()])
    df['text_string'] = [" ".join(map(str, l)) for l in df['text_processed']]
    # # Print the processed titles of the first rows 
    print(df.head(2))
    print()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


   index  year                                              title  \
0      0  1987  Self-Organization of Associative Database and ...   
1      1  1987  A Mean Field Theory of Layer IV of Visual Cort...   

           abstract                                         paper_text  \
0  Abstract Missing  767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...   
1  Abstract Missing  683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...   

                                      text_processed  \
0  [associative, database, applications, hisashi,...   
1  [theory, layer, visual, cortex, application, a...   

                                         text_string  
0  associative database applications hisashi suzu...  
1  theory layer visual cortex application artific...  

   index  year                                              title  \
0    984  1989     Contour-Map Encoding of Shape for Early Vision   
1    995  1989  Neural Network Weight Matrix Synthesis Using O...   

           abstract       

   index  year                                              title  \
0   2899  2009                   Kernel Methods for Deep Learning   
1   2900  2009  Distribution-Calibrated Hierarchical Classific...   

                                            abstract  \
0  We introduce a new family of positive-definite...   
1  While many advances have already been made on ...   

                                          paper_text  \
0  Kernel Methods for Deep Learning\n\nYoungmin C...   
1  Distribution-Calibrated Hierarchical Classific...   

                                      text_processed  \
0  [kernel, methods, deep, learning, youngmin, ch...   
1  [hierarchical, classification, ofer, dekel, mi...   

                                         text_string  
0  kernel methods deep learning youngmin cho lawr...  
1  hierarchical classification ofer dekel microso...  

   index  year                                              title  \
0   3514  2011   A Non-Parametric Approach to Dyna

In [69]:
# %%time
# for df in all_df:
#     df['text_string'] = [" ".join(map(str, l)) for l in df['text_processed']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


CPU times: user 5.22 s, sys: 30.5 ms, total: 5.25 s
Wall time: 5.25 s


In [18]:
# # Use stopwords from "nltk.corpus"
# from nltk.corpus import stopwords
# stop_words = set(stopwords.words('english')) # sets are efficient
STOPWORDS.update(["this", "that", "thus", "from", "does", "example", "however", "since", "given", 
                    "et", "al", "hence", "therefore", "use", "used", "note", "corresponding", 
                    "setting", "well", "one", "form", "using", "second", "even", "known", "either",
                    "consider", "particular", "general", "represent", "case", "output", "defined",
                    "rather", "though", "although", "set", "problem", "function", "figure", "given",
                    "results", "number", "time", "data", "results", "given", "information", "different",
                    "models", "method", "n", "j", "r", "l", "mj", "a", "cj", "zero", "iv", "is", "elsewhere",
                    "large", "may", "finally", "c", "describing", "l", "cells", "single", "field",
                    "approximation", "activity", "mean", "input", "x", "f", "here", "y", "means", "make", "say",
                    "that", "then", "good", "clarify", "b", "t", "layers", "s", "e", "units", "unit", "input", "h", 
                    "z", "p", "xt", "items", "player", "two", "item", "let", "d", "w", "ti", "m", "g", "local",
                    "q", "actions", "users", "system", "regret", "user", "xi", "rule",
                    "eq", "kl", "em", "v", "u", "first", "entries", "optimal", "error", "o", "result",
                    "see", "section", "update", "theorem", "variables", "rl", "st", "size", "based", "fig", "source",
                    "sources", "step", "o", "xi", "vi", "approach", "yt", "c", "new", "unit", "systems", "points", "point",
                    "test", "values", "value", "fi", "examples", "representation", "processing", "shown", "inputs", "initial",
                    "many", "problems", "speaker", "yi", "effect", "gp", "ya", "ylx", "algorithm", "pa", "ga"])
print(len(STOPWORDS))

340


In [19]:

def tokenize(text):
    return [token for token in text if token not in STOPWORDS]

In [26]:

def token_2(df):
    '''input df, returns tokenized text for that df'''
    paps = df.text_processed
    documents = paps.tolist()
    texts = []
    for doc in documents:
        texts.append(tokenize(doc))
    return(texts)

In [27]:
type(token_2)

function

In [28]:
texts_87_88 =  token_2(papers_87_88)
texts_89_90 =  token_2(papers_89_90) 
texts_91_92 =  token_2(papers_91_92)
texts_93_94 =  token_2(papers_93_94)
texts_95_96 =  token_2(papers_95_96) 
texts_97_98 =  token_2(papers_97_98) 

texts_99_00 =  token_2(papers_99_00) 
texts_01_02 =  token_2(papers_01_02) 
texts_03_04 =  token_2(papers_03_04) 

texts_05_06 =  token_2(papers_05_06) 
texts_07_08 =  token_2(papers_07_08)
texts_09_10 =  token_2(papers_09_10) 
texts_11_12 =  token_2(papers_11_12) 
texts_13_14 =  token_2(papers_13_14) 
texts_15_16 =  token_2(papers_15_16) 
texts_17 =  token_2(papers_17)

In [30]:
len(texts_17[0])

2816

In [32]:
all_texts = [
texts_87_88, 
texts_89_90, 
texts_91_92,

texts_93_94,

texts_95_96, 
texts_97_98, 

texts_99_00, 
texts_01_02, 
texts_03_04, 

texts_05_06, 
texts_07_08,
texts_09_10, 
texts_11_12, 
texts_13_14, 
texts_15_16, 
texts_17] 
len(all_texts)

16

In [199]:
print(type(texts))
len(texts)

<class 'list'>


755

In [34]:
trg_sent = [['machine','learning','neural','network','dummy','neural','networks', "deep", "learning", "dum", "expert", "system", "expert","systems" ], 
            ['neural','networks', 'dummy','neural','network','machine','learning',"deep", "learning", "expert", "system", "expert","systems"]]
phrases = Phrases(trg_sent, min_count=1, threshold=1)
bigram = Phraser(phrases)
test_sent = ['machine','learning','neural','network','neural','networks', "deep", "learning", "expert", "system", "expert", "system"]
print(bigram[test_sent])

['machine_learning', 'neural_network', 'neural_networks', 'deep_learning', 'expert_system', 'expert_system']


In [39]:
def corpus_1(texts):
    corpus = []
    for paper in texts:
        bigram
        corpus.append(bigram[paper])
    return(corpus)

In [40]:
corpus_87_88  = corpus_1(texts_87_88)
corpus_89_90  = corpus_1(texts_89_90)
corpus_91_92  = corpus_1(texts_91_92)
corpus_93_94  = corpus_1(texts_93_94)
corpus_95_96  = corpus_1(texts_95_96)
corpus_97_98  = corpus_1(texts_97_98)
corpus_99_00  = corpus_1(texts_99_00)
corpus_01_02  = corpus_1(texts_01_02)
corpus_03_04  = corpus_1(texts_03_04) 
corpus_05_06  = corpus_1(texts_05_06)
corpus_07_08  = corpus_1(texts_07_08)
corpus_09_10  = corpus_1(texts_09_10)
corpus_11_12  = corpus_1(texts_11_12)
corpus_13_14  = corpus_1(texts_13_14)
corpus_15_16  = corpus_1(texts_15_16)
corpus_17  = corpus_1(texts_17)

In [42]:
#print(texts[5])
corpus_17[111]

['decoders',
 'encoding',
 'future',
 'recurrent',
 'networks',
 'arun',
 'nicholas',
 'wen',
 'lerrel',
 'pinto',
 'martial',
 'byron',
 'kris',
 'andrew',
 'robotics',
 'institute',
 'university',
 'pittsburgh',
 'pa',
 'school',
 'interactive',
 'computing',
 'georgia',
 'institute',
 'technology',
 'atlanta',
 'ga',
 'abstract',
 'recurrent',
 'neural_networks',
 'rnns',
 'vital',
 'modeling',
 'technique',
 'rely',
 'internal',
 'states',
 'learned',
 'indirectly',
 'optimization',
 'supervised',
 'unsupervised',
 'reinforcement',
 'training',
 'loss',
 'rnns',
 'model',
 'dynamic',
 'processes',
 'characterized',
 'underlying',
 'latent',
 'states',
 'whose',
 'often',
 'unknown',
 'precluding',
 'analytic',
 'inside',
 'rnn',
 'psr',
 'literature',
 'latent',
 'state',
 'processes',
 'modeled',
 'internal',
 'state',
 'directly',
 'distribution',
 'future',
 'observations',
 'recent',
 'work',
 'area',
 'relied',
 'explicitly',
 'representing',
 'targeting',
 'sufficient',
 'sta

In [202]:
corpus[2]

['architectural',
 'mechanism',
 'cortical',
 'simple',
 'role',
 'mutual',
 'inhibition',
 'silvio',
 'sabatini',
 'silvio',
 'fabio',
 'solari',
 'fabio',
 'dibe',
 'giacomo',
 'bisio',
 'bisio',
 'department',
 'biophysical',
 'electronic',
 'engineering',
 'pspc',
 'research',
 'group',
 'genova',
 'italy',
 'abstract',
 'linear',
 'architectural',
 'model',
 'cortical',
 'simple',
 'presented',
 'model',
 'evidences',
 'mutual',
 'inhibition',
 'occurring',
 'synaptic',
 'coupling',
 'functions',
 'asymmetrically',
 'distributed',
 'space',
 'possible',
 'basis',
 'wide',
 'variety',
 'simple',
 'cell',
 'response',
 'properties',
 'including',
 'direction',
 'selectivity',
 'velocity',
 'tuning',
 'spatial',
 'asymmetries',
 'included',
 'explicitly',
 'structure',
 'inhibitory',
 'interconnections',
 'temporal',
 'asymmetries',
 'originate',
 'specific',
 'mutual',
 'inhibition',
 'scheme',
 'considered',
 'extensive',
 'simulations',
 'supporting',
 'model',
 'reported',
 'intr

In [50]:
from collections import defaultdict
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from pprint import pprint

def bow_LDA_topics(corpus):
    frequency = defaultdict(int)
    for text in corpus:
        for token in text:
            frequency[token] += 1
    texts_2 = [[token for token in text if frequency[token] > 10] for text in corpus]
    return(texts_2)
    

In [54]:
texts_2_87_88  = bow_LDA_topics(corpus_87_88)
texts_2_89_90  = bow_LDA_topics(corpus_89_90)
texts_2_91_92  = bow_LDA_topics(corpus_91_92)

texts_2_93_94  = bow_LDA_topics(corpus_93_94)

texts_2_95_96  = bow_LDA_topics(corpus_95_96)
texts_2_97_98  = bow_LDA_topics(corpus_97_98)

texts_2_99_00  = bow_LDA_topics(corpus_99_00)
texts_2_01_02  = bow_LDA_topics(corpus_01_02)
texts_2_03_04  = bow_LDA_topics(corpus_03_04) 

texts_2_05_06  = bow_LDA_topics(corpus_05_06)
texts_2_07_08  = bow_LDA_topics(corpus_07_08)
texts_2_09_10  = bow_LDA_topics(corpus_09_10)
texts_2_11_12  = bow_LDA_topics(corpus_11_12)
texts_2_13_14  = bow_LDA_topics(corpus_13_14)
texts_2_15_16  = bow_LDA_topics(corpus_15_16)
texts_2_17  = bow_LDA_topics(corpus_17)                             

In [59]:
def build_corpora(texts_2):
    '''returns dictionary and corpus'''
    dictionary = corpora.Dictionary(texts_2)
    corpus_test = []
    for text in texts_2:
        corpus_test.append(dictionary.doc2bow(text))
    return(dictionary, corpus_test)

In [63]:
dict_87_88 , corp_87_88  = build_corpora(texts_2_87_88)
dict_89_90 , corp_89_90  = build_corpora(texts_2_89_90)
dict_91_92 , corp_91_92  = build_corpora(texts_2_91_92)

dict_93_94 , corp_93_94  = build_corpora(texts_2_93_94)

dict_95_96 , corp_95_96  = build_corpora(texts_2_95_96)
dict_97_98 , corp_97_98  = build_corpora(texts_2_97_98)

dict_99_00 , corp_99_00  = build_corpora(texts_2_99_00)
dict_01_02 , corp_01_02  = build_corpora(texts_2_01_02)
dict_03_04 , corp_03_04  = build_corpora(texts_2_03_04) 

dict_05_06 , corp_05_06  = build_corpora(texts_2_05_06)
dict_07_08 , corp_07_08  = build_corpora(texts_2_07_08)
dict_09_10 , corp_09_10  = build_corpora(texts_2_09_10)
dict_11_12 , corp_11_12  = build_corpora(texts_2_11_12)
dict_13_14 , corp_13_14  = build_corpora(texts_2_13_14)
dict_15_16 , corp_15_16  = build_corpora(texts_2_15_16)
dict_17 , corp_17  = build_corpora(texts_2_17)

In [67]:
# # #____________________________________________________________________________________________________________

# fit gensim's LDA model
# the number of topics should be optimized iteratively
topics_87_88 = LdaModel(corpus=corp_87_88,
                           id2word=dict_87_88,
                           num_topics=15,
                           passes=10)

In [68]:
# print out first 10 topics
for i, topic in enumerate(topics_87_88.print_topics(15)):
    pprint("{} --- {}".format(i, topic))
    print()


('0 --- (0, \'0.020*"image" + 0.010*"recognition" + 0.008*"vector" + '
 '0.007*"images" + 0.006*"memory" + 0.006*"learning" + 0.006*"operator" + '
 '0.005*"object" + 0.005*"color" + 0.005*"pixels"\')')

('1 --- (1, \'0.009*"neural" + 0.008*"state" + 0.007*"learning" + '
 '0.007*"neuron" + 0.006*"current" + 0.006*"signal" + 0.005*"neurons" + '
 '0.005*"action" + 0.005*"net" + 0.005*"network"\')')

('2 --- (2, \'0.013*"memory" + 0.009*"neurons" + 0.009*"chip" + 0.008*"neural" '
 '+ 0.008*"neuron" + 0.008*"weight" + 0.008*"stored" + 0.006*"connection" + '
 '0.006*"parallel" + 0.006*"analog"\')')

('3 --- (3, \'0.019*"network" + 0.013*"learning" + 0.012*"memory" + '
 '0.011*"capacity" + 0.010*"model" + 0.009*"patterns" + 0.008*"orientation" + '
 '0.007*"space" + 0.006*"states" + 0.006*"signal"\')')

('4 --- (4, \'0.016*"model" + 0.011*"network" + 0.011*"response" + '
 '0.009*"stimulus" + 0.008*"patterns" + 0.007*"cell" + 0.007*"neurons" + '
 '0.007*"activation" + 0.007*"potential" + 0.006*

In [69]:
# Takes about 5-10 minutes, depends how many papers
import pyLDAvis.gensim as gensimvis
import pyLDAvis
vis_data = gensimvis.prepare(topics_87_88, corp_87_88, dict_87_88)
pyLDAvis.display(vis_data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
