# Notebook to experiment with topic model

In [289]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()

import nltk

nltk.download('wordnet')
from nltk.corpus import wordnet as wn
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /Users/ellen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ellen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [290]:
import load_pdfs, read_pdf, topic_model

### Part 1 Load PDFs from folder 'papers'

In [303]:
def getFileNames():
    foldername = 'papers'
    file_names = os.listdir(foldername)
    return foldername, file_names

In [304]:
foldername, filenames = load_pdfs.getFileNames()

try:
    for each in filenames:
        print(each)
        print('\n')
except:
    pass

Hybrid Recommender Systems.pdf


Information Theory - Tutorial.pdf


Geometric Understanding of Deep Learning.pdf


1802.05968v2.pdf


Time Series Feature Extraction.pdf


An Introduction to DRL.pdf


Multitask Learning as Multiobjective Optimization.pdf


GANs.pdf


Introduction to Transfer Learning.pdf


Deep CNN Design Patterns.pdf




### Part 2 Retrieve the text from the PDF files

In [305]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, TextConverter
from pdfminer.layout import LAParams
import io

def pdfparser(data):
    fp = open(data, 'rb')
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.

    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        data = retstr.getvalue()
    return data


In [306]:
text = pdfparser(foldername+'/'+filenames[1])

In [307]:
tokens = topic_model.prepare_text_for_lda(text)

In [308]:
# Calculate frequency distribution
fdist = nltk.FreqDist(tokens)

# Output top 50 words

for word, frequency in fdist.most_common(10):
    print(u'{};{}'.format(word, frequency))

information;133
entropy;84
channel;74
variable;51
shannon;47
noise;47
input;47
output;43
theory;42
value;39


# Remove punctuation, newline chars, stopwords etc

In [310]:
print(len(text))
print(text[420:425])
tokens = nlp(text)
print(len(tokens))
print(tokens[420:425])

43507
 comp


10153


channel (Figure 1


In [313]:
from nltk.corpus import stopwords
STOPLIST = set(stopwords.words('english'))
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”"]
punctuations = string.punctuation

def clean_up(tokens):
    clean_tokens = []
    for each in tokens:
        if each.is_stop == False:
            if each.is_punct == False:
                clean_tokens.append(each)
    final_tokens = [tok.lemma_.lower().strip() for tok in clean_tokens if tok.lemma_ != '-PRON-']
    final_tokens = [tok for tok in final_tokens if tok not in STOPLIST and tok not in punctuations]
    final_tokens = ' '.join(final_tokens)
    return final_tokens

# cleaned_tokens = []
# for each in spacy_tokens:
#     if each.is_stop == False:
#        cleaned_tokens.append(each)
            
# print(cleaned_tokens.type)
# spacy_tokens = nlp(str(cleaned_tokens))
# print(spacy_tokens[400:500])

In [316]:
# clean_text = remove_stopwords(text)
clean_tokens = clean_up(tokens)
clean_tokens[300:400]

'ormation transmit diﬀerent component man biological system paper informal rigorous introduction main'

In [254]:
# clean_text = remove_symbols(strclean_text)

In [317]:
# Remove commas and \n, and words of len 1, and \n\n

# import string
import re


# print(spacy_tokens[2010:2040])
# print(clean_text[200:300])
# print(len(clean_text))

# Spacy pipeline

In [318]:
nlp = spacy.load('en_core_web_sm')  
spacy_tokens = nlp(clean_tokens)

# for token in doc:
#     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#             token.shape_, token.is_alpha, token.is_stop)
print(len(spacy_tokens))

instruction NOUN
need VERB
arrive ADJ
destination NOUN
left VERB
turn NOUN
indicate VERB
0 NUM
right ADJ
turn NOUN
1 NUM
2 NUM
abd0 NOUN
1 NUM
1 NUM
30 NUM
0 NUM
0 NUM
00 NUM
0 NUM
1 NUM
10 NUM
1 NUM
0 SYM
21 NUM
0 NUM
0 NUM
41 NUM
0 NUM
1 NUM
right hand summarise instruction
arrive destination
left turn


# Stemming - Snowball stemmers

In [319]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer(language='english')

# test_tokens = ['compute', 'computer', 'computed', 'computing']
# document = sp(spacy_tokens[200:300])
id_sequence = map(lambda x: x.orth, [token for token in spacy_tokens[200:300]])
text = map(lambda x: sp.vocab[x].text, [id for id in id_sequence])

for token in text:  
    print(token + ' --> ' + stemmer.stem(token))

nearly --> near
reach --> reach
judicious --> judici
packaging --> packag
encoding --> encod
datum --> datum
2 --> 2
find --> find
route --> rout
bit --> bit
bit --> bit
information --> inform
usually --> usual
measure --> measur
bit --> bit
bit --> bit
information --> inform
allow --> allow
choose --> choos
equally --> equal
probable --> probabl
equiprobable --> equiprob
alternative --> altern
order --> order
understand --> understand
imagine --> imagin
stand --> stand
fork --> fork
road --> road
point --> point
figure --> figur
2 --> 2
want --> want
point --> point
mark --> mark
d. --> d.
fork --> fork
represent --> repres
equiprobable --> equiprob
alternative --> altern
tell --> tell
left --> left
receive --> receiv
bit --> bit
information --> inform
represent --> repres
instruction --> instruct
binary --> binari
digit --> digit
0=left --> 0=left
1=right --> 1=right
binary --> binari
digit --> digit
provide --> provid
bit --> bit
information --> inform
tell --> tell
road --> road
ch

# Lemmatization - Spacy lemmatizer

In [320]:
for word in spacy_tokens[200:220]:  
    pass
    # print(str(word.text) + "-->" +   str(word.lemma_))

spacy_lemmas = []
for word in spacy_tokens:
    spacy_lemmas.append(word.lemma_)
print(spacy_lemmas[200:300])

['nearly', 'reach', 'judicious', 'packaging', 'encode', 'datum', '2', 'find', 'route', 'bit', 'bit', 'information', 'usually', 'measure', 'bit', 'bit', 'information', 'allow', 'choose', 'equally', 'probable', 'equiprobable', 'alternative', 'order', 'understand', 'imagine', 'stand', 'fork', 'road', 'point', 'figure', '2', 'want', 'point', 'mark', 'd.', 'fork', 'represent', 'equiprobable', 'alternative', 'tell', 'left', 'receive', 'bit', 'information', 'represent', 'instruction', 'binary', 'digit', '0=left', '1=right', 'binary', 'digit', 'provide', 'bit', 'information', 'tell', 'road', 'choose', 'imagine', 'come', 'fork', 'point', 'b', 'figure', '2', 'binary', 'digit', '1=right', 'provide', 'bit', 'information', 'allow', 'choose', 'correct', 'road', 'lead', 'c.', 'note', 'c', 'possible', 'interim', 'destination', 'figure', '2', 'traveller', 'know', 'way', 'fork', 'road', 'require', 'bit', 'information', 'correct', 'decision', '0s', '1s', 'right', 'hand', 'summarise']


# Entity detection

In [321]:

for word in spacy_tokens[300:330]:  
    print(word.text,  word.pos_)
for noun in spacy_tokens[300:330].noun_chunks:  
    print(noun.text)


instruction NOUN
need VERB
arrive ADJ
destination NOUN
left VERB
turn NOUN
indicate VERB
0 NUM
right ADJ
turn NOUN
1 NUM
2 NUM
abd0 NOUN
1 NUM
1 NUM
30 NUM
0 NUM
0 NUM
00 NUM
0 NUM
1 NUM
10 NUM
1 NUM
0 SYM
21 NUM
0 NUM
0 NUM
41 NUM
0 NUM
1 NUM
right hand summarise instruction
arrive destination
left turn


In [113]:
spacy_lemmas = nlp(str(spacy_lemmas))
for word in spacy_lemmas[250:300]:  
    print(word.text,  word.pos_)
for noun in spacy_lemmas[250:300].noun_chunks:  
    print(noun.text)

, PUNCT
' PUNCT
v ADP
' PUNCT
, PUNCT
' PUNCT
, PUNCT
' PUNCT
, PUNCT
' PUNCT
\n SPACE
' PUNCT
, PUNCT
' PUNCT
, PUNCT
' PUNCT
, PUNCT
' PUNCT
8 NUM
' NOUN
, PUNCT
' PUNCT
, PUNCT
' PUNCT
, PUNCT
' PUNCT
\n SPACE
' PUNCT
, PUNCT
' PUNCT
, PUNCT
' PUNCT
, PUNCT
' PUNCT
6 NUM
' NOUN
, PUNCT
' PUNCT
, PUNCT
' PUNCT
, PUNCT
' PUNCT
\n SPACE
' PUNCT
, PUNCT
' PUNCT
, PUNCT
' PUNCT
, PUNCT
' PUNCT


# Analyze ngrams

In [324]:
import nltk
from nltk.util import ngrams

def word_grams(words, number):
    s = []
    for ngram in ngrams(words, number):
            s.append(' '.join(str(i) for i in ngram))
    return s

In [330]:
bigrams = word_grams(spacy_tokens, 2)
from collections import Counter
count_grams = Counter(bigrams)
count_grams.most_common(10)

[('information theory', 31),
 ('binary digit', 22),
 ('channel capacity', 20),
 ('log 1', 20),
 ('shannon ’s', 19),
 ('bit information', 14),
 ('variable x', 11),
 ('entropy h(x', 11),
 ('shannon information', 10),
 ('mutual information', 10)]