# Notebook for experimenting with topic models

In [1]:
import spacy
# NOTE(review): the loaded pipeline is not bound to a name, so this call only
# has the side effect of loading the 'en' model.
spacy.load('en')
from spacy.lang.en import English
# English language object; `tokenize` further down this notebook calls it on raw text.
parser = English()

import nltk

# Download the WordNet corpus used through `wn` below.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
# Download the stop-word lists and keep the English ones as a set for membership tests.
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /Users/ellen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ellen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
import load_pdfs, read_pdf, topic_model

[nltk_data] Downloading package wordnet to /Users/ellen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ellen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Part 1: Load PDFs from the 'papers' folder

In [11]:
def getFileNames(foldername='papers'):
    """Return a folder name together with the names of the entries inside it.

    Parameters
    ----------
    foldername : str, optional
        Directory to list. Defaults to ``'papers'``, the folder this notebook
        has always read from.

    Returns
    -------
    tuple
        ``(foldername, file_names)`` where ``file_names`` is the list returned
        by ``os.listdir(foldername)``.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when the folder cannot be listed.
    """
    # Imported locally because no visible cell of this notebook imports `os`;
    # without it the function raises NameError when called from the notebook.
    import os

    file_names = os.listdir(foldername)
    return foldername, file_names

In [12]:
# Folder name and file names as reported by the imported `load_pdfs` module.
foldername, filenames = load_pdfs.getFileNames()

# Print every file name followed by blank lines.
# Deliberately not wrapped in `try/except: pass`: swallowing every exception
# would hide real problems (for example an unexpected return value) and leave
# the reader with an empty listing and no explanation.
for each in filenames:
    print(each)
    print('\n')

Hybrid Recommender Systems.pdf


Information Theory - Tutorial.pdf


Geometric Understanding of Deep Learning.pdf


1802.05968v2.pdf


Time Series Feature Extraction.pdf


An Introduction to DRL.pdf


Multitask Learning as Multiobjective Optimization.pdf


GANs.pdf


Introduction to Transfer Learning.pdf


Deep CNN Design Patterns.pdf




### Part 2: Retrieve the text from the PDF files

In [15]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, TextConverter
from pdfminer.layout import LAParams
import io

def pdfparser(data):
    """Extract the text of a PDF file.

    Parameters
    ----------
    data : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The text written by the converter for all pages, in page order.
        An empty string when the document yields no pages.

    Raises
    ------
    OSError
        Propagated from ``open`` when the file cannot be opened.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # `with` guarantees the file handle is released even if a page fails.
        with open(data, 'rb') as fp:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        # Read the buffer once, after the last page, and never rebind `data`:
        # the path must not be returned in place of text for a page-less PDF.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()


In [16]:
# Extract the text of the second entry of the file listing.
# NOTE(review): index 1 is hard-coded, and nothing in this notebook sorts
# `filenames`, so which paper is parsed depends on the order the listing was
# returned in — confirm it is the intended document.
text = pdfparser(foldername+'/'+filenames[1])

In [17]:
# print(text)

### Topic model part

In [18]:
# NOTE(review): this setup repeats the notebook's first cell statement for
# statement (same imports, model load, downloads, `parser` and `en_stop`).
import spacy
# The loaded pipeline is not bound to a name; only the load itself happens.
spacy.load('en')
from spacy.lang.en import English
# English language object that `tokenize` below calls on raw text.
parser = English()

import nltk

# Download the WordNet corpus used through `wn` in `get_lemma`.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
# Download the stop-word lists; the English ones are kept as a set for `prepare_text_for_lda`.
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

from nltk.stem.wordnet import WordNetLemmatizer


def get_lemma(word):
    """Return ``wn.morphy(word)``, or ``word`` unchanged when that is ``None``."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form


def get_lemma2(word):
    """Lemmatize ``word`` with a newly created ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)


def prepare_text_for_lda(text):
    """Turn raw text into the token list used for topic modelling.

    The text is split with ``tokenize``; tokens of four characters or fewer
    and tokens found in ``en_stop`` are dropped; every remaining token is
    passed through ``get_lemma``.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]


def tokenize(text):
    """Split ``text`` with ``parser`` and normalise each token.

    Whitespace-only tokens are skipped, URL-like tokens become ``'URL'``,
    tokens starting with ``'@'`` become ``'SCREEN_NAME'`` and every other
    token is replaced by its lower-cased form.
    """
    normalised = []
    for tok in parser(text):
        surface = tok.orth_
        if surface.isspace():
            continue
        if tok.like_url:
            replacement = 'URL'
        elif surface.startswith('@'):
            replacement = 'SCREEN_NAME'
        else:
            replacement = tok.lower_
        normalised.append(replacement)
    return normalised

[nltk_data] Downloading package wordnet to /Users/ellen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ellen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Prepared tokens of the extracted document. This calls the implementation in
# the imported `topic_model` module, not the function of the same name that
# is defined in this notebook.
tokens = topic_model.prepare_text_for_lda(text)
# tokens

In [23]:
# Tokens from `topic_model.tokenize` alone, i.e. without the extra steps of
# `prepare_text_for_lda`; again taken from the imported module.
tokens2 = topic_model.tokenize(text)
# tokens2

In [24]:
# Calculate the frequency distribution of the prepared tokens
fdist = nltk.FreqDist(tokens)

# Output the 50 most common tokens, one "token;count" pair per line

for word, frequency in fdist.most_common(50):
    print(u'{};{}'.format(word, frequency))

information;133
entropy;84
channel;74
variable;51
shannon;47
noise;47
input;47
output;43
theory;42
value;39
capacity;32
values;31
gaussian;31
signal;30
distribution;29
binary;24
average;24
figure;23
digit;23
possible;23
outcome;23
amount;21
equation;21
represent;20
given;19
frequency;19
communicate;18
equiprobable;18
fourier;18
theorem;17
probability;16
cid:90;16
maximum;16
second;16
h(y|x;16
number;15
power;15
provide;14
component;13
state;13
yield;13
uncertainty;13
introduction;12
surprise;12
m(cid:88;12
coding;12
reduce;12
variance;12
transmit;11
deﬁned;11


# Part-of-speech tagging and noun chunks (spaCy)

In [27]:
# Load the 'en_core_web_sm' pipeline and run it over the extracted document text.
sp = spacy.load('en_core_web_sm')  
spacy_tokens = sp(text)

In [31]:
# Show each token of a ten-token window next to its `pos_` attribute.
for word in spacy_tokens[190:200]:  
    print(word.text,  word.pos_)



 SPACE
information NOUN
. PUNCT
Before ADP
Shannon PROPN
’s PROPN
paper NOUN
, PUNCT
information NOUN
had VERB


In [32]:
# Print the noun chunks reported for the same ten-token window.
for noun in spacy_tokens[190:200].noun_chunks:  
    print(noun.text)

information
Shannon’s paper
information


# Stemming - Snowball stemmer

In [41]:

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer(language='english')

# Not used by any visible cell; kept in case a later cell relies on it.
test_tokens = ['compute', 'computer', 'computed', 'computing']

# Verbatim text of a 20-token window of the parsed document.
# Stored under its own name on purpose: rebinding `text` here would replace
# the extracted document text with a one-shot iterator, so re-running any
# cell that reads `text` (e.g. `sp(text)`) would fail. Reading `token.text`
# directly also avoids the orth-id -> vocab lookup round trip and the
# shadowing of the builtin `id`.
stem_sample_words = [token.text for token in spacy_tokens[200:220]]

for token in stem_sample_words:
    print(token + ' --> ' + stemmer.stem(token))

been --> been
viewed --> view
as --> as
a --> a
kind --> kind
of --> of
poorly --> poor


 --> 


deﬁned --> deﬁn
miasmic --> miasmic
ﬂuid --> ﬂuid
. --> .
But --> but
after --> after
Shannon --> shannon
’s --> ’s
paper --> paper
, --> ,
it --> it
became --> becam


# Lemmatization - Spacy lemmatizer

In [47]:
# Show each token of the 20-token window next to its spaCy lemma.
for word in spacy_tokens[200:220]:
    print(word.text + "-->" + word.lemma_)


been-->be
viewed-->view
as-->as
a-->a
kind-->kind
of-->of
poorly-->poorly


-->


deﬁned-->deﬁne
miasmic-->miasmic
ﬂuid-->ﬂuid
.-->.
But-->but
after-->after
Shannon-->Shannon
’s-->’s
paper-->paper
,-->,
it-->-PRON-
became-->become


# Analyze ngrams

In [25]:
import nltk
from nltk.util import ngrams

def word_grams(words, number):
    """Return the n-grams of ``words`` as space-joined strings.

    ``number`` is the n-gram size handed to ``ngrams``; every element of an
    n-gram is converted with ``str`` before joining.
    """
    return [
        ' '.join(str(part) for part in gram)
        for gram in ngrams(words, number)
    ]

In [26]:
# Bigrams of the prepared tokens; the full list is echoed as the cell output.
word_grams(tokens, 2)

['information theory',
 'theory tutorial',
 'tutorial introduction',
 'introduction james',
 'james stone',
 'stone psychology',
 'psychology department',
 'department university',
 'university sheﬃeld',
 'sheﬃeld england',
 'england j.v.stone@sheﬃeld.ac.uk',
 'j.v.stone@sheﬃeld.ac.uk informationtheory',
 'informationtheory jvstone',
 'jvstone v3.tex',
 'v3.tex abstract',
 'abstract shannon',
 'shannon mathematical',
 'mathematical theory',
 'theory communication',
 'communication deﬁnes',
 'deﬁnes fundamental',
 'fundamental limit',
 'limit information',
 'information transmit',
 'transmit diﬀerent',
 'diﬀerent component',
 'component biological',
 'biological system',
 'system paper',
 'paper informal',
 'informal rigorous',
 'rigorous introduction',
 'introduction idea',
 'idea implicit',
 'implicit shannon',
 'shannon theory',
 'theory annotate',
 'annotate reading',
 'reading provide',
 'provide reading',
 'reading introduction',
 'introduction claude',
 'claude shannon',
 'shanno