# Notebook for experimenting with topic modeling on PDF papers

In [1]:
import nltk
import spacy
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from spacy.lang.en import English

# Fetch the NLTK corpora used below (reported as up-to-date when already present).
nltk.download('wordnet')
nltk.download('stopwords')

# A blank English pipeline is all the tokenizer needs. The earlier
# `spacy.load('en')` call loaded a full model only to discard the result, and
# the 'en' shortcut no longer resolves on spaCy v3+.
parser = English()
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package wordnet to /Users/ellen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ellen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
import load_pdfs, read_pdf, topic_model

[nltk_data] Downloading package wordnet to /Users/ellen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ellen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Part 1 Load PDFs from folder 'papers'

In [2]:
def getFileNames(foldername='papers'):
    """List the entries of the folder that holds the PDF papers.

    Args:
        foldername: Directory to list. Defaults to 'papers', the folder this
            function previously hard-coded.

    Returns:
        A ``(foldername, file_names)`` tuple, where ``file_names`` is the list
        returned by ``os.listdir`` (bare entry names, in arbitrary order).

    Raises:
        FileNotFoundError: If ``foldername`` does not exist.
    """
    # Imported locally because no visible cell imports `os`; without it the
    # function raises NameError when called on a fresh kernel.
    import os

    file_names = os.listdir(foldername)
    return foldername, file_names

In [6]:
foldername, filenames = load_pdfs.getFileNames()

# Show every entry found in the folder. There is deliberately no try/except
# here: a failure while listing should surface instead of being silently
# swallowed.
for each in filenames:
    print(each)
    print('\n')

Hybrid Recommender Systems.pdf


Information Theory - Tutorial.pdf


Geometric Understanding of Deep Learning.pdf


1802.05968v2.pdf


Time Series Feature Extraction.pdf


An Introduction to DRL.pdf


Multitask Learning as Multiobjective Optimization.pdf


GANs.pdf


Introduction to Transfer Learning.pdf


Deep CNN Design Patterns.pdf




### Part 2 Retrieve the text from the PDF files

In [7]:
import io

from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage


def extract_text_from_pdf(pdf_path):
    """Extract the text of every extractable page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text as ``bytes`` (the converter writes into a binary
        buffer; callers in this notebook decode it as UTF-8), or ``None`` when
        no text was produced.

    Raises:
        OSError: If ``pdf_path`` cannot be opened.
    """
    # pdfminer is already a dependency of this cell; imported locally so the
    # function is self-contained.
    from pdfminer.layout import LAParams

    resource_manager = PDFResourceManager()
    with io.BytesIO() as fake_file_handle:
        # Layout analysis must be switched on: without LAParams the converter
        # emits bare characters with no word spacing or line breaks, so all the
        # words run together.
        converter = TextConverter(resource_manager,
                                  fake_file_handle,
                                  laparams=LAParams())
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(pdf_path, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()
        finally:
            # Release the converter even when a page fails to parse.
            converter.close()

    # Keep the existing contract: empty output is reported as None.
    return text if text else None

In [18]:
# Index 1 selects the second entry of the folder listing obtained earlier.
text = read_pdf.extract_text_from_pdf('/'.join((foldername, filenames[1])))

In [19]:
# Dump the raw extractor output; the b'...' prefix in the result shows it is
# a bytes object rather than str.
print(text)

b'InformationTheory:ATutorialIntroductionJamesVStone,PsychologyDepartment,UniversityofShe\xef\xac\x83eld,England.j.v.stone@she\xef\xac\x83eld.ac.ukFile:mainInformationTheoryJVStonev3.texAbstractShannon\xe2\x80\x99smathematicaltheoryofcommunicationde\xef\xac\x81nesfundamentallimitsonhowmuchinformationcanbetransmittedbetweenthedi\xef\xac\x80erentcomponentsofanyman-madeorbiologicalsystem.ThispaperisaninformalbutrigorousintroductiontothemainideasimplicitinShannon\xe2\x80\x99stheory.Anannotatedreadinglistisprovidedforfurtherreading.1IntroductionIn1948,ClaudeShannonpublishedapapercalledAMathematicalTheoryofCommunication[1].Thispaperheraldedatransformationinourunderstandingofinformation.BeforeShannon\xe2\x80\x99spaper,informationhadbeenviewedasakindofpoorlyde\xef\xac\x81nedmiasmic\xef\xac\x82uid.ButafterShannon\xe2\x80\x99spaper,itbecameapparentthatinformationisawell-de\xef\xac\x81nedand,aboveall,measurablequantity.Indeed,asnotedbyShannon,Abasicideaininformationtheoryisthatinformationcanbetre

### Part 3 Prepare the text for the topic model

In [20]:
import nltk
import spacy
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from spacy.lang.en import English

# Fetch the NLTK corpora used below (reported as up-to-date when already present).
nltk.download('wordnet')
nltk.download('stopwords')

# A blank English pipeline is all the tokenizer needs. The earlier
# `spacy.load('en')` call loaded a full model only to discard the result, and
# the 'en' shortcut no longer resolves on spaCy v3+.
parser = English()
en_stop = set(nltk.corpus.stopwords.words('english'))


def get_lemma(word):
    """Return the base form WordNet's ``morphy`` finds for ``word``.

    Falls back to ``word`` unchanged when ``morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form


def get_lemma2(word):
    """Lemmatize ``word`` with a freshly created ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)


def prepare_text_for_lda(text):
    """Convert raw text into a list of lemmatized tokens.

    The text is tokenized with ``tokenize``; tokens of four characters or
    fewer and tokens found in ``en_stop`` are dropped, and each remaining
    token is passed through ``get_lemma``.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]


def tokenize(text):
    """Split ``text`` into normalized tokens using the module-level ``parser``.

    Whitespace-only tokens are skipped, URL-like tokens are replaced by the
    placeholder 'URL', tokens starting with '@' by 'SCREEN_NAME', and every
    other token is lower-cased.
    """
    def normalize(tok):
        # Check order matters: the URL test takes precedence over the '@' test.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [normalize(tok) for tok in parser(text) if not tok.orth_.isspace()]

[nltk_data] Downloading package wordnet to /Users/ellen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ellen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# `text` is bytes, so decode it as UTF-8 before handing it to the LDA
# preprocessing step.
tokens = topic_model.prepare_text_for_lda(text.decode("utf-8"))
tokens

['informationtheory',
 'atutorialintroductionjamesvstone',
 'psychologydepartment',
 'universityofsheﬃeld',
 'england.j.v.stone@sheﬃeld.ac.ukfile',
 'maininformationtheoryjvstonev3.texabstractshannon’smathematicaltheoryofcommunicationdeﬁnesfundamentallimitsonhowmuchinformationcanbetransmittedbetweenthediﬀerentcomponentsofanyman',
 'madeorbiologicalsystem',
 'thispaperisaninformalbutrigorousintroductiontothemainideasimplicitinshannon’stheory',
 'anannotatedreadinglistisprovidedforfurtherreading.1introductionin1948,claudeshannonpublishedapapercalledamathematicaltheoryofcommunication[1].thispaperheraldedatransformationinourunderstandingofinformation',
 'beforeshannon’spaper',
 'informationhadbeenviewedasakindofpoorlydeﬁnedmiasmicﬂuid',
 'butaftershannon’spaper',
 'itbecameapparentthatinformationisawell',
 'deﬁnedand',
 'aboveall',
 'measurablequantity',
 'indeed',
 'asnotedbyshannon',
 'abasicideaininformationtheoryisthatinformationcanbetreatedverymuchlikeaphysicalquantity',
 'suchasmasso

In [None]:
# Raw tokenizer output for comparison with `tokens`.
# NOTE(review): presumably topic_model.tokenize matches the tokenize defined
# in the cell above (no length, stop-word or lemma step) — confirm.
tokens2 = topic_model.tokenize(text.decode("utf-8"))
tokens2