# Notebook to experiment with topic model

# Imports

In [72]:
### Local functions
import topic_model

### Libraries
import os
import re
import string

## Spacy
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()

## nltk
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package wordnet to /Users/ellen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ellen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Part 1 Load PDFs from folder 'papers'

In [15]:
def get_file_names(folder='papers'):
    """
    
    :param folder: string, name of folder where the papers are contained
    :return: string foldername, list filenames
    """
    if not os.path.exists(folder):
        os.makedirs(folder)
    file_names = os.listdir(folder)
    return folder, file_names


foldername, filenames = getFileNames('papers')

filenames


['Hybrid Recommender Systems.pdf',
 'Information Theory - Tutorial.pdf',
 'Geometric Understanding of Deep Learning.pdf',
 '1802.05968v2.pdf',
 'Time Series Feature Extraction.pdf',
 'An Introduction to DRL.pdf',
 'Multitask Learning as Multiobjective Optimization.pdf',
 'GANs.pdf',
 'Introduction to Transfer Learning.pdf',
 'Deep CNN Design Patterns.pdf']

# Part 2 Retrieve the text from the PDF files

We use pdf miner to retrieve the text from the filenames in our 'papers' folder

In [23]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, TextConverter
from pdfminer.layout import LAParams
import io

# TODO check if this works for multiple papers too
# TODO e.g. is it possible to create a list of tokens for each? 
# TODO what is the best way to do this?
# TODO is this already sth that should be returned in the end?


def pdfparser(filename):
    """
    
    :param data: filename (string)
    :return: text data (string), the complete text in the document(s)
    """
    fp = open(filename, 'rb')
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.

    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        data = retstr.getvalue()
    return data


In [178]:
text = pdfparser(foldername+'/'+filenames[0])  # example, just for the first paper

# TODO cleansing operations directly on raw text

text= text.replace("\n"," ")
text = re.sub(r'\d+', '', text)
text = text.translate(str.maketrans("","", string.punctuation))
text = text.strip()
text = re.sub(r'\b\w{1,3}\b', '', text)
# text = [tok for tok in text if tok not in STOPLIST and tok not in punctuations]
# text = ' '.join(text)

In [179]:
text[0:600]

'Hybrid Recommender Systems  Systematic Literature Review  Department  Control  Computer Engineering Politecnico  Torino Corso Duca degli Abruzzi    Torino  Erion ¸  Maurizio Morisio  Abstract Recommender systems  software tools used  generate  provide suggestions  items  other entities   users  exploiting various strategies Hybrid recommender systems combine   more recommendation strategies  different ways  beneﬁt from their  plementary advantages This systematic literature review presents  state     hybrid recommender systems   last decade    ﬁrst quantitative review work completely  cused  h'

# Part 3 Quick data exploration: check of top x words with nltk freqdist

In [180]:
tokens = topic_model.prepare_text_for_lda(text)

In [181]:
# Calculate frequency distribution
fdist = nltk.FreqDist(tokens)

# Output top 20 words

for word, frequency in fdist.most_common(20):
    print(u'{};{}'.format(word, frequency))

hybrid;191
system;181
recommendation;167
study;152
recommender;123
author;78
technique;75
problem;68
item;61
conference;59
different;58
domain;58
base;57
accuracy;53
international;53
user;52
using;47
result;47
evaluation;46
information;46


# Part 4 Preprocessing: remove punctuation, newline chars, stopwords etc

In [182]:
nlp = spacy.load('en')

# text is a string of all the separate tokens
print("The paper contains " + str(len(text)) + " tokens in total")
print("Here are the tokens at position 400-420: \n\n '" + str(text[400:420]) + " ' \n")

# create spacy tokens
tokens = nlp(text)
print("After conversion to " + str(type(tokens)) + " the paper contains " + str(len(tokens)) + " tokens")
print("Here are the tokens at position 400-420:  \n\n'" + str(tokens[400:420]) + "'")

The paper contains 96995 tokens in total
Here are the tokens at position 400-420: 

 'rent ways  beneﬁt fr ' 



After conversion to <class 'spacy.tokens.doc.Doc'> the paper contains 15848 tokens
Here are the tokens at position 400-420:  

'problem Recommender Systems    such tools that emerged     They  commonly deﬁned  software tools  techniques used  '


## 4a) Remove stopwords and punctuations

In [183]:
from nltk.corpus import stopwords
STOPLIST = set(stopwords.words('english'))
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”"]
punctuations = string.punctuation


def clean_up(tokens):
    clean_tokens = []
    for each in tokens:
        if each.is_stop == False:
            if each.is_punct == False:
                clean_tokens.append(each)
    final_tokens = [tok.lemma_.lower().strip() for tok in clean_tokens if tok.lemma_ != '-PRON-']
    final_tokens = [tok for tok in final_tokens if tok not in STOPLIST and tok not in punctuations]
    final_tokens = ' '.join(final_tokens)
    return final_tokens

def further_clean_up(tokens):
    tokens = re.sub(r'\d+', '', tokens)
    tokens = tokens.translate(str.maketrans("","", string.punctuation))
    tokens = tokens.strip()
    tokens = re.sub(r'\b\w{1,3}\b', '', tokens)
    return tokens


# apply the clean_up functions to the spacy nlp tokens
clean_tokens = clean_up(tokens)
clean_tokens = further_clean_up(clean_tokens)
clean_tokens = clean_tokens.strip()
clean_tokens[400:800]


'w present state hybrid recommender system decade ﬁrst quantitative review work completely cuse hybrid recommender address relevant problem consider present associate datum mining recommendation technique overcome explore hybridization class hybrid recommender belong application domain evaluation process propose future research direction base ﬁnding study combine collaborative ﬁltering technique we'

In [184]:
# unfortunately this returns a string again so we need to apply nlp again 

final_tokens = nlp(clean_tokens)
final_tokens[400:600]

collect demo graphic information need online privacy concern limit utilization combine recommender reinforce technique well quality knowledgebased filtering  knowledge user item reason item meet user requirement generate recommendation accordingly special type kbfs constraintbase capable recommend complex item rarely   house manifest important constrain user price possible successfully domain item usersystem interaction datum available people rarely house early recommender system tapestry manual mail system ﬁrst computerize prototype apply collaborative ﬁltering approach emerge grouplens recommendation engine ﬁnde news article author present detailed analysis evaluation bellcore video recommender algorithm implementation embed mosaic browser interface ringo taste similarity provide personalized music recommendation prototype like newsweeder infofinder recommend news document base item attribute late important commercial prototype come amazoncom recommender popular researcher start comb

## 4b) POS tagging

In [185]:
# for token in doc:
#     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#             token.shape_, token.is_alpha, token.is_stop)

for token in final_tokens:
    # print(token.pos_)
    pass

for word in final_tokens[300:330]:  
    print(word.text,  word.pos_)

for noun in final_tokens[300:330].noun_chunks:  
    # print(noun.text)
    pass

basic ADJ
assumption NOUN
people NOUN
similar ADJ
taste NOUN
past ADP
similar ADJ
taste NOUN
future ADJ
early ADJ
deﬁnition NOUN
collaboration NOUN
people NOUN
help VERB
perform VERB
ﬁltere NOUN
record NOUN
reaction NOUN
document NOUN
read VERB
approach NOUN
  SPACE
rating NOUN
form NOUN
user NOUN
generate VERB
feedback NOUN
spot NOUN
taste NOUN
commonality NOUN


## 4c) Lemmatization

Now that we are finished with the data cleaning, we can apply lemmatization to the final tokens

Lemmatization is sufficient for our use case, we do not apply stemming

In [186]:
# TODO remove empty values

def lemmatizer(tokens):
    spacy_lemmas = []
    for word in tokens:
        spacy_lemmas.append(word.lemma_)
    return spacy_lemmas
 
   
lemmatized_tokens = lemmatizer(final_tokens)

print(lemmatized_tokens[600:800])

['form', 'hybrid', 'present', 'hybrid', 'prototype', 'fall', 'hybridization', 'class', 'taxonomy', 'early', 'exploratory', 'work', 'experiment', 'combine', 'personalize', 'agent', 'opinion', 'community', 'member', 'framework', 'conduct', 'conclude', 'combination', 'produce', 'highquality', 'recommendation', 'good', 'result', 'achieve', 'large', 'datum', 'user', 'community', 'review', 'work', 'generic', 'address', 'general', 'focus', 'type', 'reﬂect', 'increase', ' ', 'ﬁeld', 'quantitative', 'term', 'author', 'perform', 'review', 'work', 'journal', 'conference', 'publication', 'peak', 'publication', 'period', 'work', 'consider', 'onethird', 'analyze', 'period', 'emphasize', 'fact', 'current', 'hybrid', 'incorporate', 'location', 'information', 'exist', 'recom', 'mendation', 'algorithm', 'highlight', 'proper', 'combination', 'exist', 'method', 'different', 'form', 'datum', 'evaluate', 'characteristic', 'diversity', 'novelty', 'accuracy', 'future', 'trend', 'author', 'review', 'recommende

In [187]:
# from nltk.stem.snowball import SnowballStemmer
# 
# stemmer = SnowballStemmer(language='english')
# 
# # test_tokens = ['compute', 'computer', 'computed', 'computing']
# # document = sp(spacy_tokens[200:300])
# id_sequence = map(lam'bda x: x.orth, [token for token in spacy_tokens[200:300]])
# text = map(lambda x: sp.vocab[x].text, [id for id in id_sequence])
# 
# for token in text:  
#     print(token + ' --> ' + stemmer.stem(token))

# 5) Analyze ngrams

In [188]:
import nltk
from nltk.util import ngrams

def word_grams(words, number):
    s = []
    for ngram in ngrams(words, number):
            s.append(' '.join(str(i) for i in ngram))
    return s

In [190]:
bigrams = word_grams(lemmatized_tokens, 2)
from collections import Counter
count_grams = Counter(bigrams)
count_grams.most_common(20)

[('recommender system', 89),
 ('hybrid recommender', 34),
 ('international conference', 30),
 ('ieee ieee', 23),
 ('ieee international', 20),
 ('datum sparsity', 19),
 ('hybrid recommendation', 19),
 ('datum mining', 18),
 ('future work', 18),
 ('find study', 18),
 ('recommendation strategy', 17),
 ('application domain', 17),
 ('recommendation technique', 16),
 ('user proﬁle', 16),
 ('research question', 16),
 ('hybrid approach', 15),
 ('hybridization class', 14),
 ('science direct', 14),
 ('association rule', 14),
 ('international journal', 14)]

# 6) Creating a model to compare similarities of documents