# Parsing PDFs 

Pull in all of the PDF files and create objects for the text inside each one. 

This also probably should include a spell check

In [1]:
import PyPDF2 
from glob import glob

pdfs = glob('../pdfs/*.pdf') 

In [2]:
pdfs

['../pdfs/Cruz - 2010 - Chapter Six. Expanding The View The Challenges Of.pdf',
 '../pdfs/Cruz - 2010 - Index.pdf',
 '../pdfs/Frederiks and Nagy - 2016 - Religion, migration, and identity methodological .pdf',
 '../pdfs/Cruz - 2010 - Introduction.pdf',
 '../pdfs/Cruz - 2010 - Chapter One. Geographies Of Domestication Mapping.pdf',
 '../pdfs/cruz2010.pdf',
 '../pdfs/Cruz - 2010 - Chapter Two. Frontiers Of Struggle Negotiating Fi.pdf',
 '../pdfs/Cruz - 2010 - Conclusion.pdf',
 '../pdfs/Cruz - 2010 - Chapter Four. Exploring Theological Markers Delor.pdf',
 '../pdfs/Cruz - 2010 - An intercultural theology of migration pilgrims i.pdf',
 "../pdfs/Nguyen and Prior - 2014 - God's people on the move biblical and global pers.pdf",
 '../pdfs/Cruz - 2010 - Chapter Three. Expanding The Boundaries Theologic.pdf',
 '../pdfs/Gods People on the Move 2.pdf',
 '../pdfs/Cruz - 2010 - Chapter Five. A Different Cartography Mapping The.pdf',
 '../pdfs/Izuzquiza - 2011 - Breaking bread notes for a political t

# Preparing Texts 

## Lemmatizing and cleaning 

In [96]:
import nltk 
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as STOPWORDS 

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sgoodwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sgoodwin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [82]:
def text_clean(text):
    clean_list = []
    words = nltk.word_tokenize(text)
    for w in words:
        if w not in STOPWORDS:
            clean_list.append(lemmatizer.lemmatize(w))
    return clean_list

In [85]:
text_clean("The striped bats are hanging on their feet for best".lower())

['striped', 'bat', 'hanging', 'foot', 'best']

## Extracting Texts

In [17]:
pdf = open(pdfs[0], 'rb')
pdf_obj = PyPDF2.PdfFileReader(pdf)


In [18]:
print('No. of pages: {}'.format(pdf_obj.numPages))

No. of pages: 35


In [86]:
def pdf_extractor(pdf, corpus_list, text_list):
    '''Extract the text of pdfs and return a dictionary with
    the file name as a key, and the value being a list of the pages
    and the containing texts
    '''
    pdf_file_obj = open(pdf, 'rb')
    pdf_obj = PyPDF2.PdfFileReader(pdf_file_obj)
    for pn in range(0,pdf_obj.numPages):
        page = pdf_obj.getPage(pn)
        text = page.extractText().lower()
        cleaned_list = text_clean(text)
        corpus_list.append(cleaned_list)
        # corpus_list.append(page.extractText())
        text_list.append((pdf, pn))
        # if you want to create a dictionary
        # text_dict.setdefault(pdf, []).append(page.extractText())
    pdf_file_obj.close()
    return corpus_list, text_list

In [88]:
corpus_list = []
text_list = []

for pdf in pdfs:
    corpus_list, text_list = pdf_extractor(pdf, corpus_list, text_list)



# Creating LDA Model 



In [57]:
from gensim import corpora 
from gensim.models.ldamodel import LdaModel 

In [44]:
text = text_dict['../pdfs/Cruz - 2010 - Chapter Six. Expanding The View The Challenges Of.pdf']

In [103]:
def prepare_topic_model(corpus_list):
    corpus_dict = corpora.Dictionary(corpus_list)
    corpus_dict.filter_extremes(no_below=100, no_above=0.5)
    corpus = [corpus_dict.doc2bow(text) for text in corpus_list]
    lda_model = LdaModel(corpus=corpus, 
                        id2word=corpus_dict, num_topics=25,
                        random_state=100, update_every=1,
                        chunksize=100, passes=50,
                        alpha='symmetric', per_word_topics=True)
    return lda_model, corpus, corpus_dict

In [104]:
lda_model, corpus, corpus_dict = prepare_topic_model(corpus_list)

# Visualizing LDA Model

In [110]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [116]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, corpus_dict, mds='mmds')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

 [_prepare.py:257]


In [118]:
vis