# Parsing PDFs 

Pull in all of the PDF files and create objects for the text inside each one. 

This also probably should include a spell check

[One of the sources I am using for the topic modeling](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/)
[This is a good post on Lemmatizing in python](https://www.machinelearningplus.com/nlp/lemmatization-examples-python/)

In [1]:
import string 

In [2]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [3]:
import PyPDF2 
from glob import glob

pdfs = glob('../pdfs/*.pdf') 

In [4]:
pdfs

['../pdfs/Cruz - 2010 - Chapter Six. Expanding The View The Challenges Of.pdf',
 '../pdfs/Cruz - 2010 - Index.pdf',
 '../pdfs/Frederiks and Nagy - 2016 - Religion, migration, and identity methodological .pdf',
 '../pdfs/Cruz - 2010 - Introduction.pdf',
 '../pdfs/Cruz - 2010 - Chapter One. Geographies Of Domestication Mapping.pdf',
 '../pdfs/cruz2010.pdf',
 '../pdfs/Cruz - 2010 - Chapter Two. Frontiers Of Struggle Negotiating Fi.pdf',
 '../pdfs/Cruz - 2010 - Conclusion.pdf',
 '../pdfs/Cruz - 2010 - Chapter Four. Exploring Theological Markers Delor.pdf',
 '../pdfs/Cruz - 2010 - An intercultural theology of migration pilgrims i.pdf',
 "../pdfs/Nguyen and Prior - 2014 - God's people on the move biblical and global pers.pdf",
 '../pdfs/Cruz - 2010 - Chapter Three. Expanding The Boundaries Theologic.pdf',
 '../pdfs/Gods People on the Move 2.pdf',
 '../pdfs/Cruz - 2010 - Chapter Five. A Different Cartography Mapping The.pdf',
 '../pdfs/Izuzquiza - 2011 - Breaking bread notes for a political t

# Preparing Texts 

## Lemmatizing and cleaning 

In [5]:
import string
import nltk 
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as STOPWORDS 

PUNCDIG_TRANSLATOR = str.maketrans('', '', string.punctuation+string.digits)

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sgoodwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sgoodwin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
def text_clean(text):
    clean_list = []
    words = nltk.word_tokenize(text)
    for w in words:
        if w not in STOPWORDS:
            w = w.translate(PUNCDIG_TRANSLATOR)
            clean_list.append(lemmatizer.lemmatize(w))
    return clean_list

In [7]:
text_clean("The striped bats are hanging on their feet3 for best.".lower())

['striped', 'bat', 'hanging', 'foot', 'best', '']

## Extracting Texts

In [8]:
pdf = open(pdfs[0], 'rb')
pdf_obj = PyPDF2.PdfFileReader(pdf)


In [9]:
print('No. of pages: {}'.format(pdf_obj.numPages))

No. of pages: 35


In [10]:
def pdf_extractor(pdf, corpus_list, text_list):
    '''Extract the text of pdfs and return a dictionary with
    the file name as a key, and the value being a list of the pages
    and the containing texts
    '''
    pdf_file_obj = open(pdf, 'rb')
    pdf_obj = PyPDF2.PdfFileReader(pdf_file_obj)
    for pn in range(0,pdf_obj.numPages):
        page = pdf_obj.getPage(pn)
        text = page.extractText().lower()
        cleaned_list = text_clean(text)
        corpus_list.append(cleaned_list)
        # corpus_list.append(page.extractText())
        text_list.append((pdf, pn))
        # if you want to create a dictionary
        # text_dict.setdefault(pdf, []).append(page.extractText())
    pdf_file_obj.close()
    return corpus_list, text_list

In [11]:
corpus_list = []
text_list = []

for pdf in pdfs:
    corpus_list, text_list = pdf_extractor(pdf, corpus_list, text_list)



# Creating LDA Model 



In [12]:
from gensim import corpora 
from gensim.models.ldamodel import LdaModel 

In [13]:
def prepare_topic_model(corpus_list):
    corpus_dict = corpora.Dictionary(corpus_list)
    corpus_dict.filter_extremes(no_below=100, no_above=0.5)
    corpus = [corpus_dict.doc2bow(text) for text in corpus_list]
    lda_model = LdaModel(corpus=corpus, 
                        id2word=corpus_dict, num_topics=25,
                        random_state=100, update_every=1,
                        chunksize=100, passes=50,
                        alpha='symmetric', per_word_topics=True)
    return lda_model, corpus, corpus_dict

In [14]:
lda_model, corpus, corpus_dict = prepare_topic_model(corpus_list)

# Visualizing LDA Model

In [15]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [38]:
pyLDAvis.enable_notebook()
# The sort_topics=False makes the topic model numbers agree [+1] with the topic model from gensim
# Gensim's topic numbers' are zero indexed, and the vis index is 1 indexed
vis = pyLDAvis.gensim.prepare(lda_model, corpus, corpus_dict, sort_topics=False, mds='mmds')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

 [_prepare.py:257]


In [43]:
vis

In [44]:
pyLDAvis.save_html(vis, 'PrelimTopicModel.html')

# Find the Dominant Document For Each Topic 


In [18]:
import pandas as pd

In [19]:
# this creates a pandas DataFrame that orders all of the topics and shows the dominant topic for each document
def format_topics_sent(ldamodel, corpus, texts):
    sent_topics_df = pd.DataFrame()
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: x[1], reverse=True)
        
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_topic', 'Perc_Contrib', 'Topic_Keywords']
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

## Exploring the Dominant Topic Models

In [20]:
def format_topics_sent_gen(ldamodel, corpus, texts):
    for i, row in enumerate(ldamodel[corpus]):
        yield row

In [21]:
genny = format_topics_sent_gen(lda_model, corpus, corpus_list)

In [22]:
row = next(genny)

In [23]:
len(row)

3

In [42]:
lda_model.show_topic(13)

[('”', 0.2966608),
 ('“', 0.2558419),
 ('’', 0.038733605),
 ('–', 0.03453399),
 ('vol', 0.03118087),
 ('e', 0.030572623),
 ('dhs', 0.028041013),
 ('s', 0.021423323),
 ('tulud', 0.015217608),
 ('cruz', 0.014499779)]

In [25]:
sent_topics_df = format_topics_sent(lda_model, corpus, text_list)

In [26]:
grpd_df = sent_topics_df.groupby('Dominant_topic')

In [27]:
# This code creates a pandas DataFrame that shows which document is exemplified by which topic
sent_topics_df = pd.DataFrame()

for i, grp in grpd_df:
    sent_topics_df = pd.concat([sent_topics_df, grp.sort_values(['Perc_Contrib'], ascending=[0]).head(1)], axis=0)

sent_topics_df.reset_index(drop=True, inplace=True)
sent_topics_df.columns = ['Topic_Num', 'Topic_Perc_Contrib', 'Keywords', 'Text']
sent_topics_df

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.5008,"migration, context, migrant, study, experience...","(../pdfs/Frederiks and Nagy - 2016 - Religion,..."
1,1.0,0.3923,"immigrant, american, case, service, sense, gro...","(../pdfs/Frederiks and Nagy - 2016 - Religion,..."
2,2.0,0.53,"new, ed, press, york, study, mission, book, ch...","(../pdfs/Frederiks and Nagy - 2016 - Religion,..."
3,3.0,0.7007,"–, people, religion, dhs, order, come, life, e...","(../pdfs/Cruz - 2010 - Index.pdf, 2)"
4,4.0,0.4219,"’, s, god, experience, e, challenge, g, cruz, ...",(../pdfs/Cruz - 2010 - Chapter Four. Exploring...
5,5.0,0.3031,"economic, family, home, role, migration, er, t...","(../pdfs/Frederiks and Nagy - 2016 - Religion,..."
6,6.0,0.808,"nagy, dorottya, frederiks, martha, christian, ...","(../pdfs/Frederiks and Nagy - 2016 - Religion,..."
7,7.0,0.4001,"right, power, human, political, world, come, s...","(../pdfs/Frederiks and Nagy - 2016 - Religion,..."
8,8.0,0.4924,"religion, identity, religious, culture, cultur...",(../pdfs/Cruz - 2010 - Chapter Six. Expanding ...
9,9.0,0.3283,"struggle, filipino, philippine, eology, book, ...",(../pdfs/Cruz - 2010 - Preliminary Material.pd...


In [28]:
sent_topics_df.iloc[0]['Text']

('../pdfs/Frederiks and Nagy - 2016 - Religion, migration, and identity methodological .pdf',
 63)

In [29]:
lda_model.show_topic(0)

[('migration', 0.5920657),
 ('context', 0.12714523),
 ('migrant', 0.0677818),
 ('study', 0.05909739),
 ('experience', 0.037804633),
 ('tion', 0.024900865),
 ('community', 0.021318907),
 ('challenge', 0.017123718),
 ('human', 0.014246021),
 ('culture', 0.012124161)]