# Parsing PDFs into Bigrams


### Sources
For Tika, especially: https://github.com/chrismattmann/tika-python

### Jupyter Notebook Configuration
You may have to change the data rate limits in your jupyter config file (see readme in this repository)

In [179]:
# required for parsing pdf. Make sure you have java installed.  
!pip install tika



In [338]:
from collections import Counter
import csv
import pandas as pd
import numpy as np
import os
import re
import pickle
from tika import parser
import time

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\carac\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\carac\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [604]:
def determine_year(pdf_filename):
    searchObj = re.search(r'20[0-9]{2}|19[0-9]{2}', pdf_filename, re.M|re.I)
    if searchObj:
        year = searchObj.group()
    
    return int(year)

In [605]:
def determine_org(pdf_filename):
    searchObj = re.search(r'([A-Za-z\s_]+)', pdf_filename)
    if searchObj:
        org = searchObj.group()
        
    return str(org)

In [607]:
%%time

# creating empty lists to stuff data into
output_list = []
year_list = []
org_list = []

# converts PDFs to .txt files in the directory "data/" and saves the output to the "txt" directory
# identifies the org name & year using the functions from above
# stuffs the text from the parsed pdf & this info into output_list

path = "data/"
dirs = os.listdir(path)

for each_pdf in dirs:
#     print(each_pdf)
    raw = parser.from_file('data/{}'.format(each_pdf))
        
    with open('txt/{}.txt'.format(str(each_pdf)), 'wb') as f:
        f.write(raw['content'].encode("utf-8"))
        try:
            text_year = determine_year(each_pdf)
        except:
            str('NO YEAR')
        
        try:
            text_org = determine_org(each_pdf)
        except:
            str('NO ORG')
        
        year_list.append(text_year)
        org_list.append(text_org)
        
        text = raw['content']
        output_list.append([text, text_year, text_org])
        test_list = output_list[0:6]

Wall time: 1min 32s


In [608]:
# find out how many pdfs are in play
print(len(test_list))

6


In [609]:
# %%time

BIGRAM = False
TRIGRAM = True

# #works
# def preprocess(pdfs):

#     lowered = pdfs.lower()
#     fix_hyphen = re.sub(r'[\-]\W+', '', str(lowered))
#     whitespace = re.sub(r'[\W]+', ' ', str(fix_hyphen))
    
#     tokenizer = RegexpTokenizer(r'[A-z]{4,}')
#     additional_list = ['also', 'ndr', 'red', 'crescent', 'cross', 'world', 'disaster', 'report', 'chap', 'page']
#     tokenized = [word for word in tokenizer.tokenize(whitespace) if word not in additional_list]

#     lemmatizer = WordNetLemmatizer()
#     lemmed = [lemmatizer.lemmatize(word) for word in tokenized if word not in stopwords.words('english')]
    
#     bigram = list(nltk.bigrams(lemmed))

#     return bigram    

#trying
def preprocess(pdfs):

    lowered = pdfs.lower()
    fix_hyphen = re.sub(r'[\-]\W+', '', str(lowered))
    whitespace = re.sub(r'[\W]+', ' ', str(fix_hyphen))
    
    tokenizer = RegexpTokenizer(r'[A-z]{4,}')
    additional_list = ['also', 'ndr', 'red', 'crescent', 'cross', 'world', 'disaster', 'report', 'chap', 'page']
    tokenized = [word for word in tokenizer.tokenize(whitespace) if word not in additional_list]

    lemmatizer = WordNetLemmatizer()
    lemmed = [lemmatizer.lemmatize(word) for word in tokenized if word not in stopwords.words('english')]
    
    if BIGRAM:
        bigram = list(nltk.bigrams(lemmed))
        return bigram   
    elif TRIGRAM:
        trigram = list(nltk.trigrams(lemmed))
        return trigram
    else:
        return lemmed

In [610]:
#run the function

for i in test_list:
    i[0] = preprocess(i[0])
    print (i[0])




[('international', 'federation', 'society'), ('federation', 'society', 'annual'), ('society', 'annual', 'ifrc'), ('annual', 'ifrc', 'saving'), ('ifrc', 'saving', 'life'), ('saving', 'life', 'changing'), ('life', 'changing', 'mind'), ('changing', 'mind', 'international'), ('mind', 'international', 'federation'), ('international', 'federation', 'society'), ('federation', 'society', 'ifrc'), ('society', 'ifrc', 'largest'), ('ifrc', 'largest', 'volunteerbased'), ('largest', 'volunteerbased', 'humanitarian'), ('volunteerbased', 'humanitarian', 'network'), ('humanitarian', 'network', 'together'), ('network', 'together', 'member'), ('together', 'member', 'national'), ('member', 'national', 'society'), ('national', 'society', 'worldwide'), ('society', 'worldwide', 'reach'), ('worldwide', 'reach', 'million'), ('reach', 'million', 'people'), ('million', 'people', 'annually'), ('people', 'annually', 'long'), ('annually', 'long', 'term'), ('long', 'term', 'service'), ('term', 'service', 'developme

[('annual', 'crisis', 'beginning'), ('crisis', 'beginning', 'neal'), ('beginning', 'neal', 'keny'), ('neal', 'keny', 'guyer'), ('keny', 'guyer', 'chief'), ('guyer', 'chief', 'executive'), ('chief', 'executive', 'officer'), ('executive', 'officer', 'linda'), ('officer', 'linda', 'mason'), ('linda', 'mason', 'chair'), ('mason', 'chair', 'board'), ('chair', 'board', 'director'), ('board', 'director', 'message'), ('director', 'message', 'leadership'), ('message', 'leadership', 'team'), ('leadership', 'team', 'dear'), ('team', 'dear', 'friend'), ('dear', 'friend', 'mercy'), ('friend', 'mercy', 'corp'), ('mercy', 'corp', 'thanks'), ('corp', 'thanks', 'generosity'), ('thanks', 'generosity', 'mercy'), ('generosity', 'mercy', 'corp'), ('mercy', 'corp', 'helped'), ('corp', 'helped', 'people'), ('helped', 'people', 'toughest'), ('people', 'toughest', 'place'), ('toughest', 'place', 'survive'), ('place', 'survive', 'recover'), ('survive', 'recover', 'become'), ('recover', 'become', 'self'), ('beco

[('annual', 'quick', 'reaction'), ('quick', 'reaction', 'mercy'), ('reaction', 'mercy', 'corp'), ('mercy', 'corp', 'haiti'), ('corp', 'haiti', 'japan'), ('haiti', 'japan', 'sudan'), ('japan', 'sudan', 'area'), ('sudan', 'area', 'possible'), ('area', 'possible', 'folk'), ('possible', 'folk', 'like'), ('folk', 'like', 'proud'), ('like', 'proud', 'terrence'), ('proud', 'terrence', 'north'), ('terrence', 'north', 'carolina'), ('north', 'carolina', 'trust'), ('carolina', 'trust', 'mercy'), ('trust', 'mercy', 'corp'), ('mercy', 'corp', 'help'), ('corp', 'help', 'people'), ('help', 'people', 'strike'), ('people', 'strike', 'june'), ('strike', 'june', 'maine'), ('june', 'maine', 'none'), ('maine', 'none', 'work'), ('none', 'work', 'would'), ('work', 'would', 'possible'), ('would', 'possible', 'without'), ('possible', 'without', 'generosity'), ('without', 'generosity', 'thank'), ('generosity', 'thank', 'partnering'), ('thank', 'partnering', 'mercy'), ('partnering', 'mercy', 'corp'), ('mercy', '





In [611]:
# def freq(ngrams):
#     fdist = nltk.FreqDist(ngrams)
#     freq_list = list(fdist.items())
#     print (freq_list)

def freq(ngrams):
    fdist = nltk.FreqDist(ngrams)
    ngrams = list(fdist.items())
    return (ngrams)


In [612]:
for i in test_list:
    i[0] = freq(i[0])
    print (i[0])

[(('international', 'federation', 'society'), 37), (('federation', 'society', 'annual'), 30), (('society', 'annual', 'ifrc'), 2), (('annual', 'ifrc', 'saving'), 1), (('ifrc', 'saving', 'life'), 2), (('saving', 'life', 'changing'), 3), (('life', 'changing', 'mind'), 3), (('changing', 'mind', 'international'), 1), (('mind', 'international', 'federation'), 1), (('federation', 'society', 'ifrc'), 2), (('society', 'ifrc', 'largest'), 1), (('ifrc', 'largest', 'volunteerbased'), 1), (('largest', 'volunteerbased', 'humanitarian'), 1), (('volunteerbased', 'humanitarian', 'network'), 1), (('humanitarian', 'network', 'together'), 1), (('network', 'together', 'member'), 1), (('together', 'member', 'national'), 1), (('member', 'national', 'society'), 3), (('national', 'society', 'worldwide'), 2), (('society', 'worldwide', 'reach'), 1), (('worldwide', 'reach', 'million'), 1), (('reach', 'million', 'people'), 2), (('million', 'people', 'annually'), 1), (('people', 'annually', 'long'), 1), (('annually



In [503]:
# t2 = list(test_list)
# t3 = list(test_list)
# t4 = list(test_list)
# t5 = list(test_list)
# t6 = list(test_list)
# t7 = list(test_list)

In [547]:
#convert tuples to list

# for i in range(len(t2[0][0])):
#     t2[0][0][i] = list(t2[0][0][i])

# for i in range(len(t2[0][0])):
#     t2[0][0][i][0] = list(t2[0][0][i][0])

# print(test_list)


In [555]:
# print(t2[0][0][3][1])

1


In [591]:
# print(t4)

[[[[['international', 'federation', 'society'], 45], [['federation', 'society', 'annual'], 31], [['million', 'swiss', 'franc'], 31]], 2012, 'IFRC Annual Report ']]


In [613]:
set_freq = 10

# works?
# print(list(filter(lambda x: (x[0], (x[1]>=set_freq)),test)))

# from an old committed version on github
# list(filter(lambda x: x[1]>=set_freq,t2))

test_list[0][0] = list(filter(lambda x: x[1]>=set_freq,test_list[0][0]))






In [617]:
print(test_list[3])

[[(('annual', 'quick', 'reaction'), 1), (('quick', 'reaction', 'mercy'), 1), (('reaction', 'mercy', 'corp'), 1), (('mercy', 'corp', 'haiti'), 1), (('corp', 'haiti', 'japan'), 1), (('haiti', 'japan', 'sudan'), 1), (('japan', 'sudan', 'area'), 1), (('sudan', 'area', 'possible'), 1), (('area', 'possible', 'folk'), 1), (('possible', 'folk', 'like'), 1), (('folk', 'like', 'proud'), 1), (('like', 'proud', 'terrence'), 1), (('proud', 'terrence', 'north'), 1), (('terrence', 'north', 'carolina'), 1), (('north', 'carolina', 'trust'), 1), (('carolina', 'trust', 'mercy'), 1), (('trust', 'mercy', 'corp'), 1), (('mercy', 'corp', 'help'), 1), (('corp', 'help', 'people'), 1), (('help', 'people', 'strike'), 1), (('people', 'strike', 'june'), 1), (('strike', 'june', 'maine'), 1), (('june', 'maine', 'none'), 1), (('maine', 'none', 'work'), 1), (('none', 'work', 'would'), 1), (('work', 'would', 'possible'), 1), (('would', 'possible', 'without'), 1), (('possible', 'without', 'generosity'), 1), (('without',

In [614]:
#create csv based on whether or not you've created bigrams or trigrams

if BIGRAM:
    filename = "pdf_bigrams.csv"

    with open(filename, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerows(test_list)
        print('done writing pdf_bigrams.csv')
        
elif TRIGRAM:
    filename = "pdf_trigrams.csv"

    with open(filename, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerows(test_list)
        print('done writing pdf_trigrams.csv')
        
else: 
    print('did not print')

done writing pdf_trigrams.csv


In [None]:
##BAD RECYCLE

In [174]:
%%time

BIGRAMS = True
TRIGRAMS = True

for i in output_list:
    words = str(i[0])
      
#     lowered = words.lower()
    lowered = [[x.lower() for x in line] for line in words]
    fix_hyphen = re.sub(r'[\-]\W+', '', str(lowered))
    whitespace = re.sub(r'[\W]+', ' ', str(fix_hyphen))
    tokenizer = RegexpTokenizer(r'[A-z]{4,}')
    additional_list = ['also', 'ndr', 'red', 'crescent', 'cross', 'world', 'disaster', 'report', 'chap', 'page']
    tokenized = [word for word in tokenizer.tokenize(whitespace) if word not in additional_list]

    lemmatizer = WordNetLemmatizer()
    lemmed = [lemmatizer.lemmatize(word) for word in tokenized if word not in stopwords.words('english')]
    lemmed = list(lemmed)
   
#     bgs = list(nltk.bigrams(lemmed))
    bgs = nltk.bigrams(lemmed)
    i[0] = bgs

# lemmatizer = WordNetLemmatizer()
# lemmed = [lemmatizer.lemmatize(word) for word in tokenized if word not in stopwords.words('english')]

# fix_hyphen = re.sub(r'[\-]\W+', '', lowered)
# whitespace = re.sub(r'[\W]+', ' ', fix_hyphen)

# tokenizer = RegexpTokenizer(r'[A-z]{4,}')
# additional_list = ['also', 'ndr', 'red', 'crescent', 'cross', 'world', 'disaster', 'report', 'chap', 'page']
# tokenized = [word for word in tokenizer.tokenize(whitespace) if word not in additional_list]
    
# lemmatizer = WordNetLemmatizer()
# lemmed = [lemmatizer.lemmatize(word) for word in tokenized if word not in stopwords.words('english')]

# # if BIGRAMS:
# #     bigram = list(nltk.bigrams(lemmed))
# #     filtered_words = [i for i in map('_'.join, bigram)]
# # else:
# #     filtered_words = lemmed

# bgs = nltk.bigrams(lemmed)
  

Wall time: 76.8 ms
