# Parse PDFs


### Sources
https://github.com/chrismattmann/tika-python

In [None]:
# Required for parsing PDFs. tika runs on Java, so make sure a Java runtime is installed.
!pip install tika

In [None]:
from collections import Counter
import csv
import matplotlib.pyplot as plt
import multiprocessing
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 
import numpy as np
import os
import pandas as pd
import pickle
import re
from tika import parser
import time

In [None]:
def preprocess(raw, raw_parser):
    """
    Turn a document into a list of cleaned, lemmatized tokens.

    The text is lowercased, words hyphenated across a line break are
    re-joined, punctuation is replaced by spaces, and the result is
    tokenized into runs of at least 3 ASCII letters (so numbers are
    dropped). English stopwords and a few corpus-specific words are
    removed, and every remaining token is lemmatized.

    Requires the nltk data packages 'stopwords' and 'wordnet'
    (e.g. ``nltk.download('stopwords')``).

    Parameters
    ----------
    raw : str or dict
        The document: either its text, or a parser result whose text is
        stored under ``raw['content']``.
    raw_parser : bool
        True: read `raw` as a parser result (``raw['content']``)
        False: read `raw` as a string

    Returns
    -------
    list
        The lemmatized tokens, in document order, stopword filtered.
        Empty when the document has no text.

    """
    text = raw['content'] if raw_parser else raw
    # tika can report `None` as content when no text could be extracted;
    # treat that as an empty document instead of crashing on `.lower()`.
    lowered = (text or '').lower()

    # Re-join words split across a line break ("inter-\nnational"). This has
    # to happen BEFORE punctuation is stripped, otherwise the hyphen is
    # already gone and the two halves stay separate tokens.
    dehyphenated = re.sub(r'-\s*\n\s*', '', lowered)
    cleaned = re.sub(r'\W+', ' ', dehyphenated)

    # `[a-z]`, not `[A-z]`: the latter also matches `[ \ ] ^ _` and the
    # backtick, which let tokens such as "foo_bar" or "___" through.
    tokenizer = RegexpTokenizer(r'[a-z]{3,}')

    # Build the exclusion set once; looking each token up in a freshly built
    # stopword list is needlessly slow on long documents.
    excluded = set(stopwords.words('english'))
    excluded.update(['also', 'red', 'cross', 'international', 'federation', 'ndr'])

    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word)
        for word in tokenizer.tokenize(cleaned)
        if word not in excluded
    ]

In [None]:
def word_counter(tokenized, pdf_name, output_list):
    """
    Count the tokens of one document and record the counts.

    One dictionary per unique token is appended to `output_list`,
    most frequent token first. `output_list` is modified in place.

    Parameters
    ----------
    tokenized : list
        The tokens of a single document.
    pdf_name : string
        Name of the source PDF the tokens come from.
    output_list : list
        The list that receives the records. Each record is formatted
        like the following:
            {"word": string, "count": int, "source": string}

    Returns
    -------
    Nothing

    """
    frequencies = Counter(tokenized)
    output_list.extend(
        {"word": token, "count": occurrences, "source": pdf_name}
        for token, occurrences in frequencies.most_common()
    )

In [None]:
def parse_year(item):
    """
    Parse a four-digit year out of a file name.

    The name is searched for a year resembling "20[0-9]{2}" first and,
    only if none is found, for one resembling "19[0-9]{2}". The first
    match of the winning pattern is returned.

    Parameters
    ----------
    item : string
        A file name. This is passed in from the `source` column of the
        pandas DataFrame created from 'output_list'.

    Returns
    -------
    int
        The parsed year.

    Raises
    ------
    ValueError
        If `item` contains neither a 20xx nor a 19xx year.

    """
    # `re.search` returns None on a miss (it does not raise), so the fallback
    # to 19xx has to be an explicit second attempt rather than an `except`.
    for pattern in (r'20[0-9]{2}', r'19[0-9]{2}'):
        found = re.search(pattern, item)
        if found:
            return int(found.group())
    raise ValueError("no year (19xx or 20xx) found in {!r}".format(item))

In [None]:
%%time

"""
Build the word-count DataFrame `df` for the PDF files housed in the
'data' folder.

If LOAD_OUTPUT_LIST is True and a pickled 'output_list' exists, it is
loaded and no document is processed again. If the pickle is not found
(or LOAD_OUTPUT_LIST is False), 'output_list' is generated and pickled:

If OUTPUT_AS_TXT == True, reads raw PDFs into txt, and saves it into
the 'txt' directory.
If OUTPUT_AS_TXT == False, reads from the txt files saved earlier.

Whenever 'output_list' is generated, the functions 'preprocess' and
'word_counter' are executed on every document.

Finally, create the dataframe from the list 'output_list', apply the
function 'parse_year' to df['source'] and store the result as df['year'].

The resulting df has the columns count, source, word, year.
Count is the count of the word, source is the PDF the word appeared in,
word is the token, year is the year of PDF publication.
"""

LOAD_OUTPUT_LIST = True
PICKLE_PATH = 'model/output_list_pickled'

if LOAD_OUTPUT_LIST and os.path.exists(PICKLE_PATH):
    # NOTE: unpickling can execute arbitrary code; only load a pickle that
    # was written by this notebook.
    with open(PICKLE_PATH, 'rb') as f:
        output_list = pickle.load(f)
else:
    OUTPUT_AS_TXT = False
    output_list = []
    path = "data/"
    dirs = os.listdir(path)
    if OUTPUT_AS_TXT:
        # The extracted text is written below; make sure its folder exists.
        os.makedirs('txt', exist_ok=True)
    for each_pdf in dirs:
        print(each_pdf)
        txt_path = 'txt/{}.txt'.format(str(each_pdf))
        if OUTPUT_AS_TXT:
            print("using raw pdf")
            raw = parser.from_file(os.path.join(path, each_pdf))
            # tika can report `None` as content when no text was extracted.
            text = raw['content'] or ''
            with open(txt_path, 'wb') as f:
                f.write(text.encode("utf-8"))
        else:
            with open(txt_path, 'r', encoding='utf8') as f:
                text = f.read()
        # From here on both modes hold the document as a plain string.
        print("\tpreprocessing")
        tokenized_pdf = preprocess(text, False)
        print("\tcounting")
        word_counter(tokenized = tokenized_pdf, output_list = output_list, pdf_name = each_pdf)
    os.makedirs(os.path.dirname(PICKLE_PATH), exist_ok=True)
    with open(PICKLE_PATH, 'wb') as f:
        pickle.dump(output_list, f)

df = pd.DataFrame(output_list)
df['year'] = df['source'].apply(parse_year)

# Count freq of words

In [None]:
# Show every row whose token is 'www' (one row per source document).
is_www = df['word'] == 'www'
df[is_www]

### Total freq words

In [None]:
# Total occurrences of each word across all documents.
# Only `count` is summed: adding up `year` is meaningless, and on
# pandas >= 2.0 summing the whole frame also concatenates the `source` strings.
df_freq = df.groupby('word')[['count']].sum()
# Ten most frequent words overall.
df_freq.sort_values(by='count', ascending=False).head(10)

In [None]:
# Grand total of all counted tokens over every word.
total_words = df_freq['count'].sum()
print("total number of words: {}".format(total_words))

In [None]:
# Distinct publication years found in the data, in ascending order.
distinct_years = df['year'].unique()
np.sort(distinct_years)

### By year

In [None]:
# Empty frame with the displayed columns; it is not filled in this cell.
df_slice = pd.DataFrame(columns=['count','word','year'])

# For every year (in order of first appearance) print the ten rows with the
# highest count. Rows are per word and source document.
shown_columns = ['count','word','year']
for year_value in df['year'].unique():
    rows_of_year = df[df['year'] == year_value][shown_columns]
    top_rows = rows_of_year.sort_values(by=['count'], ascending=False).head(10)
    print("{}\n\n".format(top_rows))

In [None]:
# Number of rows of `df` per year, most frequent year first.
# NOTE(review): value_counts counts rows (one per word and source), not the
# sum of the `count` column - confirm this matches the printed label.
print("Total word counts by year")
rows_per_year = df['year'].value_counts()
rows_per_year

### Export as csv

In [None]:
# Write the full table (index included) to a CSV file in the working directory.
csv_path = 'top_words.csv'
df.to_csv(csv_path)