# Preprocess companies pdf file reports

- Count the number of words in a text, using `str.isalpha()` to keep only purely alphabetic tokens
- Tokenize sentences with the use of **NLTK** tokenize
- Group all sentences together as one row (one doc) of the original data frame of the extracted PDF pages 
- Remove all special characters from the text with the use of **regular expressions**
- Convert all strings in the data frame to lowercase with the use of str.lower() from pandas

# Imports

In [1]:
# standard imports
import os
import numpy as np
import pandas as pd
import nltk

# text processing
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import fitz
import re, string

# other useful imports
from importlib import reload
from time import time
from pathlib import Path
from tqdm.notebook import tqdm


#ignore warnings
import warnings
warnings.filterwarnings(action='ignore')

## Data Paths

In [2]:
# input locations
DATA_PATH = 'data/reports'  # directory that is scanned for the company PDF reports
SDG_PATH = 'data/sdg.csv'  # CSV file with the SDG sentences

# output locations, written in the export step at the end of the notebook
FILE_NAME_SENT = 'data/sentences.csv'  # each extracted sentence in a separate row
FILE_NAME_DOC = 'data/doc_df.csv'  # all text of each PDF file in one row
FILE_SDG_GROUPED = 'data/df_sdg_doc.csv'  # SDG sentences grouped by theme
FILE_SDG = 'data/df_sdg.csv'  # cleaned SDG sentences

# Helper Functions

### Get Count of Words

In [3]:
def word_count(text):
    """ Function that splits the text into words and counts the number of words in the text, counting only isalpha() 
    as words, i.e. words containing made up of letters only."""

    # split the text into words
    words = text.split()
    # count only words that are made up of letters
    cnt = sum(1 for word in words if word.isalpha())
    
    return cnt

In [4]:
# test word_count function: only 'This', 'is' and 'a' consist of letters only,
# so the expected result is 3
word_count('This is a 123-test, 1234?!')

3

## Extract Text  from PDF

Tokenize the text into sentences using the [NLTK tokenizer](https://www.nltk.org/api/nltk.tokenize.html). The functions below loop over the pages of a PDF, join the text blocks of each page into one string, split that string into sentences and collect the sentences in a list.

In [5]:
def generate_text_blocks(block_list, min_word_count=15):
    """Filter the blocks extracted from a pdf page and join them into one string.

    A block is expected to be a sequence whose 5th element (index 4) holds the
    text and whose 7th element (index 6) holds the block type, where 0 means
    "text". Blocks of any other type are ignored, and so are text blocks with
    fewer than ``min_word_count`` alphabetic words (counted by ``word_count``).

    Returns the kept blocks joined with a newline; inside a block every line
    break is replaced by a space.
    """
    kept_blocks = []

    for block in block_list:
        # block type 0 = text; everything else is skipped
        if block[6] == 0:
            # drop every occurrence of the hard-coded words 'Johnson' and 'Amazon'
            content = block[4].replace('Johnson', '').replace('Amazon', '')

            # keep only blocks that reach the minimum number of words
            if word_count(content) >= min_word_count:
                # turn the line breaks inside the block into spaces
                # (hyphens at the end of a line are left untouched)
                kept_blocks.append(content.replace('\n', ' '))

    # one line per kept block
    return '\n'.join(kept_blocks)


def create_sentence_list(file):
    """Extract the text of a pdf file and split it into sentences.

    Every page is read as a list of blocks, the blocks are filtered and joined
    by ``generate_text_blocks`` and the resulting page text is split into
    sentences with the NLTK sentence tokenizer.

    Args:
        file: Path of the pdf file to open.

    Returns:
        A list with the sentences of all pages, in page order.
    """
    sent_list = []

    # the context manager closes the document even if extracting or
    # tokenizing a page raises an exception
    with fitz.open(file) as doc:
        for page in doc:
            # extract the unsorted page text as a list of blocks, each one being
            # (x0, y0, x1, y1, "lines in block", block_no, block_type);
            # sort=True would order the blocks top-left to bottom-right, see
            # https://pymupdf.readthedocs.io/en/latest/app1.html#plain-text
            block_list = page.get_text('blocks', sort=False)
            text = generate_text_blocks(block_list)

            # tokenize the page text into sentences and collect them
            sent_list.extend(nltk.tokenize.sent_tokenize(text))

    return sent_list

In [6]:
# test create_sentence_list function on a single report
# note: generate_text_blocks strips the word 'Amazon' from the text, which is
# why the first sentence of the output starts with ' seeks to be ...'
file = 'data/reports/Amazon.pdf'
create_sentence_list(file)

[' seeks to be Earth’s most customer-centric  company, Earth’s best employer, and Earth’s safest place  to work.',
 'We are passionate builders guided by four  principles: customer obsession rather than competitor  focus, passion for invention, commitment to operational  excellence, and long-term thinking.',
 'In each of our segments we serve our primary customer  sets, consisting of consumers, sellers, developers,  enterprises, and content creators.',
 'In addition, we provide  services, such as advertising to sellers, vendors, publishers,  authors, and others, through programs such as sponsored  ads, display, and video advertising.',
 'We have organized  our operations into three segments: North America,  International, and  Web Services (AWS).',
 'We design our stores to enable hundreds of millions of  unique products to be sold by us and by third parties  across dozens of product categories.',
 'Customers access  our offerings through our websites, mobile apps, Alexa,  devices, str

### Read PDFs and create data frame

In [7]:
def create_df(path):
    """Read all pdf files of a directory into a sentence-level data frame.

    Args:
        path: Directory that contains the pdf reports.

    Returns:
        A data frame with the columns ``doc_id``, ``file_name`` and
        ``sentence`` holding one row per extracted sentence. ``doc_id`` is a
        running number starting at 1, assigned in the order in which
        ``os.listdir`` returns the pdf files. The row index restarts at 0 for
        every document. If the directory contains no pdf file, the returned
        frame is empty but still has the three columns.
    """
    columns = ['doc_id', 'file_name', 'sentence']

    # one frame per document; they are concatenated once at the end instead of
    # copying the growing frame on every iteration
    frames = []

    # running number used to assign a unique id to each document
    doc_id = 0

    # iterate over the listing itself so that tqdm knows the total
    for pdf in tqdm(os.listdir(path)):
        # make sure it's a pdf file; the check looks at the extension of the
        # file name only and ignores upper/lower case
        if os.path.splitext(pdf)[1].lower() != '.pdf':
            continue

        # increase id
        doc_id += 1

        # list of tokenized sentences extracted from the pdf
        sent_list = create_sentence_list(os.path.join(path, pdf))

        # doc_id and file_name are repeated for every sentence of the document
        frames.append(pd.DataFrame({
            'doc_id': doc_id,
            'file_name': pdf,
            'sentence': sent_list,
        }))

    # pd.concat raises on an empty list, so handle "no pdf found" explicitly
    if not frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(frames)

### Create new DataFrame from PDF Files

In [8]:
# extract the sentences of all pdf reports found in DATA_PATH
df = create_df(DATA_PATH)

0it [00:00, ?it/s]

In [9]:
# display the sentence-level data frame (one row per sentence)
df

Unnamed: 0,doc_id,file_name,sentence
0,1,United Health Group.pdf,Introduction A message from our CEO A message...
1,1,United Health Group.pdf,"At UnitedHealth Group, we believe a healthy p..."
2,1,United Health Group.pdf,"The more than 350,000 people across Optum and..."
3,1,United Health Group.pdf,Given our reach and resources – and the milli...
4,1,United Health Group.pdf,That’s what makes a health care system sustai...
...,...,...,...
1443,14,Amazon.pdf,All statements other than statements of histor...
1444,14,Amazon.pdf,"We use words such as aim, believe, commit, dr..."
1445,14,Amazon.pdf,Forward-looking statements reflect management’...
1446,14,Amazon.pdf,Actual results could differ materially due to ...


Group all sentences of one company into a single row (one document), instead of having a separate row for every sentence.

In [10]:
# collapse the sentence-level frame to one row per document by joining
# all sentences of a document with a single space
doc_df = df.groupby('doc_id')['sentence'].agg(' '.join).reset_index()
doc_df

Unnamed: 0,doc_id,sentence
0,1,Introduction A message from our CEO A message...
1,2,This past year has brought disruption and stre...
2,3,I approach this with a strong point of view. I...
3,4,This Report details the progress of the & Fa...
4,5,Climate Change and Greenhouse Gas Emissions 52...
5,6,"This Environmental, Social and Governance Repo..."
6,7,216 million people could be forced to migrat...
7,8,Current ESG evaluation methodologies are fun...
8,9,"For the best experience, we recommend using t..."
9,10,Cover photo This North Carolina solar facility...


# Text Preprocessing

**Text extraction** <br>
Extracting the useful text from the raw data is important for high-quality text mining. PDF reports require specialized parsing and extraction techniques; this notebook uses `PyMuPDF`, a Python binding for [MuPDF](http://www.mupdf.com/) – a lightweight PDF, XPS, and e-book viewer, renderer, and toolkit, which is maintained and developed by Artifex Software, Inc. and is open source.

**Stop word removal** <br>
Stop words are words such as common pronouns (they/them/theirs), articles, prepositions and other function words (the, is, that) which add nothing to the NLP learning process and in fact hamper the models' performance.

**Tokenization** <br>
This is the process of splitting a large quantity of text into smaller chunks (tokens); here the text is split into sentences with the [sentence tokenizer from NLTK](https://www.nltk.org/api/nltk.tokenize.html).


**Regular Expression** <br>
Regular expressions (_regex_) are highly helpful for text manipulation during the text-cleaning step of natural language processing (NLP). They are used to clean up emojis, misspelled words, short words, unusual symbols and similar noise that is present in real human-written text. For instance, a tweet contains a lot of distracting content, such as hashtags, emojis and slang or abbreviated words, and must therefore be cleaned of useless information before being used.

## Clean Data

In [11]:
# load the SDG sentences from the csv file
df_sdg = pd.read_csv(SDG_PATH)

In [12]:
# preview the first five rows of the SDG data
df_sdg.head()

Unnamed: 0,gpnum,gpname,goalnum,sentence
0,gp01,Life,goal01,End poverty in all its forms everywhere
1,gp01,Life,goal01,"Despite progress under the MDGs, approximately..."
2,gp01,Life,goal01,"Over the past decade, markets in developing co..."
3,gp01,Life,goal01,Certain groups are disproportionately represen...
4,gp01,Life,goal01,"These include women, persons with disabilities..."


In [13]:
# join all sentences that belong to the same theme (gpname) into one document;
# note that this runs on the sentences as loaded, before any cleaning
df_sdg_doc = (
    df_sdg
    .groupby('gpname')['sentence']
    .agg(' '.join)
    .reset_index()
)

# keep the theme names of the gpname column as a plain list for later use
sdg_theme_list = df_sdg_doc['gpname'].tolist()


In [14]:
# report how many distinct themes were found and preview the grouped frame
print(f"number of themes: {len(sdg_theme_list)}")
df_sdg_doc.head()

number of themes: 6


Unnamed: 0,gpname,sentence
0,Economic and Technological Development,"economic growth, full and productive employmen..."
1,Environments,Take urgent action to combat climate change an...
2,Equity,Ensure inclusive and equitable quality educati...
3,Life,End poverty in all its forms everywhere Despit...
4,Resources,Ensure availability and sustainable management...


## Remove special characters

In [15]:
# remove special characters: keep only the alphabetic characters a-z and A-Z,
# every other character (digits, punctuation, ...) is replaced with a space
df_sdg['sentence'] = df_sdg['sentence'].apply(lambda x: re.sub(r'[^a-zA-Z]', ' ', x))

# remove all the sdg themes from the sentences:
# the theme names are joined with the | ("or") operator into one pattern, and
# \b(?:...)\b makes the pattern match whole words only, not parts of another word.
# Each name is escaped so that it is always matched literally, and the names are
# sorted longest first so that a theme which starts with another theme is tried
# before the shorter one.
# The match is case-sensitive and runs before the lower-casing step, so a theme
# is only removed where it is written exactly as in the gpname column.
regex_match = re.compile(r'\b(?:{})\b'.format(
    '|'.join(re.escape(theme) for theme in sorted(sdg_theme_list, key=len, reverse=True))
))
# apply the regular expression to each sentence, replacing all occurrences of the themes with a space
df_sdg['sentence'] = df_sdg['sentence'].apply(lambda x: regex_match.sub(' ', x))

# convert every column to lower-case strings
df_sdg = df_sdg.apply(lambda x: x.astype(str).str.lower())

df_sdg

Unnamed: 0,gpnum,gpname,goalnum,sentence
0,gp01,life,goal01,end poverty in all its forms everywhere
1,gp01,life,goal01,despite progress under the mdgs approximately...
2,gp01,life,goal01,over the past decade markets in developing co...
3,gp01,life,goal01,certain groups are disproportionately represen...
4,gp01,life,goal01,these include women persons with disabilities...
...,...,...,...,...
636,gp06,environments,goal15,take urgent action to end poaching and tr...
637,gp06,environments,goal15,by introduce measures to prevent th...
638,gp06,environments,goal15,by integrate ecosystems and biodive...
639,gp06,environments,goal15,a mobilize and significantly increase from ...


## Export Data to CSV Files

In [16]:
# write the four data frames to csv; index=False keeps the row index out of the files
print ('Saving files...')
# document with all text of each PDF file of a company in one row
doc_df.to_csv(FILE_NAME_DOC, index=False)

# each sentence in a separate row
df.to_csv(FILE_NAME_SENT, index=False)

# SDG data with groupby (one row per theme)
df_sdg_doc.to_csv(FILE_SDG_GROUPED, index=False)

# cleaned SDG data
df_sdg.to_csv(FILE_SDG, index=False)

Saving files...


In [17]:
# read every exported file back and print its label and its first 5 rows
print('Opening the files...')
for label, csv_path in (
    ('doc_df', FILE_NAME_DOC),
    ('df', FILE_NAME_SENT),
    ('df_sdg_doc', FILE_SDG_GROUPED),
    ('df_sdg', FILE_SDG),
):
    print(label)
    print(pd.read_csv(csv_path).head())


Opening the files...
doc_df
   doc_id                                           sentence
0       1  Introduction A message from our CEO  A message...
1       2  This past year has brought disruption and stre...
2       3  I approach this with a strong point of view. I...
3       4  This Report details the progress of the  &  Fa...
4       5  Climate Change and Greenhouse Gas Emissions 52...
df
   doc_id                file_name  \
0       1  United Health Group.pdf   
1       1  United Health Group.pdf   
2       1  United Health Group.pdf   
3       1  United Health Group.pdf   
4       1  United Health Group.pdf   

                                            sentence  
0  Introduction A message from our CEO  A message...  
1  At UnitedHealth Group, we believe a healthy  p...  
2  The more  than 350,000 people across Optum and...  
3  Given our reach and resources – and the  milli...  
4  That’s  what makes a health care system sustai...  
df_sdg_doc
                                 