# Set-up and data loading

**Note**: Much of the code for loading and processing the articles is taken from https://github.com/llbtl/paper_ssm01/tree/main. Copyright for that code belongs to the authors of that paper. 

In [1]:
import numpy as np
import pandas as pd
import fitz
import nltk
from nltk.tokenize import sent_tokenize
import os
import re

In [3]:
# Input directory holding the article PDFs and output CSV for the extracted
# sentences. Built with os.path.join so the notebook also runs on systems
# whose path separator is not a backslash.
path_articles = os.path.join('..', 'data', 'articles')
fname_out = os.path.join('..', 'data_structured', 'article_sentences_pdf.csv')

In [4]:
os.listdir(path_articles)

['beiersdorf.PDF',
 'colgate.PDF',
 'diageo.PDF',
 'ford-motor.PDF',
 'general-mills.PDF',
 'henkel.PDF',
 'hershey.PDF',
 'inditex.PDF',
 'komatsu.PDF',
 'linde.PDF',
 'mondelez.PDF',
 'ralph-lauren.PDF',
 'sonoco.PDF']

# Loading and Processing the NexisUni Articles

The following functions convert the PDF files (NexisUni articles) into a table in which each row holds one sentence of the article text. I split the documents into sentences at this stage because I will generate sentence embeddings with S-BERT later on. 

In [5]:
def get_cnt(text):
    """Return the number of whitespace-separated tokens in `text` that are purely alphanumeric.

    Tokens carrying punctuation (e.g. "word," or "end.") are not counted.
    """
    return sum(1 for token in text.split() if token.isalnum())

In [6]:
def get_text(block_lst):
    """Collect the text of every text block in `block_lst`.

    Each block is indexed positionally: element 6 is the block type
    (0 = text) and element 4 is the block's text. Blocks of any other type
    are skipped. Every non-ASCII character in the text is replaced by a
    single space.

    Returns a list of strings, one per text block, in input order.
    """
    def to_ascii(raw):
        # Swap each character outside the 7-bit ASCII range for a space.
        return ''.join(ch if ord(ch) < 128 else ' ' for ch in raw)

    return [to_ascii(block[4]) for block in block_lst if block[6] == 0]

In [9]:
def get_sentence(fname):
    """Extract the body sentences of every article contained in one PDF.

    The text blocks of all pages are gathered with `get_text`. An article
    body is the run of blocks after a block equal to "Body\n" and before
    the next block equal to "End of Document\n". Within a body, only blocks
    for which `get_cnt` is greater than 5 are kept; they are joined, split
    into sentences with `sent_tokenize`, split again on blank lines, and
    whitespace-normalised.

    Parameters
    ----------
    fname : path of the PDF file to open with `fitz.open`.

    Returns
    -------
    list of str: the sentences of all article bodies, in document order.
    """
    doc = fitz.open(fname)
    try:
        text_lst = []
        for page in doc:
            text_lst += get_text(page.get_text_blocks())
    finally:
        # Release the file handle even if text extraction fails.
        doc.close()

    lst = []
    beg = None  # index of the first block after the most recent "Body" marker
    for i, text in enumerate(text_lst):
        if text == "Body\n":
            beg = i + 1
        elif text == "End of Document\n":
            if beg is None:
                # End marker without a preceding "Body": nothing to extract.
                continue
            # Keep only the blocks with enough alphanumeric words.
            chunk = [block for block in text_lst[beg:i] if get_cnt(block) > 5]
            # Reset so a later end marker cannot re-use this article's start.
            beg = None
            for token in sent_tokenize('\n'.join(chunk)):
                # A blank line inside a token is treated as a sentence break.
                for sentence in token.split('\n\n'):
                    lst.append(' '.join(sentence.split()))

    return lst

In [12]:
def gen_document(fname, sent_lst):
    """Build a per-document DataFrame with one row per sentence.

    Columns: 'doc_type' (always "news"), 'company' (the part of `fname`
    before its first "."), and 'sentence' (the entries of `sent_lst`).
    """
    company, _, _ = fname.partition(".")
    columns = {
        'doc_type': "news",
        'company': company,
        'sentence': sent_lst,
    }
    return pd.DataFrame(columns)

In [13]:
def read_filelist(path):
    """Read every PDF in `path` and return all of their sentences in one DataFrame.

    Each directory entry is printed. Entries whose extension is not ".pdf"
    (compared case-insensitively) are skipped. For every PDF the sentences
    are extracted with `get_sentence` and wrapped with `gen_document`.

    Parameters
    ----------
    path : directory containing the article PDFs.

    Returns
    -------
    pandas.DataFrame: the per-document frames concatenated in listing
    order (each keeps its own index), or an empty DataFrame when the
    directory holds no PDF.
    """
    frames = []

    for fname in os.listdir(path):
        p_fname = os.path.join(path, fname)
        print('path + fname >>>', p_fname)

        # Accept both "x.PDF" and "x.pdf".
        if os.path.splitext(fname)[1].lower() != '.pdf':
            continue
        print('fname >>>', fname)

        sent_lst = get_sentence(p_fname)
        frames.append(gen_document(fname, sent_lst))

    # pd.concat raises on an empty list, so handle the no-PDF case explicitly.
    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames)

In [14]:
%%time
df = read_filelist(path_articles)
print('==== End of jobs ====')

path + fname >>> C:\Users\tnguyen10\OneDrive - Deloitte (O365D)\Documents\GitHub\Thesis\data\articles\beiersdorf.PDF
fname >>> beiersdorf.PDF
path + fname >>> C:\Users\tnguyen10\OneDrive - Deloitte (O365D)\Documents\GitHub\Thesis\data\articles\colgate.PDF
fname >>> colgate.PDF
path + fname >>> C:\Users\tnguyen10\OneDrive - Deloitte (O365D)\Documents\GitHub\Thesis\data\articles\diageo.PDF
fname >>> diageo.PDF
path + fname >>> C:\Users\tnguyen10\OneDrive - Deloitte (O365D)\Documents\GitHub\Thesis\data\articles\ford-motor.PDF
fname >>> ford-motor.PDF
path + fname >>> C:\Users\tnguyen10\OneDrive - Deloitte (O365D)\Documents\GitHub\Thesis\data\articles\general-mills.PDF
fname >>> general-mills.PDF
path + fname >>> C:\Users\tnguyen10\OneDrive - Deloitte (O365D)\Documents\GitHub\Thesis\data\articles\henkel.PDF
fname >>> henkel.PDF
path + fname >>> C:\Users\tnguyen10\OneDrive - Deloitte (O365D)\Documents\GitHub\Thesis\data\articles\hershey.PDF
fname >>> hershey.PDF
path + fname >>> C:\Users\tn

# Further Cleaning

In [17]:
import re

In [18]:
df_article = df.copy()

In [19]:
# Clean the sentence text step by step:
#   1. drop double quotes
#   2. drop URLs (http..., www...), single-character bracketed markers such as
#      "[1]:", tokens containing a slash, tokens with two or more hyphens,
#      and any remaining square brackets
#   3. trim leading/trailing whitespace
#   4. collapse runs of whitespace into one space
#   5. restore two names broken by the earlier non-ASCII removal
cleaned = df_article["sentence"].str.replace('"', '', regex=True)
cleaned = cleaned.replace(
    r'http\S+|\[.\]:?|www\S+|\w+/\S+|\w+-\w+-\S+|\[|\]', '', regex=True
)
cleaned = cleaned.replace(r'^\s+|\s+$', '', regex=True)
cleaned = cleaned.replace(r'\s{2,}', ' ', regex=True)
cleaned = cleaned.str.replace('Nestl ', 'Nestle ')
cleaned = cleaned.str.replace('Mondel z', 'Mondelez')
df_article["sentence"] = cleaned

In [24]:
# removes empty sentence rows
df_article = df_article[df_article['sentence'].astype(bool)]

In [25]:
df_article.reset_index(inplace = True, drop = True)

In [26]:
# The PDF package cannot handle sentences that span a page break, so such sentences arrive as two separate rows.
# The code below merges two consecutive rows when the first does not end with punctuation and the second starts with a lowercase letter.
import string

# define a function to check if a sentence ends with punctuation
def ends_with_punctuation(s):
    """Return True when the last non-whitespace character of `s` is in string.punctuation.

    An empty or whitespace-only string returns False instead of raising
    IndexError.
    """
    stripped = s.strip()
    return bool(stripped) and stripped[-1] in string.punctuation

# Merge sentence fragments: when a row does not end with punctuation and the
# next row starts with a lowercase letter, the next row becomes
# "<previous> <current>" and the previous row is removed. Work on plain lists
# so the DataFrame is not mutated while it is being iterated and the result
# does not depend on the index labels.
sentences = df_article['sentence'].tolist()
companies = df_article['company'].tolist()
keep = [True] * len(sentences)

for pos in range(1, len(sentences)):
    # sentences[pos - 1] may already hold a merged fragment, so chains of
    # fragments accumulate into the last row of the chain
    prev_sentence = sentences[pos - 1]
    curr_sentence = sentences[pos]

    # nothing to merge when either side is blank
    if not prev_sentence.strip() or not curr_sentence:
        continue

    # never merge the tail of one company's articles into the next company's
    if companies[pos - 1] != companies[pos]:
        continue

    # check if the previous sentence ends with punctuation and the current sentence starts with a lowercase letter
    if not ends_with_punctuation(prev_sentence) and curr_sentence[0].islower():
        # concatenate the sentences with a space
        sentences[pos] = prev_sentence + ' ' + curr_sentence
        # mark the previous row for removal
        keep[pos - 1] = False

# write the merged sentences back and drop the absorbed rows (index labels are kept)
df_article = df_article.assign(sentence=sentences).loc[keep].copy()

In [27]:
# filtering based on word count
df_article["word count"] = [len(i) for i in df_article["sentence"].str.split()]

In [28]:
df_article = df_article[df_article["word count"] > 5]

In [29]:
df_article = df_article[df_article["word count"] < 100]

In [30]:
# define a function to check if a sentence is comprised of more than half uppercase characters
def is_mostly_uppercase(sentence):
    """Return True when more than half of the characters in `sentence` are uppercase.

    The ratio is taken over every character of the string, including spaces,
    digits and punctuation. An empty string returns False instead of raising
    ZeroDivisionError.
    """
    if not sentence:
        return False
    return sum(1 for c in sentence if c.isupper()) / len(sentence) > 0.5

# apply the function to the 'sentence' column and filter out the rows where the condition is True
df_article = df_article[~df_article['sentence'].apply(is_mostly_uppercase)]

# print the resulting dataframe
df_article

Unnamed: 0,doc_type,company,sentence,word count
0,news,beiersdorf,"This offers businesses in food and beverage, p...",15
1,news,beiersdorf,"Designed with sustainability in mind, the tesa...",23
2,news,beiersdorf,"To reduce the consumption of virgin plastic, u...",24
3,news,beiersdorf,70 percent of the polyethylene (PET) that make...,16
4,news,beiersdorf,The tape supports the circular economy and can...,19
...,...,...,...,...
3944,news,sonoco,Newstex Authoritative Content is not read and ...,12
3945,news,sonoco,"Accordingly, neither Newstex nor its re-distri...",40
3946,news,sonoco,The Newstex Authoritative Content shall be con...,12
3947,news,sonoco,"Accordingly, no warranties or other guarantees...",25


In [31]:
df_article.reset_index(inplace = True, drop = True)

In [33]:
df_article.drop_duplicates(subset = ['sentence'], inplace = True)

In [55]:
df_article.to_csv(fname_out, index = False)