THIS IS A IPYNB FILE WHERE I PLAYED AROUND WITH DATA FROM PREPROCESSING TO FINAL OUTPUT

In [30]:
# To read the PDF
import PyPDF2
# To analyze the PDF layout and extract text
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
# To extract text from tables in PDF
import pdfplumber
# To extract the images from the PDFs
from PIL import Image
from pdf2image import convert_from_path
# To perform OCR to extract text from images 
import pytesseract 
# To remove the additional created files
import os

In [15]:
def text_extraction(element):
    # Extracting the text from the in-line text element
    line_text = element.get_text()
    
    # Find the formats of the text
    # Initialize the list with all the formats that appeared in the line of text
    line_formats = []
    for text_line in element:
        if isinstance(text_line, LTTextContainer):
            # Iterating through each character in the line of text
            for character in text_line:
                if isinstance(character, LTChar):
                    # Append the font name of the character
                    line_formats.append(character.fontname)
                    # Append the font size of the character
                    line_formats.append(character.size)
    # Find the unique font sizes and names in the line
    format_per_line = list(set(line_formats))
    
    # Return a tuple with the text in each line along with its format
    return (line_text, format_per_line)

def crop_image(element, pageObj):
    # Get the coordinates to crop the image from the PDF
    [image_left, image_top, image_right, image_bottom] = [element.x0,element.y0,element.x1,element.y1] 
    # Crop the page using coordinates (left, bottom, right, top)
    pageObj.mediabox.lower_left = (image_left, image_bottom)
    pageObj.mediabox.upper_right = (image_right, image_top)
    # Save the cropped page to a new PDF
    cropped_pdf_writer = PyPDF2.PdfWriter()
    cropped_pdf_writer.add_page(pageObj)
    # Save the cropped PDF to a new file
    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
        cropped_pdf_writer.write(cropped_pdf_file)

# Create a function to convert the PDF to images
def convert_to_images(input_file,):
    images = convert_from_path(input_file)
    image = images[0]
    output_file = "PDF_image.png"
    image.save(output_file, "PNG")

# Create a function to read text from images
def image_to_text(image_path):
    # Read the image
    img = Image.open(image_path)
    # Extract the text from the image
    text = pytesseract.image_to_string(img)
    return text


def extract_table(pdf_path, page_num, table_num):
    # Open the pdf file
    pdf = pdfplumber.open(pdf_path)
    # Find the examined page
    table_page = pdf.pages[page_num]
    # Extract the appropriate table
    table = table_page.extract_tables()[table_num]
    return table

# Convert table into the appropriate format
def table_converter(table):
    table_string = ''
    # Iterate through each row of the table
    for row_num in range(len(table)):
        row = table[row_num]
        # Remove the line breaker from the wrapped texts
        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
        # Convert the table into a string 
        table_string+=('|'+'|'.join(cleaned_row)+'|'+'\n')
    # Removing the last line break
    table_string = table_string[:-1]
    return table_string




In [79]:
pdf_path = 'pdfs/paper2.pdf'

# create a PDF file object
pdfFileObj = open(pdf_path, 'rb')
# create a PDF reader object
pdfReaded = PyPDF2.PdfReader(pdfFileObj)

# Create the dictionary to extract text from each image
text_per_page = {}
# We extract the pages from the PDF
for pagenum, page in enumerate(extract_pages(pdf_path)):
    
    # Initialize the variables needed for the text extraction from the page
    pageObj = pdfReaded.pages[pagenum]
    page_text = []
    line_format = []
    text_from_images = []
    text_from_tables = []
    page_content = []
    # Initialize the number of the examined tables
    table_num = 0
    first_element= True
    table_extraction_flag= False
    # Open the pdf file
    pdf = pdfplumber.open(pdf_path)
    # Find the examined page
    page_tables = pdf.pages[pagenum]
    # Find the number of tables on the page
    tables = page_tables.find_tables()


    # Find all the elements
    page_elements = [(element.y1, element) for element in page._objs]
    # Sort all the elements as they appear in the page 
    page_elements.sort(key=lambda a: a[0], reverse=True)

    for i,component in enumerate(page_elements):
        # Extract the position of the top side of the element in the PDF
        pos= component[0]
        # Extract the element of the page layout
        element = component[1]
        
        # Check if the element is a text element
        if isinstance(element, LTTextContainer):
            # Check if the text appeared in a table
            if table_extraction_flag == False:
                # Use the function to extract the text and format for each text element
                (line_text, format_per_line) = text_extraction(element)
                # Append the text of each line to the page text
                page_text.append(line_text)
                # Append the format for each line containing text
                line_format.append(format_per_line)
                page_content.append(line_text)
            else:
                # Omit the text that appeared in a table
                pass

        # Check the elements for images
        if isinstance(element, LTFigure):
            # Crop the image from the PDF
            crop_image(element, pageObj)
            # Convert the cropped pdf to an image
            convert_to_images('cropped_image.pdf')
            # Extract the text from the image
            image_text = image_to_text('PDF_image.png')
            text_from_images.append(image_text)
            page_content.append(image_text)
            # Add a placeholder in the text and format lists
            page_text.append('image')
            line_format.append('image')

        if isinstance(element, LTRect):
            # If the first rectangular element
            if first_element == True and (table_num+1) <= len(tables):
                # Find the bounding box of the table
                lower_side = page.bbox[3] - tables[table_num].bbox[3]
                upper_side = element.y1 
                # Extract the information from the table
                table = extract_table(pdf_path, pagenum, table_num)
                # Convert the table information in structured string format
                table_string = table_converter(table)
                # Append the table string into a list
                text_from_tables.append(table_string)
                page_content.append(table_string)
                # Set the flag as True to avoid the content again
                table_extraction_flag = True
                # Make it another element
                first_element = False
                # Add a placeholder in the text and format lists
                page_text.append('table')
                line_format.append('table')

            # Check if we already extracted the tables from the page
            if element.y0 >= lower_side and element.y1 <= upper_side:
                pass
            elif not isinstance(page_elements[i+1][1], LTRect):
                table_extraction_flag = False
                first_element = True
                table_num+=1


    dctkey = 'Page_'+str(pagenum)
    # Add the list of list as the value of the page key
    text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]

# Closing the pdf file object
pdfFileObj.close()

# Deleting the additional files created
# os.remove('cropped_image.pdf')
# os.remove('PDF_image.png')

# Display the content of the page
result = ''.join(text_per_page['Page_0'][4])
print(result)


Battle of the markups: conflict inflation 
and the aspirational channel of 
monetary policy transmission
Staff Working Paper No. 1,065
March 2024
|Battle of the markups: conflict inflation and the aspirational channel of monetary policy transmission Staff Working Paper No. 1,065 March 2024|
|Frederick van der Ploeg and Tim Willems|
||
|Staff Working Papers describe research in progress by the author(s) and are published to elicit comments and to further debate. Any views expressed are solely those of the author(s) and so cannot be taken to represent those of the Bank of England or to state Bank of England policy. This paper should therefore not be reported as representing the views of the Bank of England or members of the Monetary Policy Committee, Financial Policy Committee or Prudential Regulation Committee.|


In [19]:
import pdfplumber
import re

# Function to clean text
def clean_text(text):
    # Remove any special characters
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove extra whitespaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    return cleaned_text.strip()

# Function to extract headers, footers, and page numbers from a PDF
def extract_headers_footers_page_numbers(pdf_path):
    headers = []
    footers = []
    page_numbers = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract text from the current page
            page_text = page.extract_text()
            
            # Assume headers/footers are at the start/end of each page text
            # This may need customization for different formatting of PDFs
            lines = page_text.split('\n')
            header = lines[0]
            footer = lines[-1]
            
            # Extract page number assuming it is a number at the footer
            # This regex will find a standalone number which is often used for page numbers
            page_number_match = re.search(r'\b\d+\b', footer)
            
            # Clean and add to lists
            headers.append(clean_text(header))
            footers.append(clean_text(footer))
            page_numbers.append(page_number_match.group(0) if page_number_match else None)
    
    return headers, footers, page_numbers

# Replace 'pdf_path' with the path to your PDF file
pdf_path = 'pdfs/paper1.pdf'

# Call the function and print the extracted content
headers, footers, page_numbers = extract_headers_footers_page_numbers(pdf_path)
print("Headers:", headers)
print("Footers:", footers)
print("Page Numbers:", page_numbers)


Headers: ['ECONOMIC RESEARCH', 'Sluggish news reactions A combinatorial', '1 Introduction', '2 Synchronizing jumps A combinatorial problem', 'Figure 1 Jump sampling', 'consensus about the impact of some piece of news If trades do not occur at the time of', 'Figure 2 The discontinuous component of a stocks price may react sluggishly to news', 'their jumps3', 'in which the returns are reported in percentages and the jump returns are underlined The', 'This stock return is composed of a jump return defined as nJ nY IJump', 'The rowsums of this matrix show its practical value Within we rearrange the stock', 'A rearrangement of the jumpevent matrix as defined in 12 is a new matrix denoted', 'The range R is the difference between the highest and lowest rowsums in our rearranged', 'distance from the diagonal to not let the jumps stray too far in the event window see Section', 'Flattening the rowsums by choosing the best permutation matrices', 'The permutation constraints', 'D to monitor backwa

In [20]:
import pdfplumber
import re

def clean_text(text):
    """ Remove special characters and excessive whitespace. """
    text = re.sub(r'[^a-zA-Z0-9\s,.]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def extract_pdf_content(pdf_path):
    """ Extract headers, footers, headings, and page numbers from a PDF file. """
    headers, footers, headings, page_numbers = [], [], [], []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text_objects = page.extract_text(x_tolerance=3, y_tolerance=3)
            if text_objects:
                lines = text_objects.split('\n')
                if lines:
                    headers.append(clean_text(lines[0]))
                    if re.search(r'\*|\d', lines[-1]):
                        footers.append(clean_text(lines[-1]))
                        page_num_search = re.search(r'\d+', lines[-1])
                        if page_num_search:
                            page_numbers.append(page_num_search.group())
                    for line in lines:
                        if line.isupper():  # Assuming headings are in uppercase
                            headings.append(clean_text(line))
    return headers, footers, headings, page_numbers

# Example usage
pdf_path = 'pdfs/paper1.pdf'
headers, footers, headings, page_numbers = extract_pdf_content(pdf_path)
print("Headers:", headers)
print("Footers:", footers)
print("Headings:", headings)
print("Page Numbers:", page_numbers)


Headers: ['ECONOMIC RESEARCH', 'Sluggish news reactions A combinatorial', '1 Introduction', '2 Synchronizing jumps A combinatorial problem', 'Figure 1 Jump sampling', 'consensus about the impact of some piece of news. If trades do not occur at the time of', 'Figure 2 The discontinuous component of a stocks price may react sluggishly to news', 'their jumps3', 'in which the returns are reported in percentages and the jump returns are underlined. The', 'This stock return is composed of a jump return, defined as nJ nY IJump ,', 'The rowsums of this matrix show its practical value. Within , we rearrange the stock', 'A rearrangement of the jumpevent matrix, as defined in 12, is a new matrix, denoted', 'The range R is the difference between the highest and lowest rowsums in our rearranged', 'distance from the diagonal to not let the jumps stray too far in the event window see Section', 'Flattening the rowsums by choosing the best permutation matrices', 'The permutation constraints', 'D to mon

In [34]:
import fitz

pages = fitz.open("pdfs/paper1.pdf")

def extract_text(page):
    '''Extract text from a page and returns a list of strings'''
    text = page.get_text(sort=True)
    text = text.split('\n')
    text = [t.strip() for t in text if t.strip()]
            
    return text

pages = [extract_text(page) for page in pages]

print(pages[0])

#pages are list of list 

['ECONOMIC RESEARCH', 'FEDERAL RESERVE BANK OF ST. LOUIS', 'WORKING PAPER SERIES', 'Sluggish news reactions: A combinatorial approach for', 'synchronizing stock jumps', 'Authors', 'Nabil Bouamara, Kris Boudt, Sébastien Laurent, and Christopher J. Neely', 'Working Paper Number', '2024-006A', 'Creation Date', 'March 2024', 'Citable Link', 'https://doi.org/10.20955/wp.2024.006', 'Suggested Citation', 'Bouamara, N., Boudt, K., Laurent, S., Neely, C.J., 2024; Sluggish news reactions: A', 'combinatorial approach for synchronizing stock jumps, Federal Reserve Bank of St.', 'Louis Working Paper 2024-006. URL https://doi.org/10.20955/wp.2024.006', 'Federal Reserve Bank of St. Louis, Research Division, P.O. Box 442, St. Louis, MO 63166', 'The views expressed in this paper are those of the author(s) and do not necessarily reflect the views of the Federal Reserve', 'System, the Board of Governors, or the regional Federal Reserve Banks. Federal Reserve Bank of St. Louis Working Papers', 'are prelim

In [36]:
header_candidates = []
footer_candidates = []
    
for page in pages:
    header_candidates.append(page[:5])
    footer_candidates.append(page[-5:])
    
WIN = 8

print(header_candidates[0])

['ECONOMIC RESEARCH', 'FEDERAL RESERVE BANK OF ST. LOUIS', 'WORKING PAPER SERIES', 'Sluggish news reactions: A combinatorial approach for', 'synchronizing stock jumps']


In [39]:
def compare(a, b):
    '''Fuzzy matching of strings to compare headers/footers in neighboring pages'''
    
    count = 0
    a = re.sub('\d', '@', a)
    b = re.sub('\d', '@', b)
    for x, y in zip(a, b):
        if x == y:
            count += 1
    return count / max(len(a), len(b))

def remove_header(pages, header_candidates, WIN):
    '''Remove headers from content dictionary. Helper function for remove_header_footer() function.'''
    
    header_weights = [1.0, 0.75, 0.5, 0.5, 0.5]
    
    for i, candidate in enumerate(header_candidates):
        temp = header_candidates[max(i-WIN, 1) : min(i+WIN, len(header_candidates))]
        maxlen = len(max(temp, key=len))
        for sublist in temp:
            sublist[:] =  sublist + [''] * (maxlen - len(sublist))
        detected = []
        for j, cn in enumerate(candidate):
            score = 0
            try:
                cmp = list(list(zip(*temp))[j])
                for cm in cmp:
                    score += compare(cn,cm) * header_weights[j]
                score = score/len(cmp)
            except:
                score = header_weights[j]
            if score > 0.5:
                detected.append(cn)
        del temp
        
        for d in detected:
            while d in pages[i][:5]:
                pages[i].remove(d)
                
    return pages

def remove_footer(pages, footer_candidates, WIN):
    '''Remove footers from content dictionary. Helper function for remove_header_footer() function.'''
    
    footer_weights = [0.5, 0.5, 0.5, 0.75, 1.0]
    
    for i, candidate in enumerate(footer_candidates):
        temp = footer_candidates[max(i-WIN, 1) : min(i+WIN, len(footer_candidates))]
        maxlen = len(max(temp, key=len))
        for sublist in temp:
            sublist[:] =  [''] * (maxlen - len(sublist)) + sublist
        detected = []
        for j, cn in enumerate(candidate):
            score = 0
            try:
                cmp = list(list(zip(*temp))[j])
                for cm in cmp:
                    score += compare(cn,cm)
                score = score/len(cmp)
            except:
                score = footer_weights[j]
            if score > 0.5:
                detected.append(cn)
        del temp
        
        for d in detected:
            while d in pages[i][-5:]:
                pages[i] = pages[i][::-1]
                pages[i].remove(d)
                pages[i] = pages[i][::-1]
                
    return pages

pages = remove_header(pages, header_candidates, WIN)
pages = remove_footer(pages, footer_candidates, WIN)


In [153]:
print(pages[3])

['2', 'Synchronizing jumps: A combinatorial problem', 'Trading is non-synchronous at high frequencies; This leads to so-called “stale” prices at irregu-', 'larly spaced times, diﬀering across assets. Addressing asynchronicity through the coordinated', 'collection of multivariate data has been an active area of research in ﬁnancial econometrics in', 'recent years, see e.g., Barndorﬀ-Nielsen et al. (2011) or Boudt et al. (2017) and the references', 'therein, and the topic has been integral to covariance estimations since at least Epps (1979).', 'Nonetheless, existing sampling schemes like refresh-time sampling (Barndorﬀ-Nielsen et al.,', '2011), are not designed to handle price jumps, because stale prices are not the only cause of', 'asynchronous jumps. Sometimes asset prices may be “sluggish”; news might be impounded', 'in prices with a delay, although the asset is trading. To address this problem, we synchronize', 'the timing of multivariate jumps using what we call “Jump Sampling”. Th

TO EXTRACT FOOTER (SUCCESSFUL)

In [1]:
from operator import itemgetter
import fitz
import json

import re


def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool

    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles


def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict

    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag


def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict

    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    foots = []
    Raw_data = []
    first = True  # boolean operator for first header
    previous_s = {}  # previous span
    abstract = []

    page_attribute = {}

    for pagenum, page in enumerate(doc):
        foots = []
        Raw_data = []
        
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                block_string = size_tag[s['size']] + s['text']
                            else:
                                if s['size'] == previous_s['size']:

                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_string = size_tag[s['size']] + s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_string = size_tag[s['size']] + s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    header_para.append(block_string)
                                    block_string = size_tag[s['size']] + s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                        
                    block_string += " "

                if re.match(r'^<s4>', block_string):
                        block_string= re.sub(r'^<s4>', '', block_string, flags=re.MULTILINE)
                        foots.append(block_string)

                elif re.match(r'^<p>', block_string):
                        
                        block_string= re.sub(r'^<p>', '', block_string, flags=re.MULTILINE)
                        Raw_data.append(block_string)

                header_para.append(block_string)

            
        mapkey = 'Page_'+str(pagenum)
        page_attribute[mapkey] = {'footer': foots}

    return header_para,foots,Raw_data,abstract,page_attribute


# def main():

document = 'pdfs/paper1.pdf'
doc = fitz.open(document)

font_counts, styles = fonts(doc, granularity=False)

print(font_counts)

# print(styles)

size_tag = font_tags(font_counts, styles)

# print(size_tag)

elements,foots,raw_data, abstract, page_attribute = headers_para(doc, size_tag)

# print(elements)
# print(foots)
# print(raw_data)
#print(abstract)

# print(page_attribute['Page_7'])

with open("doc.json", 'w') as json_out:
    json.dump(elements, json_out)


# if __name__ == '__main__':
#     main()

[('11.9552001953125', 3622), ('7.970099925994873', 940), ('10.678384780883789', 180), ('5.97760009765625', 136), ('10.909099578857422', 120), ('7.665040016174316', 115), ('9.962599754333496', 87), ('6.973800182342529', 56), ('7.473479747772217', 43), ('6.191279888153076', 33), ('14.346199989318848', 31), ('11.14452075958252', 25), ('9.705060958862305', 16), ('9.774545669555664', 13), ('17.21540069580078', 13), ('4.981299877166748', 7), ('12.0302095413208', 3), ('9.02265739440918', 3), ('5.605109691619873', 3), ('16.54153823852539', 2), ('20.662500381469727', 2), ('22.556642532348633', 1), ('7.429680347442627', 1)]


In [34]:
page_attribute['Page_3']['footer']

['Asynchronous jumps in asset prices happen for several reasons, not just diﬀerences in liquidity. Assets  may react diﬀerently to news based on the traders involved, their beliefs, and actions. For example, major  news like the FOMC statement on September 18, 2007, discussed in Section 3.2, triggered an increase in  trading, unlike the typical activity seen throughout the rest of the day (not reported). This suggests that  time variation in trading intensity contributes to asynchronous jumps. ',
 'Gradual jumps are when the prices exhibit strong linear trends for periods of a few minutes (Barndorﬀ-  Nielsen et al., 2009). Jump delays are when jumps of individual assets follow those of the highly liquid market  index during market-wide events (Li et al., 2017). ']

In [329]:
page_attribute['Page_3']['Raw_data']

[', refresh-time sampling does not resolve  the asynchronicity inherent in the jumps.  To confront asynchronicity, our new jump sampling scheme generally rearranges mist-  imed jumps to occur simultaneously with the ETF jump, as shown in the lower section of  the diagram. In what follows, we detail how we optimally rearrange jumps, penalizing eco-  nomically implausible rearrangements. A simulated example clariﬁes the mechanics of the  rearrangements. ',
 ' that represents infor- ',
 '3 ']

TO EXTRACT ABSTRACT (IMPORTANT)

In [35]:
import pdfplumber

def extract_pdf_content(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        first_page = pdf.pages[1]  # Assuming title, authors, and abstract are on the first page
        second_page = pdf.pages[1] 
        text = first_page.extract_text()
        text_1 = second_page.extract_text()

        # This is a simple example and may need to be adjusted based on your specific PDF layout
        title = text.split('\n')[0]  # Assuming the first line is the title
        authors = text.split('\n')[3]  # Assuming the second line contains the authors

        abstract_index = text.lower().find('abstract')
        end_index = text.lower().find('keywords')

        if end_index == -1 :

            end_index = text.lower().find('key words')

        print(end_index)
        abstract_text = text[abstract_index : end_index].split('\n')[1:]  # Assuming abstract starts after the 'abstract' keyword

        abstract = ' '.join(abstract_text).replace('\n', ' ')

    return {
        'title': title,
        'authors': authors,
        'abstract': abstract
        
    }

# Usage
pdf_path = 'pdfs/paper1.pdf'
content = extract_pdf_content(pdf_path)
print(content)

886
{'title': 'Sluggish news reactions: A combinatorial', 'authors': 'Nabil Bouamara', 'abstract': 'Stock prices often react sluggishly to news, producing gradual jumps and jump delays. Econometricianstypicallytreatthesesluggishreactionsasmicrostructureeffects and settle for a coarse sampling grid to guard against them. Synchronizing mistimed stock returns on a fine sampling grid allows us to better approximate the true common jumps in related stock prices. '}


Self-Conceptual Method

In [156]:
import pdfplumber
import re

def extract_information(pdf_path):
    title = None
    authors = None
    abstract = None
    
    with pdfplumber.open(pdf_path) as pdf:
        first_page = pdf.pages[1]  # Assuming all relevant information is on the first page
        text = first_page.extract_text()
        
        # Regex patterns to extract title, authors, and abstract
        title_pattern = r"Title:\s*(.+)"
        authors_pattern = r"Authors:\s*(.+)"
        abstract_pattern = r"Abstract\s*(.+)"
        
        # Searching for patterns
        title_match = re.search(title_pattern, text)
        authors_match = re.search(authors_pattern, text)
        abstract_match = re.search(abstract_pattern, text, re.DOTALL)  # DOTALL to match across multiple lines
        
        if title_match:
            title = title_match.group(1)
        if authors_match:
            authors = authors_match.group(1)
        if abstract_match:
            abstract = abstract_match.group(1).strip()  # Strip to remove any leading/trailing whitespace
        
    return title, authors, abstract

# Example usage
pdf_path = 'pdfs/paper1.pdf'
title, authors, abstract = extract_information(pdf_path)
print("Title:", title)
print("Authors:", authors)
print("Abstract:", abstract)

Title: None
Authors: None
Abstract: Stock prices often react sluggishly to news, producing gradual jumps and jump
delays. Econometricianstypicallytreatthesesluggishreactionsasmicrostructureeffects
and settle for a coarse sampling grid to guard against them. Synchronizing mistimed
stock returns on a fine sampling grid allows us to better approximate the true common
jumps in related stock prices.
Keywords: Asynchronicity; Cojumps; High-frequency data; Microstructure noise; Real-
ized Covariance; Rearrangement
JEL: C02, C58, G11, G14
∗We have received helpful comments and suggestions from Andres Algaba, Geert Dhaene, Jean-Yves
Gnabo, Roxana Halbleib, Ilze Kalnina, Nathan Lassance, Oliver Linton, Andr´e Lucas, Kristien Smedts,
Lisa Van den Branden, Steven Vanduffel, Brecht Verbeken, and the conference and seminar participants
at KU Leuven, Vrije Universiteit Brussel, Vrije Universiteit Amsterdam, the Computational and Financial
Econometrics Conference (2021), the Belgian Financial Research

TO extract IMAGES, TABLES AND EVERYHTING // SUCCESSFUL--- IMPORTANT

In [17]:
# Create function to extract text

def text_extraction(element):
    # Extracting the text from the in line text element
    line_text = element.get_text()
    
    # Find the formats of the text
    # Initialize the list with all the formats appeared in the line of text
    line_formats = []
    for text_line in element:
        if isinstance(text_line, LTTextContainer):
            # Iterating through each character in the line of text
            for character in text_line:
                if isinstance(character, LTChar):
                    # Append the font name of the character
                    line_formats.append(character.fontname)
                    # Append the font size of the character
                    line_formats.append(character.size)
    # Find the unique font sizes and names in the line
    format_per_line = list(set(line_formats))
    
    # Return a tuple with the text in each line along with its format
    return (line_text, format_per_line)

In [18]:
# Extracting tables from the page

def extract_table(pdf_path, page_num, table_num):
    # Open the pdf file
    pdf = pdfplumber.open(pdf_path)
    # Find the examined page
    table_page = pdf.pages[page_num]
    # Extract the appropriate table
    table = table_page.extract_tables()[table_num]
    
    return table

# Convert table into appropriate fromat
def table_converter(table):
    table_string = ''
    # Iterate through each row of the table
    for row_num in range(len(table)):
        row = table[row_num]
        # Remove the line breaker from the wrapted texts
        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
        # Convert the table into a string 
        table_string+=('|'+'|'.join(cleaned_row)+'|'+'\n')
    # Removing the last line break
    table_string = table_string[:-1]
    return table_string

# Create a function to check if the element is in any tables present in the page
def is_element_inside_any_table(element, page ,tables):
    x0, y0up, x1, y1up = element.bbox
    # Change the cordinates because the pdfminer counts from the botton to top of the page
    y0 = page.bbox[3] - y1up
    y1 = page.bbox[3] - y0up
    for table in tables:
        tx0, ty0, tx1, ty1 = table.bbox
        if tx0 <= x0 <= x1 <= tx1 and ty0 <= y0 <= y1 <= ty1:
            return True
    return False

# Function to find the table for a given element
def find_table_for_element(element, page ,tables):
    x0, y0up, x1, y1up = element.bbox
    # Change the cordinates because the pdfminer counts from the botton to top of the page
    y0 = page.bbox[3] - y1up
    y1 = page.bbox[3] - y0up
    for i, table in enumerate(tables):
        tx0, ty0, tx1, ty1 = table.bbox
        if tx0 <= x0 <= x1 <= tx1 and ty0 <= y0 <= y1 <= ty1:
            return i  # Return the index of the table
    return None  

In [19]:
# Create a function to crop the image elements from PDFs
def crop_image(element, pageObj, pagenum):
    # Get the coordinates to crop the image from PDF
    [image_left, image_top, image_right, image_bottom] = [element.x0,element.y0,element.x1,element.y1] 
    # Crop the page using coordinates (left, bottom, right, top)
    pageObj.mediabox.lower_left = (image_left, image_bottom)
    pageObj.mediabox.upper_right = (image_right, image_top)
    # Save the cropped page to a new PDF
    cropped_pdf_writer = PyPDF2.PdfWriter()
    cropped_pdf_writer.add_page(pageObj)
    # Save the cropped PDF to a new file
    with open('cropped_image_'+ str(pagenum)+'.pdf', 'wb') as cropped_pdf_file:
        cropped_pdf_writer.write(cropped_pdf_file)

# Create a function to convert the PDF to images
def convert_to_images(input_file):
    images = convert_from_path(input_file)
    image = images[0]
    output_file = 'PDF_image' + str(pagenum)+ '.png'
    image.save('/Users/dipit.mahajan/Micro-Hive /Images/' + str(output_file), 'PNG')

# Create a function to read text from images
def image_to_text(image_path):
    # Read the image
    img = Image.open(image_path)
    # Extract the text from the image
    text = pytesseract.image_to_string(img)
    return text

In [20]:
pdf_path = 'pdfs/paper1.pdf'

# Create a pdf file object
pdfFileObj = open(pdf_path, 'rb')
# Create a pdf reader object
pdfReaded = PyPDF2.PdfReader(pdfFileObj)

In [21]:
# Create the dictionary to extract text from each image
text_per_page = {}
# Create a boolean variable for image detection
image_flag = False

# We extract the pages from the PDF
for pagenum, page in enumerate(extract_pages(pdf_path)):

    # Initialize the variables needed for the text extraction from the page
    pageObj = pdfReaded.pages[pagenum]
    page_text = []
    line_format = []
    text_from_images = []
    text_from_tables = []
    page_content = []
    # Initialize the number of the examined tables
    table_in_page= -1
    # Open the pdf file
    pdf = pdfplumber.open(pdf_path)
    # Find the examined page
    page_tables = pdf.pages[pagenum]
    # Find the number of tables in the page
    tables = page_tables.find_tables()
    if len(tables)!=0:
        table_in_page = 0

    # Extracting the tables of the page
    for table_num in range(len(tables)):
        # Extract the information of the table
        table = extract_table(pdf_path, pagenum, table_num)
        # Convert the table information in structured string format
        table_string = table_converter(table)
        # Append the table string into a list
        text_from_tables.append(table_string)

    # Find all the elements
    page_elements = [(element.y1, element) for element in page._objs]
    # Sort all the element as they appear in the page 
    page_elements.sort(key=lambda a: a[0], reverse=True)


    # Find the elements that composed a page
    for i,component in enumerate(page_elements):
        # Extract the element of the page layout
        element = component[1]

        # Check the elements for tables
        if table_in_page == -1:
            pass
        else:
            if is_element_inside_any_table(element, page ,tables):
                table_found = find_table_for_element(element,page ,tables)
                if table_found == table_in_page and table_found != None:    
                    page_content.append(text_from_tables[table_in_page])
                    page_text.append('table')
                    line_format.append('table')
                    table_in_page+=1
                # Pass this iteration because the content of this element was extracted from the tables
                continue

        if not is_element_inside_any_table(element,page,tables):

            # Check if the element is text element
            if isinstance(element, LTTextContainer):
                # Use the function to extract the text and format for each text element
                (line_text, format_per_line) = text_extraction(element)
                # Append the text of each line to the page text
                page_text.append(line_text)
                # Append the format for each line containing text
                line_format.append(format_per_line)
                page_content.append(line_text)


            # Check the elements for images
            if isinstance(element, LTFigure):
                # Crop the image from PDF
                crop_image(element, pageObj, pagenum)
                # Convert the croped pdf to image
                convert_to_images('cropped_image_'+ str(pagenum)+'.pdf')
                # Extract the text from image
                image_text = image_to_text('Images/PDF_image' + str(pagenum)+ '.png')
                text_from_images.append(image_text)
                page_content.append(image_text)
                # Add a placeholder in the text and format lists
                page_text.append('image')
                line_format.append('image')
                # Update the flag for image detection
                image_flag = True


    # Create the key of the dictionary
    dctkey = 'Page_'+str(pagenum)
    # Add the list of list as value of the page key
    text_per_page[dctkey]= {'page_text' : page_text, 'line_format' : line_format, 'text_from_images' : text_from_images, 'text_from_tables' : text_from_tables, 'page_content': page_content}

In [10]:
text_per_page['Page_3']

{'page_text': ['2 Synchronizing jumps: A combinatorial problem\n',
  'Trading is non-synchronous at high frequencies; This leads to so-called “stale” prices at irregu-\nlarly spaced times, diﬀering across assets. Addressing asynchronicity through the coordinated\ncollection of multivariate data has been an active area of research in ﬁnancial econometrics in\nrecent years, see e.g., Barndorﬀ-Nielsen et al. (2011) or Boudt et al. (2017) and the references\ntherein, and the topic has been integral to covariance estimations since at least Epps (1979).\nNonetheless, existing sampling schemes like refresh-time sampling (Barndorﬀ-Nielsen et al.,\n2011), are not designed to handle price jumps, because stale prices are not the only cause of\nasynchronous jumps. Sometimes asset prices may be “sluggish”; news might be impounded\nin prices with a delay, although the asset is trading. To address this problem, we synchronize\nthe timing of multivariate jumps using what we call “Jump Sampling”. This 

In [163]:

#IMPORTANT CAN USE REGULAR EXPRESSION TO FIND FOOTERS AND CAN ALSO USE FONT SIZE AND TAGS (COMBINE)

import pdfplumber
import re

def extract_footers(pdf_path):
    footers = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract text from the current page
            text = page.extract_text()
            if text:
                # Split text into lines
                lines = text.split('\n')
                # Iterate through lines to find footers
                for line in lines:
                    # Regular expression to match footers starting with '*' or superscript numbers
                    if re.match(r'^[\*0-9]', line):
                        footers.append(line)

    return footers

# Example usage
pdf_path = 'pdfs/paper1.pdf'
footers = extract_footers(pdf_path)
print(footers)

['1', '1 Introduction', '2017), jumps may develop gradually (Barndorff-Nielsen et al., 2009), jumps of less-liquid in-', '2', '2 Synchronizing jumps: A combinatorial problem', '2011), are not designed to handle price jumps, because stale prices are not the only cause of', '2.1 A DGP for sluggish news reactions', '1Asynchronous jumps in asset prices happen for several reasons, not just differences in liquidity. Assets', '2Gradual jumps are when the prices exhibit strong linear trends for periods of a few minutes (Barndorff-', '3', '0 0', '4', '2.2 Collecting asynchronous jumps in a jump-event matrix', '2.2.1 The spread measures sluggishness in high-frequency data', '5', '420.0', '020.0', '610.0', '210.0', '12:41 12:42 12:43 12:44 12:45 12:46 12:47 12:48 12:49', '5410.0', '5310.0', '5210.0', '12:41 12:42 12:43 12:44 12:45 12:46 12:47 12:48 12:49', '210.0', '800.0', '400.0', '000.0', '12:41 12:42 12:43 12:44 12:45 12:46 12:47 12:48 12:49', '6', '3To simplify our notation, we rely on a wei

FIND AUTHOR NAME (IMPORTANT)(SUCCESFUL) //// ALSO USING MACHINE LEARNING


In [264]:
import pdfplumber
import re

def extract_author_name(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        full_text = ""
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text += text + " "  # Append text from each page and separate pages by space

        # Regex to find 'author' keyword and capture the following name
        pattern = r'author[:\s]*([\w\s]+)'  # Adjust this pattern based on specific PDF formatting and examples
        match = re.search(pattern, full_text, re.IGNORECASE)

        if match:
            return match.group(1).strip()  # Return the name following the 'author' keyword
        else:
            return "Author not found"

# Example usage
pdf_path = 'pdfs/paper1.pdf'
author_name = extract_author_name(pdf_path)
print("Author Name:", author_name)

Author Name: s Nabil Bouamara


In [38]:
import spacy

In [39]:
nlp = spacy.load('/Users/dipit.mahajan/anaconda3/lib/python3.11/site-packages/en_core_web_sm/en_core_web_sm-3.7.1')

In [40]:
def extract_author_name(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        full_text = ""
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text += text + " "  # Concatenate text from each page

        # Find the position of the keyword 'author' and capture the following text
        pattern = r'Authors[:\s]*([\w\s]+)' 
        match = re.search(pattern, full_text, re.IGNORECASE)  # Search up to 200 characters after 'author'
        if match:
            # Process the captured text with spaCy
            doc = nlp(match.group(1))
            # Extract the first proper name after the keyword
            for ent in doc.ents:
                if ent.label_ in ['PERSON']:
                    return ent.text.strip()
        return "Author not found"

# Example usage
pdf_path = 'pdfs/paper1.pdf'
author_name = extract_author_name(pdf_path)
print("Author Name:", author_name)

Author Name: Nabil Bouamara


WE WILL ITERATE OVER DICTIONARY MAKE IT ONE + ADD NEW  KEY, VALUE + CHANGE IT TO JSON

In [346]:
d1 = text_per_page

d2 = page_attribute

In [347]:
def merge_nested_dicts(d1, d2):
    """
    Merge two nested dictionaries. This function will update d1 with values from d2.
    In case of overlapping keys, inner dictionaries are merged recursively.
    
    Parameters:
    d1 (dict): First dictionary with nested structure.
    d2 (dict): Second dictionary with nested structure to merge with the first dictionary.
    
    Returns:
    dict: d1 updated with merged values from d2.
    """
    for key in d2:
        if key in d1:
            if isinstance(d1[key], dict) and isinstance(d2[key], dict):
                merge_nested_dicts(d1[key], d2[key])
            else:
                # If they are not both dicts, replace the value in d1 with d2's value
                d1[key] = d2[key]
        else:
            # If the key from d2 is not in d1, add it to d1
            d1[key] = d2[key]
    return d1

final_dict = merge_nested_dicts(d1, d2)

# Printing the result
print(final_dict)


{'Page_0': {'page_text': ['image', 'ECONOMIC RESEARCH\nFEDERAL RESERVE BANK OF ST. LOUIS\n', 'WORKING PAPER SERIES\n', 'Sluggish news reactions: A combinatorial approach for\nsynchronizing stock jumps\n', 'table', 'Federal Reserve Bank of St. Louis, Research Division, P.O. Box 442, St. Louis, MO 63166\n', 'The views expressed in this paper are those of the author(s) and do not necessarily reflect the views of the Federal Reserve\n', 'System, the Board of Governors, or the regional Federal Reserve Banks. Federal Reserve Bank of St. Louis Working Papers\nare preliminary materials circulated to stimulate discussion and critical comment.\n'], 'line_format': ['image', ['QIBAAA+NimbusSans-Regular', 12.030209648640039, 'QDBAAA+TimesNewRomanPSMT', 22.55664309119993], [12.030209648640039, 'QIBAAA+NimbusSans-Regular'], [16.541538266880025, 'QIBAAA+NimbusSans-Regular'], 'table', [12.030209648639996, 'QIBAAA+NimbusSans-Regular'], [9.02265723648, 'QIBAAA+NimbusSans-Regular'], [9.02265723648, 9.0226

In [353]:
final_dict['Page_6']['text_from_images']

['Continuous part of the log price log Price\n\nDiscontinuous part of the log price\n\n0.012 0.016 0.020 0.024\n\n0.0135 0.0145\n\n0.0125\n\n0.004 0.008 0.012\n\n0.000\n\nEfficient\n— - Observed\n\nee ee ad\n\neer\nl\nl\natm\neel ee ewe!\nT T T T T T 1 1 ;\n12:41 12:42 12:43 12:44 12:45 12:46 12:47 12:48 12:49\nEfficient 2 di\n— - Observed ay Pa } a) hit)\n‘ ‘\nf ‘ag “th, 1 w\nIs il Ne\ntd\n‘\n?\ntad\ni\nT T T T T T 1 1 ;\n12:41 12:42 12:43 12:44 12:45 12:46 12:47 12:48 12:49\nEfficient ,"-..\n— - Observed pees\n\'\ni]\n\'\nl\n\'\n\'\nT T T T T T 1 1 ;\n12:41 12:42 12:43 12:44 12:45 12:46 12:47 12:48 12:49\n\n']

In [354]:
final_dict['Paper'] = 'Paper_1'

final_dict['Author'] = ['Nabil Bouamara','Kris Boudt','Sébastien Laurent','Christopher J. Neely']

final_dict['Title'] = ['Sluggish news reactions: A combinatorial approach for synchronizing stock jumps']

final_dict['Abstract'] =['Stock prices often react sluggishly to news, producing gradual jumps and jump delays. Econometricians typically treat these sluggish reactions as microstructure effects and settle for a coarse sampling grid to guard against them. Synchronizing mistimed stock returns on a fine sampling grid allows us to better approximate the true common jumps in related stock prices.']

In [355]:
len(final_dict)

34

In [356]:
import json

In [357]:
# Specify the file path
file_path = '/Users/dipit.mahajan/Micro-Hive /paper1.json'

# Open the file in write mode ('w') and write the JSON data
with open(file_path, 'w') as file:
    json.dump(final_dict, file, indent=4)


Connecting to S3 AND  dumping file and images inside the S3 bucket ( DATA LAKE)  --- FOR BACKUP AND LARGE FILE STORAGE

In [29]:
import boto3

AWS_S3_BUCKET_NAME = 'microhive'
AWS_REGION = 'eu-west-2'
AWS_ACCESS_KEY = 'AKIAW5GFZFYMCOZ6FO4W'
AWS_SECRET_KEY = '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'

LOCAL_FILE = 'paper1.json'
NAME_FOR_S3 = 'paper1.json'

def main():
    print('in main method')

    s3_client = boto3.client(
        service_name='s3',
        region_name=AWS_REGION,
        aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_KEY
    )

    response = s3_client.upload_file(LOCAL_FILE, AWS_S3_BUCKET_NAME, NAME_FOR_S3)

    print(f'upload_log_to_aws response: {response}')

if __name__ == '__main__':
    main()

in main method
upload_log_to_aws response: None


In [24]:
import os

AWS_S3_BUCKET_NAME = 'microhive'
AWS_REGION = 'eu-west-2'
AWS_ACCESS_KEY = 'AKIAW5GFZFYMCOZ6FO4W'
AWS_SECRET_KEY = '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'

s3_client = boto3.client(
        service_name='s3',
        region_name=AWS_REGION,
        aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_KEY
    )

def upload_folder_to_s3(local_folder, bucket_name):

  
    for root, dirs, files in os.walk(local_folder):
        for file in files:
            local_file_path = os.path.join(root, file)
            s3_file_path = os.path.relpath(local_file_path, local_folder)  # relative path for S3
            
            # Upload each file to S3 bucket
            s3_client.upload_file(local_file_path, bucket_name, s3_file_path)

# Example usage
upload_folder_to_s3('/Users/dipit.mahajan/Micro-Hive /Images', 'microhive')