# Parsing documents into HTML

In [2]:
from langchain_community.document_loaders import PDFMinerPDFasHTMLLoader
import os 

In [19]:
# Convert the PDF stored under ./documents into HTML markup.
loader = PDFMinerPDFasHTMLLoader(os.path.join(os.getcwd(), "documents", "document_2.pdf"))
data = loader.load()[0]   # entire PDF is loaded as a single Document

# Save the markup to an HTML file in the working directory. The encoding is
# given explicitly because the extracted text contains non-ASCII characters
# (e.g. ’ and ➢) that the platform default encoding cannot always represent.
file_name = 'content.html'
file_path = os.path.join(os.getcwd(), file_name)
with open(file_path, "w", encoding="utf-8") as f:
    f.write(data.page_content)

# Preview the first 1000 characters of the generated markup.
print(data.page_content[:1000])

<html><head>
<meta http-equiv="Content-Type" content="text/html">
</head><body>
<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:46px; top:173px; width:119px; height:37px;"><span style="font-family: Helvetica-Bold; font-size:21px">Tesla Inc.  
<br></span><span style="font-family: Helvetica-Bold; font-size:14px">(NASDAQ: TSLA) 
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:46px; top:227px; width:98px; height:15px;"><span style="font-family: Helvetica-Bold; font-size:15px">Fully Valued 
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:471px; top:108px; width:70px; height:12px;"><span style="font-family: Helvetica-Bold; font-size:12px">May 2, 2023 
<br></span></div><div st

## Segmenting Tags and Contents

In [20]:
import re
def extract_tags(data: str) -> set:
    """Return the set of distinct tag names that open an element in ``data``.

    Args:
        data: Markup text to scan for tags.

    Returns:
        A set with every name that directly follows a ``<`` in ``data``.
        Closing tags (``</div>``) and comments (``<!-- -->``) are not
        reported, because their ``<`` is not immediately followed by a word
        character.
    """
    # A single capturing group guarantees that ``findall`` yields the tag name
    # for every match. The previous pattern had an extra, uncaptured
    # alternative for one-character tags (``<p>``, ``<b>``), which made
    # ``findall`` report those tags as empty strings instead of their names.
    pattern = re.compile(r'<(\w+)\b')
    return set(pattern.findall(data))

# Announce, then list, the distinct tag names found in the converted document.
print("INFO avaliable tags from document:")
tags = extract_tags(data.page_content)
print(f"Tags: {tags}")

INFO avaliable tags from document:
Tags: {'html', 'meta', 'div', 'a', 'br', 'body', 'head', 'span'}


## HTML parsing with BeautifulSoup

In [21]:
from bs4 import BeautifulSoup

def extract_content_for_tags(page_content, tags):
    """Collect the text of every element in ``page_content`` for each tag name.

    Args:
        page_content: HTML markup; parsed with BeautifulSoup's 'html.parser'.
        tags: Iterable of tag names to look up.

    Returns:
        dict: Maps each tag name to a list holding the ``get_text()`` result
        of every matching element, in the order ``find_all`` returned them.
    """
    parsed = BeautifulSoup(page_content, 'html.parser')
    return {
        name: [element.get_text() for element in parsed.find_all(name)]
        for name in tags
    }

# Gather the text found under every discovered tag, then print each tag name
# followed by its list of text fragments on the next line.
content_for_tags = extract_content_for_tags(data.page_content, tags)
for k, v in content_for_tags.items():
    print(k, v, sep="\n")

html
['\n\n\n\nPage 1\nTesla Inc.  \n(NASDAQ: TSLA) \nFully Valued \nMay 2, 2023 \nSid Rajeev, B.Tech, CFA, MBA \nHead of Research  \nAlexis Cabel, BAEcon \nEquity Analyst \nSector/Industry: Auto Manufacturers                        Click here for more research on the company and to share your views  \nHighlights\n➢\nIn Q1-2023, TSLA’s revenue rose 24% YoY, beating our estimate by 6%, due to stronger than expected \nsales. However, EPS fell 23% YoY (2% below our forecast) due to margin erosion (gross margins were \ndown 10 ppt YoY) from higher raw material/manufacturing/logistics costs, and as Tesla has been slashing \nvehicle prices (down 9% YoY).  \n➢ TSLA is down 53% since our initiating report in April 2022, wherein we had predicted 39% downside \nrisk.\n➢ According to the  International Energy Agency (IEA), 10.3M Electric Vehicles (EV) were sold in 2022 (up \n49% YoY, accounting for 14% of global vehicle sales). The IEA is forecasting 35% YoY growth in 2023, \naccounting for 18% o

The content is distributed across various tags, so it is difficult to identify the most important segment.
Some domain knowledge is needed to determine which portion is relevant to the problem statement.

One brute-force solution would be to feed the data into an LLM and let it figure out the important parts itself, but that is not efficient because of token limitations.