# Structuring PDF content with Polars

In [1]:
import os
from collections import defaultdict

import fitz
import polars as pl

In [2]:
# extract pdf content
def get_pdf_content(pdf_path):
    r"""Extract every non-blank text line of a PDF into column-oriented lists.

    The text of each page is split into blocks on the "\n \n" separator and
    each block is split into lines on "\n". Lines that are empty or contain
    only whitespace are skipped; kept lines are stored unmodified (trailing
    spaces included).

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        A ``defaultdict(list)`` holding four lists of equal length:
          * "sentence": the text of the line,
          * "line_num": 1-based position of the line inside its block
            (skipped blank lines still consume a number),
          * "block":    1-based position of the block inside its page
            (numbering restarts on every page),
          * "page":     1-based page number.
        All four keys are present even when no text was extracted.
    """
    columns = ("sentence", "line_num", "block", "page")
    # Seed every column up front so a PDF without text still yields the full schema.
    content = defaultdict(list, {name: [] for name in columns})

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as pdf:
        for page in pdf:
            blocks = page.get_text().split("\n \n")

            for block_num, block in enumerate(blocks, start=1):
                for line_num, sentence in enumerate(block.split("\n"), start=1):
                    if not sentence.strip():
                        continue
                    content["sentence"].append(sentence)
                    content["line_num"].append(line_num)
                    content["block"].append(block_num)
                    content["page"].append(page.number + 1)  # page.number is 0-based

    return content


# The sample PDF is expected at <current working directory>/documents/document_1.pdf
documents_dir = os.path.join(os.getcwd(), "documents")
pdf_path = os.path.join(documents_dir, "document_1.pdf")
content = get_pdf_content(pdf_path)

In [4]:
# Inspect the raw extraction result: pairs of (column name, list of values).
content.items()

dict_items([('sentence', ['DISCLOSURE APPENDIX AT THE BACK OF THIS REPORT CONTAINS IMPORTANT DISCLOSURES, ANALYST ', 'CERTIFICATIONS, AND THE STATUS OF NON-US ANALYSTS.  US Disclosure: Credit Suisse does and seeks to do ', 'business with companies covered in its research reports. As a result, investors should be aware that the Firm may have a ', 'conflict of interest that could affect the objectivity of this report. Investors should consider this report as only a single factor in ', 'making their investment decision.  ', 'CREDIT SUISSE SECURITIES RESEARCH & ANALYTICS ', 'BEYOND INFORMATION® ', 'Client-Driven Solutions, Insights, and Access ', '13 August 2014 ', 'Americas/United States ', 'Equity Research ', 'Auto Parts & Equipment / Automobile Manufacturers  ', 'Tesla Motors Inc. (TSLA) ', ' INITIATION  ', ' Not A Fair Fight ', 'We are initiating coverage of Tesla Motors (TSLA) with an Outperform rating ', 'and a $325 target price.  For a complete copy of our initiation report, please 

In [7]:
## convert dict to polars dataframe
# Every key of `content` becomes a column; its list supplies the column values.
df = pl.DataFrame(content)
df.head()

sentence,line_num,block,page
str,i64,i64,i64
"""DISCLOSURE APPENDIX AT THE BAC…",2,1,1
"""CERTIFICATIONS, AND THE STATUS…",3,1,1
"""business with companies covere…",4,1,1
"""conflict of interest that coul…",5,1,1
"""making their investment decisi…",6,1,1


## Cleaning text with block removal

In [8]:
## remove every block containing a sentence that mentions "disclosure", "disclaimer" or "appendix"
noise = ["disclosure", "disclaimer", "appendix"]

def remove_noise(df, noise):
    """Drop every block that contains at least one noisy sentence.

    Args:
        df: Polars DataFrame with at least the columns "sentence", "page"
            and "block".
        noise: Iterable of regex patterns. They are matched against the
            lower-cased sentence, so they should be written in lower case.
            An empty list removes nothing.

    Returns:
        The rows of ``df`` whose (page, block) pair has no matching sentence.
        Two helper columns are added: "lower_sentence" (lower-cased sentence)
        and "has_disclaimer" (whether that sentence matched a pattern).
    """
    df = df.with_columns(pl.col("sentence").str.to_lowercase().alias("lower_sentence"))

    if noise:
        # No padding around "|": a space there would become part of the
        # alternative and stop it from matching at the start of a sentence.
        has_noise = pl.col("lower_sentence").str.contains("|".join(noise))
    else:
        # An empty pattern would match every sentence and wipe the whole frame.
        has_noise = pl.lit(False)
    df = df.with_columns(has_noise.alias("has_disclaimer"))

    # Block numbers restart on every page, so a block is identified by
    # (page, block); flag all rows of a block if any of its sentences matched.
    in_noisy_block = pl.col("has_disclaimer").any().over(["page", "block"])
    return df.filter(~in_noisy_block)

remove_noise(df, noise)

sentence,line_num,block,page,lower_sentence,has_disclaimer
str,i64,i64,i64,str,bool
"""13 August 2014 """,1,3,1,"""13 august 2014 """,false
"""Americas/United States """,2,3,1,"""americas/united states """,false
"""Equity Research """,3,3,1,"""equity research """,false
"""Auto Parts & Equipment / Autom…",4,3,1,"""auto parts & equipment / autom…",false
"""Tesla Motors Inc. (TSLA) """,2,4,1,"""tesla motors inc. (tsla) """,false
…,…,…,…,…,…
"""Ford Motor Company (F.N, $17.2…",4,4,2,"""ford motor company (f.n, $17.2…",false
"""General Motors Corp. (GM.N, $3…",5,4,2,"""general motors corp. (gm.n, $3…",false
"""Tesla Motors Inc. (TSLA.OQ, $2…",6,4,2,"""tesla motors inc. (tsla.oq, $2…",false
"""Toyota Motor (7203.T, ¥5,990) """,7,4,2,"""toyota motor (7203.t, ¥5,990) """,false


In [9]:
## merge the sentences: collect the remaining sentences and block numbers of each page into lists
# maintain_order=True keeps the pages in document order; without it the
# order of the groups returned by group_by is not guaranteed.
filtered_df = (
    remove_noise(df, noise)
    .group_by("page", maintain_order=True)
    .agg(pl.col("sentence"), pl.col("block"))
)
filtered_df.head()

page,sentence,block
i64,list[str],list[i64]
1,"[""13 August 2014 "", ""Americas/United States "", … ""shreyas.patil@credit-suisse.com ""]","[3, 3, … 13]"
2,"[""Companies Mentioned (Price as of 12-Aug-2014) "", ""BMW (BMWG.DE, €87.2) "", … ""Volkswagen (VOWG_p.DE, €167.7)""]","[4, 4, … 4]"


In [10]:
# combine the sentences of each row with \n (filtered_df holds one row per page)
# list.join works row by row, so no window over "block" (a list column here) is needed.
filtered_df = filtered_df.with_columns(
    pl.col("sentence").list.join("\n").alias("sentences_per_block")
)
filtered_df.head()

page,sentence,block,sentences_per_block
i64,list[str],list[i64],str
1,"[""13 August 2014 "", ""Americas/United States "", … ""shreyas.patil@credit-suisse.com ""]","[3, 3, … 13]","""13 August 2014 Americas/Unite…"
2,"[""Companies Mentioned (Price as of 12-Aug-2014) "", ""BMW (BMWG.DE, €87.2) "", … ""Volkswagen (VOWG_p.DE, €167.7)""]","[4, 4, … 4]","""Companies Mentioned (Price as …"


In [13]:
# Concatenate the text of every row, terminating each entry with "\n \n".
# NOTE: this rebinds `content` (until now the dict returned by get_pdf_content) to a str.
content = "".join(text + "\n \n" for text in filtered_df['sentences_per_block'])
print(f"Content:\n-------\n{content}\n-----------")

Content:
-------
13 August 2014 
Americas/United States 
Equity Research 
Auto Parts & Equipment / Automobile Manufacturers  
Tesla Motors Inc. (TSLA) 
 INITIATION  
 Not A Fair Fight 
We are initiating coverage of Tesla Motors (TSLA) with an Outperform rating 
and a $325 target price.  For a complete copy of our initiation report, please 
click here. 
To download a copy of our broader Autos & Auto Parts sector initiation report, 
please click here 
■ EV’s are inherently better than Internal Combustion (ICE) vehicles. We 
believe that there are many inherent advantages to Electric Vehicles, and 
the success of the Model S (Tesla’s first fully-developed vehicle) goes a long 
way to proving that. The company has many competitive advantages in this 
technology that will be difficult for the traditional automakers to close, 
particularly as the industry’s focus must remain on hitting very challenging 
fuel economy regulations. If Tesla can get to cost-parity with ICE and still 
offer  $1,4