# OCR - PDF to Markdown

In [1]:
# import subprocess
# import glob
# import os

# # Define the input and output directories
# input_folder = "/Users/scottwillis/Documents/Dev/hiring-challenge-ml/docs"
# output_folder = "/Users/scottwillis/Documents/Dev/hiring-challenge-ml/docs_markdown"

# # Replace spaces with underscores in PDF filenames
# pdf_files = glob.glob(f"{input_folder}/*.pdf")
# for file_path in pdf_files:
#     directory, file_name = os.path.split(file_path)
#     new_file_name = file_name.replace(' ', '_')
#     new_file_path = os.path.join(directory, new_file_name)
#     os.rename(file_path, new_file_path)

# # Run subprocess of nougat
# pdf_files = glob.glob(f"{input_folder}/*.pdf")
# for pdf_file in pdf_files:
#     command = (f"nougat {pdf_file} --out {output_folder} --markdown")
#     subprocess.run(command, shell=True, check=True)

# Runtime: 73min (presumably the wall-clock time of the nougat run above -- confirm)

# OCR - PDF to HTML

In [3]:
import subprocess
import glob
import os

# Define the input and output directories
input_folder = "/Users/scottwillis/Documents/Dev/hiring-challenge-ml/docs"
output_folder = "/Users/scottwillis/Documents/Dev/hiring-challenge-ml/docs_html"


def list_pdfs(folder):
    """Return the sorted paths of the ``*.pdf`` files directly inside ``folder``.

    The folder name is glob-escaped so that characters such as ``[`` in the
    path are matched literally. A missing folder yields an empty list.
    """
    return sorted(glob.glob(os.path.join(glob.escape(folder), "*.pdf")))


def rename_pdfs_without_spaces(folder):
    """Replace spaces with underscores in the names of the PDFs in ``folder``.

    Files whose name contains no space are left untouched. A file is also left
    untouched (and reported) when the underscore name is already taken, because
    ``os.rename`` may silently replace an existing target.

    Returns:
        list[tuple[str, str]]: ``(old_path, new_path)`` for every renamed file.
    """
    renamed = []
    for file_path in list_pdfs(folder):
        directory, file_name = os.path.split(file_path)
        if ' ' not in file_name:
            continue
        new_file_path = os.path.join(directory, file_name.replace(' ', '_'))
        if os.path.exists(new_file_path):
            print(f"Not renaming {file_path!r}: {new_file_path!r} already exists")
            continue
        os.rename(file_path, new_file_path)
        renamed.append((file_path, new_file_path))
    return renamed


def convert_pdfs_to_html(pdf_folder, html_folder):
    """Run ``pdftohtml -skipinvisible`` on every PDF in ``pdf_folder``.

    Each PDF is converted to ``<html_folder>/<pdf basename without extension>``.
    ``html_folder`` is created when there is at least one PDF to convert.

    Raises:
        subprocess.CalledProcessError: if ``pdftohtml`` exits with a non-zero
            status for any file (``check=True``).

    Returns:
        list[str]: the PDF paths that were converted.
    """
    pdf_paths = list_pdfs(pdf_folder)
    if pdf_paths:
        os.makedirs(html_folder, exist_ok=True)
    for pdf_file in pdf_paths:
        # Extract the basename without the extension
        base_name = os.path.splitext(os.path.basename(pdf_file))[0]
        output_file_path = os.path.join(html_folder, base_name)
        # Argument list + no shell: paths containing spaces or shell
        # metacharacters are passed to pdftohtml verbatim.
        subprocess.run(
            ["pdftohtml", "-skipinvisible", pdf_file, output_file_path],
            check=True,
        )
    return pdf_paths


# Replace spaces with underscores in PDF filenames
rename_pdfs_without_spaces(input_folder)

# Run pdftohtml with the output name based on the basename of each PDF file
pdf_files = convert_pdfs_to_html(input_folder, output_folder)



## Build a CSV data structure from the HTML

In [8]:
import os
import pandas as pd
from bs4 import BeautifulSoup

input_folder = "/Users/scottwillis/Documents/Dev/hiring-challenge-ml/docs_html"
output_folder = "/Users/scottwillis/Documents/Dev/hiring-challenge-ml/docs_csv"

# Column order of every per-document CSV written by this cell. Passing it to
# the DataFrame constructor guarantees a header row even when a document
# yields no elements, so the CSV stays readable by pd.read_csv.
ELEMENT_COLUMNS = ["ElementType", "Class", "Style", "Value"]


def read_soup(html_path):
    """Parse the HTML file at ``html_path`` and return its BeautifulSoup tree."""
    # Explicit encoding so parsing does not depend on the platform default.
    # NOTE(review): assumes the converter writes UTF-8 -- confirm.
    with open(html_path, 'r', encoding='utf-8') as html_file:
        return BeautifulSoup(html_file, 'html.parser')


def element_records(soup):
    """Return one dict per element in ``soup``, in document order.

    Each dict holds the tag name, the ``class`` attribute (``[]`` when absent),
    the ``style`` attribute (``''`` when absent) and the stripped text content
    of the element.
    """
    return [
        {
            "ElementType": element.name,
            "Class": element.attrs.get('class', []),
            "Style": element.attrs.get('style', ''),
            "Value": element.get_text(strip=True),
        }
        for element in soup.find_all(True)
    ]


# Listing first means a missing input folder fails before anything is created.
folder_names = sorted(os.listdir(input_folder))
os.makedirs(output_folder, exist_ok=True)

# Loop through each document folder in the input_folder
for folder_name in folder_names:
    folder_path = os.path.join(input_folder, folder_name)
    index_file_path = os.path.join(folder_path, 'index.html')

    # Skip stray files and folders that do not contain an index.html
    if not os.path.isfile(index_file_path):
        continue

    index_soup = read_soup(index_file_path)

    # Collect the elements of every page linked from index.html
    data = []
    for link in index_soup.find_all('a', href=True):
        html_file_path = os.path.join(folder_path, link['href'])
        if not os.path.isfile(html_file_path):
            # e.g. an external URL or an anchor; report it instead of crashing
            print(f"Skipping link {link['href']!r} in {index_file_path!r}: not a local file")
            continue
        data.extend(element_records(read_soup(html_file_path)))

    # Save the records to a CSV file named after the folder
    df = pd.DataFrame(data, columns=ELEMENT_COLUMNS)
    output_csv_path = os.path.join(output_folder, f"{folder_name}.csv")
    df.to_csv(output_csv_path, index=False)

In [38]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import base64
import random

input_folder = "/Users/scottwillis/Documents/Dev/hiring-challenge-ml/docs_csv"
output_folder = "/Users/scottwillis/Documents/Dev/hiring-challenge-ml/docs_df"

# Column order of the combined output table.
OUTPUT_COLUMNS = ['ID', 'Filename', 'Section', 'Subsection', 'Body']


def load_spans(csv_path):
    """Read one per-document CSV and return its spans as ``(class, text)`` pairs.

    Only rows whose ``ElementType`` is ``'span'`` are kept. The first run of
    digits in the ``Class`` cell (e.g. ``"['f3']"`` -> 3) is taken as the font
    id; spans without one (e.g. ``"[]"``) are dropped. Font ids are then
    standardized: the smallest id in the file becomes 1, the next integer 2,
    and everything larger is capped at 3.

    Returns:
        list[tuple[int, str]]: pairs in file order; empty if no span qualifies.
    """
    # dtype=str / keep_default_na=False keep every cell as text, so an empty
    # Value stays '' instead of becoming a float NaN that cannot be
    # concatenated to a string.
    elements = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
    spans = elements.loc[elements['ElementType'] == 'span', ['Class', 'Value']].copy()

    font_ids = pd.to_numeric(
        spans['Class'].str.extract(r"(\d+)", expand=False), errors='coerce'
    )
    has_font_id = font_ids.notna()
    spans = spans.loc[has_font_id]
    font_ids = font_ids.loc[has_font_id].astype(int)
    if spans.empty:
        return []

    classes = (font_ids - font_ids.min() + 1).clip(upper=3)
    return list(zip(classes.tolist(), spans['Value'].tolist()))


def build_records(spans, filename):
    """Fold ``(class, text)`` pairs into Section / Subsection / Body records.

    Class 1 text is appended to the section buffer, class 2 to the subsection
    buffer and class 3 to the body buffer. A buffer is copied into the pending
    record when the class differs from the previous span's class, or when a
    class-3 span leaves the body buffer ending in ``'.'`` or ``';'``. As soon
    as the pending record holds all three fields it is emitted with a fresh
    random ID, and the body buffer and pending Body are cleared.

    NOTE(review): the section and subsection buffers are never cleared, so
    later headings are appended to earlier ones. This is kept as-is -- confirm
    whether it is intended.

    Returns:
        list[dict]: one dict per emitted record, keyed by ``OUTPUT_COLUMNS``.
    """
    records = []
    section_value = ""
    subsection_value = ""
    body_value = ""
    pending = {}
    prev_class_type = 0

    for class_type, value in spans:
        # Concatenate the value onto the buffer for its class
        if class_type == 1:
            section_value += value
        elif class_type == 2:
            subsection_value += value
        elif class_type == 3:
            body_value += value

        class_changed = class_type != prev_class_type
        body_finished = class_type == 3 and body_value.endswith((';', '.'))
        prev_class_type = class_type
        if not (class_changed or body_finished):
            continue

        if class_type == 1:
            pending['Section'] = section_value
        elif class_type == 2:
            pending['Subsection'] = subsection_value
        elif class_type == 3:
            pending['Body'] = body_value

        if len(pending) == 3:
            records.append({
                # Random base64 id (6 random bytes -> 8 characters)
                'ID': base64.b64encode(os.urandom(6)).decode('utf-8'),
                'Filename': filename,
                'Section': pending['Section'],
                'Subsection': pending['Subsection'],
                'Body': pending['Body'],
            })
            body_value = ""
            pending.pop('Body', None)

    return records


# Notebook cells execute with __name__ == "__main__", so this runs as usual in
# Jupyter while the helpers above stay importable without touching the disk.
if __name__ == "__main__":
    all_records = []
    for file in sorted(os.listdir(input_folder)):
        # Ignore anything that is not a per-document CSV (e.g. .DS_Store)
        if not file.endswith('.csv'):
            continue
        spans = load_spans(os.path.join(input_folder, file))
        all_records.extend(build_records(spans, file))

    # Build the table once instead of growing it with pd.concat per record
    final_df = pd.DataFrame(all_records, columns=OUTPUT_COLUMNS)

    # Save the final DataFrame to a CSV file
    os.makedirs(output_folder, exist_ok=True)
    output_csv_path = os.path.join(output_folder, "data.csv")
    final_df.to_csv(output_csv_path, index=False)