In [None]:
import pandas as pd
import tabula
import fitz
from markitdown import MarkItDown
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
from openai import OpenAI
from dotenv import dotenv_values
import os
import re
# Read the OpenAI API key from the local .env file rather than hard-coding it.
env_values = dotenv_values(".env")
openai_key = env_values["OPENAI_KEY"]

In [2]:
# Split the pages of interest out of the source PDF into one single-page PDF
# per page under pages/, so each page can be converted on its own afterwards.
page_range = range(35, 50)

# save() does not create missing directories, so make sure the target exists.
os.makedirs("pages", exist_ok=True)

# Context managers close both documents even if a page fails to copy or save.
with fitz.open('pura073552.ww.pdf') as doc:
	for page_num in page_range:
		# Fail loudly on an index outside the document instead of writing a
		# wrong or empty page file.
		if not 0 <= page_num < doc.page_count:
			raise ValueError(f"page {page_num} not in document ({doc.page_count} pages)")

		page_path = f'pages/temp_{page_num}.pdf'
		with fitz.open() as new_page_pdf:
			new_page_pdf.insert_pdf(doc, from_page=page_num, to_page=page_num)
			new_page_pdf.save(page_path)

# Convert every single-page PDF under pages/ to text with MarkItDown and
# concatenate the results into one string for the formatting prompt.
client = OpenAI(api_key=openai_key)
md = MarkItDown(llm_client=client, llm_model="gpt-4o")

# Keep only the temp_<n>.pdf files and order them by their numeric page index.
# A plain lexicographic sort would put temp_100.pdf before temp_35.pdf, and
# stray directory entries would otherwise be handed to the converter.
numbered_pages = []
for entry_name in os.listdir("pages"):
	page_match = re.fullmatch(r"temp_(\d+)\.pdf", entry_name)
	if page_match:
		numbered_pages.append((int(page_match.group(1)), os.path.join("pages", entry_name)))
files = [page_path for page_no, page_path in sorted(numbered_pages)]

# Each page's text is followed by a blank line.
tables_text = "".join([md.convert(file).text_content + "\n\n" for file in files])

# Drop the appendix/table heading, normalise the typographic apostrophe, and
# double every double quote so the text is already escaped for CSV fields.
tables_text = tables_text.replace("Appendix 11. Characteristics of included studies\n\nTable A. Characteristics of included studies (n=126 studies; 338 datapoints)\n\n", "")
tables_text = tables_text.replace("’", "'")
tables_text = tables_text.replace('"', '""')

In [None]:
# Few-shot example of the target CSV, interpolated into the prompt below:
# a header line plus four example rows, ";"-delimited, 14 fields per row.
# The last row previously omitted the Exposure value, which shifted
# Exposure Measure / Outcome Measure one column to the left.
# NOTE(review): the continuation rows leave Participants empty, while
# instruction 4 of the prompt asks to replicate reused fields — confirm which
# behaviour is intended.
table_format = """
Author and Year;Study Design;Study Period;Country;Equity;Participants;Mean Age [Range];Risk of Bias;Number of dp;MA?;Exposure;Exposure Measure;Outcome Measure;N
Anastario 2020;Cross-sectional;NR;USA;High income country;"Youth attending 5 schools located on or near a tribal reservation in Montana";"15.7 [14-18]";Mod;2;Yes;"Freq. of SM use";"Freq. of using Twitter to talk or learn about sex or any topic related to sex";"No use of a condom at last sexual encounter";146
Anastario 2020;Cross-sectional;NR;USA;High income country;;"15.7 [14-18]";Mod;2;Yes;"Freq. of SM use";"Freq. of using Facebook to talk or learn about sex or any topic related to sex";"No use of a condom at last sexual encounter";146
Baker 2016;Cross-sectional;2009;USA;High income country;"Grade 6-12 urban school district students part of a federally funded project on school related initiatives";NR;High;3;Yes;"Freq. of SM use";"Freq. of SNS use";"Soft drug use (smoking, marijuana, alcohol) in the past month";3195
Baker 2016;Cross-sectional;2009;USA;High income country;;NR;High;3;Yes;"Freq. of SM use";"Freq. of SNS use";"Hard drug use (lifetime and past year)";3195
"""

# Ask the model to reshape the extracted table text into ";"-delimited CSV that
# follows `table_format`. The reply is streamed so progress can be shown while
# it is generated.
stream = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            # Behavioural instructions belong in a "system" message; sending
            # them as "assistant" presents them as a previous model reply.
            "role": "system",
            "content": "You are a highly intelligent and detail-oriented assistant, capable of processing and formatting textual data into structured formats like CSV. Your role is to ensure that the output follows the user's exact specifications, preserving data integrity and adhering to formatting guidelines."
        },
        {
            "role": "user",
            "content": f"""
Your task:
You must format the following raw data into a valid CSV using this structure:
{table_format}

Instructions:
1. Convert the provided text into CSV, ensuring all fields follow the structure above.
2. Format all data into a single continuous table without blank lines between rows or skipped rows.
3. Use ";" as the delimiter for fields. Ensure proper escaping of quotes as required for CSV format (e.g., escape internal quotes with double quotes `""`).
4. Replicate the data for any rows that reuse fields from previous rows.

Here is the data to process:
{tables_text}

The output must strictly follow the formatting rules described above. If you encounter inconsistencies, process the data as is, maintaining its structure and adhering to the CSV format rules.
"""
        }
    ],
    stream=True,
)

# Accumulate the streamed reply into `final_result`, echoing each piece as it
# arrives. The string is grown incrementally so a partial reply is still
# available if the stream is interrupted.
final_result = ""
for chunk in stream:
    # The API may emit chunks whose `choices` list is empty; indexing [0] on
    # those would raise IndexError, so skip them.
    if not chunk.choices:
        continue
    piece = chunk.choices[0].delta.content
    if piece is not None:
        final_result += piece
        print(piece, end="")


In [13]:
# Pull the CSV payload out of the model reply and write it to disk.
# The prompt never asks for a Markdown fence, so a reply without a ```csv block
# is taken as-is instead of failing with AttributeError on a None match.
# NOTE: the name `csv` shadows the standard-library module of the same name; it
# is kept because other cells may rely on it.
fence_match = re.search(r"```csv(.*)```", final_result, re.DOTALL)
csv = fence_match.group(1) if fence_match else final_result
csv = csv.strip()

# open() does not create missing directories.
os.makedirs("output_dataset", exist_ok=True)

# Explicit UTF-8 so non-ASCII characters are written the same way on every
# platform instead of depending on the locale's default encoding.
with open("output_dataset/bmj_tables.csv", "w", encoding="utf-8") as f:
	f.write(csv)