<a href="https://colab.research.google.com/github/giustinod/Fine-Tuning-Llama-2LLM/blob/main/Unstructured_PDF_to_HF_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference: https://groff.dev/blog/extracting-insights-from-articles-using-groq-and-llama-31


In [None]:
!pip install -U pymupdf4llm
!pip install -U groq
!pip install -U unstructured
!pip install -U langchain-community
!pip install -U nltk
!pip install --upgrade pyarrow
!pip install --upgrade datasets

from datasets import Dataset
from datasets import load_dataset

import nltk
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

import os
import re
import json
import pathlib
import pymupdf4llm
from google.colab import userdata
from groq import Groq
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Groq API client; the API key is read from the Colab secrets store.
client = Groq(
    api_key = userdata.get('GROQ_API_KEY'),
)

# Hugging Face access token, read from the Colab secrets store.
# NOTE: there must be no trailing comma on this line -- a trailing comma
# turns the value into a one-element tuple instead of the token string,
# and the value is later passed as `token=` when pushing to the Hub.
hf_token = userdata.get('HF_TOKEN')

# Arrow schema with the three columns of an extracted record:
# an integer id plus the 'premises' and 'conclusion' strings.
qa_schema = pa.schema([
  ('id', pa.int64()),
  ('premises', pa.string()),
  ('conclusion', pa.string())
])

# Matches the first brace-delimited span of a model reply. The match is
# non-greedy, so it ends at the first closing brace.
_JSON_OBJECT_RE = re.compile(r'\{(.*?)\}', re.DOTALL)


def _parseQaRecord(reply):
  """Return (premises, conclusion) extracted from a model reply, or None.

  None is returned when the reply contains no brace-delimited span, when
  that span is not valid JSON, or when one of the two keys is missing.
  """
  match = _JSON_OBJECT_RE.search(reply or '')
  if match is None:
    return None
  # Drop ASCII control characters (including raw newlines) before parsing,
  # since they are not allowed unescaped inside JSON strings.
  json_string = re.sub(r'[\x00-\x1f]', '', match.group(0))
  try:
    output = json.loads(json_string)
    return output['premises'], output['conclusion']
  except (json.JSONDecodeError, KeyError):
    return None


def getDataframeFromMarkdown(inputPath, model="llama-3.1-70b-versatile"):
  """Build a pandas DataFrame of premises/conclusion pairs from a markdown file.

  The file is loaded with UnstructuredMarkdownLoader, split into chunks of
  at most 3000 characters (200 overlap), and every chunk is sent to the Groq
  chat-completions API together with a system prompt asking for a
  {"premises": ..., "conclusion": ...} object.

  Args:
    inputPath: path of the markdown file to process.
    model: name of the Groq chat model to query.

  Returns:
    A DataFrame with columns 'id', 'premises' and 'conclusion'. 'id' is the
    1-based position of the chunk; chunks whose reply cannot be parsed are
    skipped, so ids may have gaps. The frame is empty (columns still
    present) when no chunk yields a record.
  """
  loader = UnstructuredMarkdownLoader(inputPath)
  documents = loader.load()
  sentence_text_splitter = RecursiveCharacterTextSplitter(
      chunk_size = 3000,
      chunk_overlap = 200,
      separators=["\n\n", "\n", " "]
  )
  split_sentences = sentence_text_splitter.split_documents(documents)

  purpose = """
  # Your Purpose
  You will try to summarize the document content.

  # Output Format - text
  You will respond with descriptions of objects, variables, states, rules, transitions, etc...

  {
    "premises": "<The definition of an object or a variable or a rule or a transition>",
    "conclusion": "<The description eventually including examples>"
  }
  """

  # Rows are collected in a list and turned into a DataFrame once at the
  # end; concatenating DataFrames inside the loop copies all previous rows
  # on every iteration.
  rows = []
  for qid, sentence in enumerate(split_sentences, start=1):
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": purpose
            },
            {
                "role": "user",
                "content": sentence.page_content
            }
        ],
        temperature = 0,
        max_tokens = 8000,
        top_p = 1,
        stream = False,
        stop = None,
    )
    record = _parseQaRecord(completion.choices[0].message.content)
    if record is None:
      # A single unusable reply must not discard the records already
      # obtained for this document, so the chunk is skipped instead.
      print('Skipping chunk ' + str(qid) + ': no usable JSON in the reply')
      continue
    premises, conclusion = record
    rows.append({'id': qid, 'premises': premises, 'conclusion': conclusion})

  return pd.DataFrame(rows, columns=['id', 'premises', 'conclusion'])

def pushDatasetData(df, filename):
  """Publish a pandas DataFrame on the Hugging Face Hub.

  The frame is converted with Dataset.from_pandas and pushed to the
  repository named "giustinod/" + filename, authenticating with the
  module-level hf_token.
  """
  hub_dataset = Dataset.from_pandas(df)
  # A train/test split (test_size=0.3) is currently disabled.
  repo_id = "giustinod/" + filename
  hub_dataset.push_to_hub(repo_id, token = hf_token)

# Folder on the mounted Google Drive that is searched for PDFs; the
# generated markdown files are written into this same folder.
drive_path = "/content/drive/MyDrive/Colab Notebooks/data/"

# For every PDF found recursively under drive_path: convert it to markdown,
# save the markdown, extract the premises/conclusion records and push them
# to the Hugging Face Hub as a dataset named after the PDF.
for path in pathlib.Path(drive_path).rglob('*.pdf'):
    # `path` is a pathlib.Path object, so it is converted to a string
    # before being handed to pymupdf4llm.
    md_text = pymupdf4llm.to_markdown(str(path))
    # Use the name without its final extension. Splitting at the first dot
    # would map "spec.v1.pdf" and "spec.v2.pdf" to the same name, making
    # them overwrite each other's markdown file and dataset.
    filename = path.stem
    print('Processing ' + filename)
    output_path = drive_path + filename + '.md'
    pathlib.Path(output_path).write_bytes(md_text.encode())
    print('Markdown ok')
    df = getDataframeFromMarkdown(output_path)
    print('DataFrame ok')
    pushDatasetData(df, filename)
    print('Pushed on HF')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Processing /content/drive/MyDrive/Colab Notebooks/data/RouteLocking.pdf...
Processing RouteLocking
Markdown ok
