<a href="https://colab.research.google.com/github/giustinod/Fine-Tuning-Llama-2LLM/blob/main/Unstructured_PDF_to_HF_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

cfr. https://groff.dev/blog/extracting-insights-from-articles-using-groq-and-llama-31


In [1]:
!pip install -U pymupdf4llm
!pip install -U groq
!pip install -U unstructured
!pip install -U langchain-community
!pip install -U nltk
!pip install --upgrade pyarrow
!pip install --upgrade datasets

from datasets import Dataset
from datasets import load_dataset

import nltk
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

import os
import re
import json
import pathlib
import pymupdf4llm
from google.colab import userdata
from groq import Groq
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

client = Groq(
    api_key = userdata.get('GROQ_API_KEY'),
)

hf_token = userdata.get('HF_TOKEN')

qa_schema = pa.schema([
  ('id', pa.int64()),
  ('premises', pa.string()),
  ('conclusion', pa.string())
])

def getDataframeFromMarkdown(inputPath):
  # Using Groq API to build pandas dataframe from markdown file
  loader = UnstructuredMarkdownLoader(inputPath)
  documents = loader.load()
  sentence_text_splitter = RecursiveCharacterTextSplitter(
      chunk_size = 3000,
      chunk_overlap = 200,
      separators=["\n\n", "\n", " "]
  )
  split_sentences = sentence_text_splitter.split_documents(documents)

  purpose = """
  # Your Purpose
  You will try to summarize the document content.

  # Output Format - text
  You will respond with descriptions of objects, variables, states, rules, transitions, etc...

  {
    "premises": "<The definition of an object or a variable or a rule or a transition>",
    "conclusion": "<The description eventually including examples>"
  }
  """
  qid = 0
  df = pd.DataFrame(columns=['id', 'premises', 'conclusion'])

  for sentence in split_sentences:
      md_text = sentence.page_content
      completion = client.chat.completions.create(
          model="llama-3.1-70b-versatile",
          messages=[
              {
                  "role": "system",
                  "content": purpose
              },
              {
                  "role": "user",
                  "content": md_text
              }
          ],
          # response_format={"type": "json_object"},
          temperature = 0,
          # max_tokens = 8000,
          # top_p = 1,
          # stream = False,
          # stop = None,
      )
      # json_output = json.loads(completion.choices[0].message.content)
      # print(json_output)
      qid = qid + 1
      # Extract the JSON string using regular expression
      match = re.search(r'\{(.*?)\}', completion.choices[0].message.content, re.DOTALL)
      if match:
          json_string = re.sub(r'[\x00-\x1f]', '', match.group(0))
          output = json.loads(json_string) # Load the extracted JSON string
          df = pd.concat([df, pd.DataFrame([{'id': qid, 'premises': output['premises'], 'conclusion': output['conclusion']}])], ignore_index=True)

  return df

def pushDataset(df, filename):
  # Push dataset on HuggingFace
  dataset = Dataset.from_pandas(df)
  # dataset  = dataset.train_test_split(test_size=0.3)
  dataset.push_to_hub("giustinod/" + filename, token = hf_token)

drive_path = "/content/drive/MyDrive/Colab Notebooks/data/"

pathlist = pathlib.Path(drive_path).rglob('*.pdf')
idx = 0
for path in pathlist:
    # because path is object not string
    idx = idx + 1
    md_text = pymupdf4llm.to_markdown(str(path))
    ds_name = path.name.split(".", 1)[0]
    print('Processing ' + ds_name)
    md_path = drive_path + ds_name + '.md'
    pathlib.Path(md_path).write_bytes(md_text.encode())
    print('Markdown ok')
    df = getDataframeFromMarkdown(md_path)
    df.to_parquet(drive_path + ds_name + ".parquet")
    print('DataFrame ok')
    # pushDataset(df, ds_name)
    # print('Pushed on HF')

Collecting pymupdf4llm
  Downloading pymupdf4llm-0.0.17-py3-none-any.whl.metadata (4.1 kB)
Collecting pymupdf>=1.24.10 (from pymupdf4llm)
  Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf4llm-0.0.17-py3-none-any.whl (26 kB)
Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf, pymupdf4llm
Successfully installed pymupdf-1.24.13 pymupdf4llm-0.0.17
Collecting groq
  Downloading groq-0.12.0-py3-none-any.whl.metadata (13 kB)
Downloading groq-0.12.0-py3-none-any.whl (108 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.9/108.9 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.12.0
Collecting unstructured
  Downloading unstructu

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Processing /content/drive/MyDrive/Colab Notebooks/data/ILL2_CF650_RSA_Variables.pdf...
Processing ILL2_CF650_RSA_Variables
Markdown ok


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.1-70b-versatile` in organization `org_01jbw3tcs4ftare3w1d6case49` on : Limit 200000, Used 199829, Requested 779. Please try again in 4m22.547s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': '', 'code': 'rate_limit_exceeded'}}

In [None]:
from datasets import load_dataset

pathlist = pathlib.Path(drive_path).rglob('*.parquet')
for path in pathlist:
    # because path is object not string
    ds_name = path.name.split(".", 1)[0]
    print('Processing ' + ds_name)
    dataset = load_dataset("parquet", data_files={'train': drive_path + ds_name + ".parquet"})
    dataset.push_to_hub("giustinod/" + ds_name, token = hf_token, private=True)
    print('Pushed on HF')

Processing RouteProtection


Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/339 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Pushed on HF
Processing partialSSRS


Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushed on HF
Processing RouteLocking


Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushed on HF


Processing markdown file without AI

In [37]:
# Proviamo a verificare cosa succede
!pip install --upgrade datasets

import re
import pandas as pd
from pathlib import Path
from datasets import Dataset
from datasets import load_dataset
from google.colab import userdata

drive_path = "/content/drive/MyDrive/Colab Notebooks/data/"

ds_name = "ILL2_CF650_RSA_Variables"

source_file = Path(drive_path + ds_name + ".md")

type = ''
parameters = ''
valueRange = ''
link = ''
inDescription = False
description = ''
inValues = False
valueRange = ''

df = pd.DataFrame(columns=['id', 'premises', 'conclusion'])

idx = 0
with open(source_file) as f:
  for line in f:
    if f'## ' in line:
      premise = 'Variable ' + line[3:].replace('; ', ' is ').replace('\n', '')
    if inValues and not f'Link:' in line:
      valueRange = valueRange + line
    if inDescription and not f'Department Document number' in line:
      description = description + line
    if inDescription and f'Department Document number' in line:
      conclusion = type + ', Parameters: ' + parameters.replace('\n', '') + ', Values range: ' + valueRange.replace('\n\n', '') + ', Description: ' + description.replace('\n', '')
      newRow = [idx, premise, conclusion]
      new = pd.DataFrame(columns=df.columns, data=[newRow])
      df = pd.concat([df, new], axis=0)
      idx = idx + 1
      inDescription = False
      description = ''
      inDescription = False
    if f'**Type: ' in line:
      chunks = line.split('**')
      type = str(chunks[1]) + ', ' + str(chunks[3]) + ', ' + str(chunks[5])
    if f'**Parameters:**' in line:
      parameters = str(line.split('**')[2])
    if f'**Value range:**' in line:
      inValues = True
      valueRange = line.replace('**Value range:**', '')
    if f'**Link:' in line:
      inValues = False
      # link = line.split('**')
      # print(link)
    if f'**Description:**' in line:
      inDescription = True

hf_token = userdata.get('HF_TOKEN')
df.to_parquet(drive_path + ds_name + ".parquet")
print('DataFrame ok')
# Push dataset on HuggingFace
dataset = Dataset.from_pandas(df)
dataset.push_to_hub("azservice/" + ds_name, token = hf_token)
print('Pushed on HF')

DataFrame ok


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/391 [00:00<?, ?B/s]

Pushed on HF
