## Creating Custom Dataset for PEFT Training

Using Llamaindex RagDatasetGenerator to create a custom data set based on the Actuarial Standards of Practice.

Install relevant packages

In [None]:
%%capture
!pip install requests
!pip install beautifulsoup4
!pip install PyPDF2
!pip install langchain_community
!pip install langfuse
!pip install tiktoken
!pip install langchain-openai langchain-core langgraph langchain-chroma
!pip install langsmith
!pip install llama-index
!pip install llama-index-llms-huggingface
!pip install llama-index-llms-huggingface-api
!pip install openai
!pip install httpx==0.27.2
!pip install llama-index-llms-openai
!pip install llama-index-llms-ollama
!pip install datasets


Download ASOPs 1 through 57 as well as additional information, such as definitions, procedures for use, and clarifications for specific cases.

In [None]:
import requests
from bs4 import BeautifulSoup
import os

# Create a directory to store the PDFs
os.makedirs('asop_pdfs', exist_ok=True)

# URLs of the pages to scrape
urls_to_scrape = [
    'http://www.actuarialstandardsboard.org/standards-of-practice/',
    'http://www.actuarialstandardsboard.org/other-documents/'
]

# Set to store unique PDF links
pdf_links = set()

for page_url in urls_to_scrape:
    print(f'Scraping page: {page_url}')
    # Get the page content
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all links to PDFs on the page
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.endswith('.pdf'):
            if href.startswith('/'):
                href = f'http://www.actuarialstandardsboard.org{href}'
            pdf_links.add(href)

print(f'Found {len(pdf_links)} unique PDF links.')

# Download each PDF
for url in pdf_links:
    pdf_name = url.split('/')[-1]
    pdf_path = os.path.join('asop_pdfs', pdf_name)
    if not os.path.exists(pdf_path):
        print(f'Downloading {pdf_name}...')
        pdf_response = requests.get(url)
        with open(pdf_path, 'wb') as f:
            f.write(pdf_response.content)
    else:
        print(f'{pdf_name} already exists. Skipping download.')

print('All PDFs downloaded.')


Scraping page: http://www.actuarialstandardsboard.org/standards-of-practice/
Scraping page: http://www.actuarialstandardsboard.org/other-documents/
Found 70 unique PDF links.
Downloading asop018_206-.pdf...
Downloading New-ASB-Definitions-updated-12-5-2024.pdf...
Downloading asop032_196.pdf...
Downloading asop049_179.pdf...
Downloading asop017_192.pdf...
Downloading asop038_201.pdf...
Downloading asop044_160.pdf...
Downloading asop034_180.pdf...
Downloading framework6.29.16.pdf...
Downloading asop035_198.pdf...
Downloading asop005_186.pdf...
Downloading asop003_202-1.pdf...
Downloading asop053_190.pdf...
Downloading asop043_159.pdf...
Downloading asop027_197.pdf...
Downloading asop055_194.pdf...
Downloading Standards-Library-TOC_October-2024.pdf...
Downloading asop012_132.pdf...
Downloading asop050_182.pdf...
Downloading asop037_154.pdf...
Downloading asop011__199.pdf...
Downloading financial_067.pdf...
Downloading asop031_115.pdf...
Downloading asop013_133.pdf...
Downloading asop010_2

Extract information from the pdf files

In [None]:
import PyPDF2

# Create a directory to store the text files
os.makedirs('asop_texts', exist_ok=True)

pdf_folder = 'asop_pdfs'

# Get list of all PDF files downloaded
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]

for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)
    text_output_path = os.path.join('asop_texts', pdf_file.replace('.pdf', '.txt'))
    if os.path.exists(text_output_path):
        print(f'Text for {pdf_file} already extracted. Skipping.')
        continue
    print(f'Extracting text from {pdf_file}...')
    try:
        with open(pdf_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            text = ''
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                page_text = page.extract_text()
                if page_text:
                    text += page_text
        # Save the extracted text to a file
        with open(text_output_path, 'w', encoding='utf-8') as text_file:
            text_file.write(text)
    except Exception as e:
        print(f'Error extracting text from {pdf_file}: {e}')

print('Text extraction completed.')


Extracting text from asop001_170.pdf...
Extracting text from asop043_159.pdf...
Extracting text from asop044_160.pdf...
Extracting text from asop024_217.pdf...
Extracting text from asop040_212-1.pdf...
Extracting text from asop022_203.pdf...
Extracting text from asop009_105.pdf...
Extracting text from deviation_161.pdf...
Extracting text from asop027_211.pdf...
Extracting text from asop057_208.pdf...
Extracting text from asop048_1751.pdf...
Extracting text from asop011__199.pdf...
Extracting text from asop039_156.pdf...
Extracting text from asop012_132.pdf...
Extracting text from asop031_115.pdf...
Extracting text from opinions3and4_088.pdf...
Extracting text from asop007_128.pdf...
Extracting text from ASOP-Proposal-Form.pdf...
Extracting text from asop041_120.pdf...
Extracting text from asop038_201.pdf...
Extracting text from asop034_180.pdf...
Extracting text from asop023_185.pdf...
Extracting text from asop036_213.pdf...
Extracting text from Standards-Library-TOC_October-2024.pdf..

Load the extracted texts

In [None]:
import os

# Directory containing the extracted text files
text_dir = 'asop_texts'

# List all text files
text_files = [f for f in os.listdir(text_dir) if f.endswith('.txt')]

# Initialize a list to hold all documents
documents = []

for file_name in text_files:
    file_path = os.path.join(text_dir, file_name)
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
        documents.append(text)


Clean each document

In [None]:
import re

def clean_text(text):
    # Remove headers and footers (customize patterns as needed)
    text = re.sub(r'ASOP No\.\s*\d+\s*-.*?\n', '', text)  # Remove ASOP headers
    text = re.sub(r'Actuarial Standards Board.*?\n', '', text)  # Remove footer
    text = re.sub(r'Page\s*\d+\s*of\s*\d+', '', text)  # Remove page numbers

    # Remove multiple newlines and excessive whitespace
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'[ \t]+', ' ', text)

    # Strip leading and trailing whitespace
    text = text.strip()

    return text


In [None]:
cleaned_documents = [clean_text(doc) for doc in documents]


In [None]:
cleaned_documents[4][0:1000]

'Actuarial Standard \nof Practice \nNo. 53 \n \n \n \n \nEstimating Future Costs for Prospective \nProperty/Casualty Risk Transfer and Risk Retention \n \n \n \n \n \n \nDeveloped by the \nRatemaking Task Force of the \nCasualty Committee of the \n \n \nAdopted by the \nDecember 2017 \n \nDoc. No. 190\nASOP No. 53—Doc. No. 190 \nii \n T A B L E O F C O N T E N T S \n \n \nTransmittal Memorandum iv \n \nSTANDARD OF PRACTICE \n \nSection 1. Purpose, Scope, Cross References, and Effective Date 1 \xa0\n1.1\xa0Purpose 1 \xa0\n1.2\xa0Scope 1 \xa0\n1.3\xa0Cross References 1 \xa0\n1.4\xa0Effective Date 2 \xa0\n Section 2. Definitions 2\n\xa0\n2.1\xa0Coverage 2 \xa0\n2.2\xa0Exposure Base 2 \xa0\n2.3\xa0Method 2 \xa0\n2.4\xa0Model 2 \xa0\n2.5\xa0Risk Retention 2 \xa0\n2.6\xa0Risk Transfer 2 \xa0\n Section 3. Analysis of Issues and Recommended Practices 2\n\xa0\n3.1\xa0Future Cost Estimate 2 \xa0\n3.2\xa0Intended Measure 2 \xa0\n3.3\xa0Organization of Data 3 \xa0\n3.4\xa0Data Quality 3 \xa0\n3.5\

In [None]:
import os
from google.colab import userdata
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

In [None]:
from llama_index.llms.openai import OpenAI
llm = OpenAI(model="gpt-4o-mini", temperature=0.3)


In [None]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing_extensions import List, TypedDict

class LangChainDocument(TypedDict):
    page_content: str
    metadata: dict

langchain_docs = [Document(page_content=doc) for doc in cleaned_documents]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(langchain_docs)


In [None]:
# attach to the same event-loop
import nest_asyncio

nest_asyncio.apply()

import logging
import sys

# Set up the root logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # Set logger level to INFO

# Clear out any existing handlers
logger.handlers = []

# Set up the StreamHandler to output to sys.stdout (Colab's output)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)  # Set handler level to INFO

# Add the handler to the logger
logger.addHandler(handler)

In [None]:
#import os
from google.colab import userdata
os.environ['HUGGINGFACEHUB_API_TOKEN'] = userdata.get('HFWRITE_API_KEY')
HUGGINGFACEHUB_API_TOKEN = os.environ['HUGGINGFACEHUB_API_TOKEN']
HF_TOKEN = os.environ['HUGGINGFACEHUB_API_TOKEN']

In [None]:
from huggingface_hub import login

# Login explicitly with your token
login(token=HF_TOKEN)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


I split the documents up into 7 different groups for the question answer generation. Manually change the batch number. This was done because I was running this on Colab and couldn't babysit the execution for 3 hours.

In [None]:
from datasets import Dataset
from llama_index.core import Document
from llama_index.core.llama_dataset.generator import RagDatasetGenerator

# Convert cleaned strings into Document objects
cleaned_documents_objects = [
    Document(text=doc, id_=str(idx)) for idx, doc in enumerate(cleaned_documents)
]

# Counter for dataset batch
batch_number = 1 # Change this manually for each run (1 to 7)

# Calculate the document range based on the batch number
start_idx = (batch_number - 1) * 10
end_idx = batch_number * 10

# Select documents for this batch
batch_documents = cleaned_documents_objects[start_idx:end_idx]

# Create the RAG Dataset Generator
dataset_generator = RagDatasetGenerator.from_documents(
    documents=batch_documents,
    llm=llm,
    num_questions_per_chunk=10,  # Generate 10 questions per chunk
)

# Generate the dataset
print(f"Processing batch {batch_number} with documents {start_idx} to {end_idx - 1}...")
rag_dataset = dataset_generator.generate_dataset_from_nodes()

# Save the RAG dataset to a JSON file
output_file = f"rag_dataset_batch_{batch_number}.json"
rag_dataset.save_json(output_file)
print(f"Batch {batch_number} saved locally to {output_file}.")



Processing batch 7 with documents 60 to 69...
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 20

In [None]:
import json

# Initialize a list to store all the combined data
combined_data = []

# Loop through the dataset files
for i in range(1, 8):
    file_name = f"rag_dataset_batch_{i}.json"
    try:
        # Open and load the JSON file
        with open(file_name, 'r') as file:
            data = json.load(file)
            # Assuming the key is 'examples', extract its value
            if "examples" in data:
                combined_data.extend(data["examples"])  # Add the examples to the combined list
            else:
                print(f"Warning: No 'examples' key found in {file_name}.")
    except FileNotFoundError:
        print(f"Error: {file_name} not found.")
    except json.JSONDecodeError:
        print(f"Error: {file_name} contains invalid JSON.")

# Save the combined data into a single JSON file
output_file = "combined_rag_dataset.json"
try:
    with open(output_file, 'w') as file:
        json.dump({"examples": combined_data}, file, indent=4)
    print(f"Combined dataset saved as {output_file}.")
except Exception as e:
    print(f"Error saving combined dataset: {e}")


Combined dataset saved as combined_rag_dataset.json.


In [None]:
combined_data[21]

{'query': 'What types of actuaries are primarily impacted by the standard discussed in the document?',
 'query_by': {'model_name': 'gpt-4o-mini', 'type': 'ai'},
 'reference_contexts': ['The standard applies to \nactuaries designing, pricing, or determining funding of an LTC benefit plan. The standard \nalso applies to actuaries measuring or evaluating LTC liabilities within a n LTC benefit \nplan. The term “ long- term care benefit plan” includes plans with short -term (for \nexample, less than twelve consecutive months) and long- term benefit dura tions. The \nstandard does not apply to actuaries providing actuarial services related to LTC benefits \nfor Medicaid -eligible recipients , unless the actuarial services are for a long -range financial \nprojection (generally more than five years) of LTC benefit expenditures and eligible \nrecipients under the Medicaid program. \n \nIf the actuary is reviewing actuarial services performed with respect to LTC benefit plans , \nthe actuary sh