### Wall of Imports

In [3]:
# 🔹 Standard Library Imports
import os
import getpass
import json
import shutil
import tarfile
import subprocess
import fnmatch
import xml.etree.ElementTree as ET
from ftplib import FTP

# 🔹 Third-Party Libraries
import requests
import pandas as pd  # Using standard alias
from bs4 import BeautifulSoup

# 🔹 LangChain Core Components
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

# 🔹 LangChain Loaders & Splitters
from langchain_community.document_loaders import JSONLoader
from langchain.document_loaders import CSVLoader
from langchain_text_splitters import RecursiveJsonSplitter, HTMLSectionSplitter

# 🔹 LangChain Embeddings & Vector Stores
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

# 🔹 RAGAS Evaluation Tools
from ragas import EvaluationDataset, evaluate, RunConfig
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas.metrics import (
    LLMContextRecall, 
    Faithfulness, 
    FactualCorrectness, 
    ResponseRelevancy, 
    ContextEntityRecall, 
    NoiseSensitivity
)

# 🔹 Utility Imports
import pprint  # Useful for structured data printing
from operator import itemgetter

  from .autonotebook import tqdm as notebook_tqdm


### Enter API Keys

In [33]:
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
os.environ["QDRANT_API_KEY"] = getpass.getpass("Qdrant API Key:")
os.environ["TAVILY_API_KEY"] = getpass.getpass("Tavily API Key:")

### Retrieve Projects (University of Utah Pediatrics projects only for the time being)

In [8]:
# Define the directory and file path
directory = 'data'
file_path = os.path.join(directory, 'projects_data.json')

# Create the directory if it doesn't exist
if not os.path.exists(directory):
    os.makedirs(directory)

# Define the endpoint URL
url = "https://api.reporter.nih.gov/v2/projects/search"

# Define the JSON body for the request
json_body = {
        "criteria":
        {
            "org_names": ["UNIVERSITY OF UTAH"],
            "dept_types": ["PEDIATRICS"]
        },
        "include_fields": [
             "ApplId","SubprojectId","FiscalYear","Organization", "ProjectNum",
             "ProjectNumSplit","ContactPiName","AllText","FullStudySection",
             "ProjectStartDate","ProjectEndDate", "AwardAmount", "ActivityCode", 
             "AgencyIcAdmin", "AwardType", "AwardNoticeDate", "BudgetStart", 
             "BudgetEnd", "CoreProjectNum", "OrganizationType", "OpportunityNumber",
             "AgencyIcFundings", "FundingMechanism", "SpendingCategoriesDesc", "PhrText",
             "PrincipalInvestigators", "PrefTerms", "ProjectTitle", "DirectCostAmt",
             "IndirectCostAmt", "IsActive", "Terms", "AbstractText", "AgencyCode",
             "ProjectDetailUrl"
         ],
         "offset":0,
         "limit":100,
         "sort_field":"project_start_date",
          "sort_order":"desc"
   }

# Make the POST request with JSON body
response = requests.post(url, json=json_body)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()
    
    # Save the JSON data to a file
    
    with open(file_path, 'w') as file:
        json.dump(data, file, indent=4)  # indent=4 for pretty printing
    # Do something with the data
    #pprint.pprint(data)
else:
    print(f"Request failed with status code {response.status_code}")

### Load Funding Opportunities (no API access sadly)

In [46]:
# Path to your CSV file
csv_file_path = "data/Opportunities/NIH_Funding_Opportunities_20250221.csv"

# Load CSV file as LangChain documents
loader = CSVLoader(file_path=csv_file_path)
opportunities_summary = loader.load()

print(opportunities_summary[0:10])
len(opportunities_summary)

[Document(metadata={'source': 'data/Opportunities/NIH_Funding_Opportunities_20250221.csv', 'row': 0}, page_content='Title: Intervention Research to Improve Native American Health (R34 Clinical Trial Optional)\nRelease_Date: 1/22/2025\nExpired_Date: 1/8/2027\nActivity_Code: R34\nParent_Organization: NIH\nOrganization: NIDA\nParticipating_Orgs: NCCIH, NIA, NIAAA, NICHD, NIDCR, NIEHS, NIMH, NINR, OBSSR, ODP, ORWH, THRO\nDocument_Number: PAR-25-378\nDocument_Type: PAR\nClinical_Trials: Optional\nURL: https://grants.nih.gov/grants/guide/pa-files/PAR-25-378.html'), Document(metadata={'source': 'data/Opportunities/NIH_Funding_Opportunities_20250221.csv', 'row': 1}, page_content='Title: New Investigator Gateway Awards for Collaborative T1D Research (R03 Clinical Trial Not Allowed)\nRelease_Date: 1/22/2025\nExpired_Date: 3/7/2026\nActivity_Code: R03\nParent_Organization: NIH\nOrganization: NIDDK\nParticipating_Orgs: OD\nDocument_Number: RFA-DK-26-009\nDocument_Type: RFA\nClinical_Trials: Not_Al

528

### Upload Opportunities Summary to Qdrant

In [52]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

url = "https://e788c0ea-f5df-4d96-85ac-350da677aadf.us-west-2-0.aws.cloud.qdrant.io"

opportunities_summary_qdrant = QdrantVectorStore.from_documents(
    opportunities_summary,
    embeddings,
    url=url,
    prefer_grpc=False,
    api_key=os.environ["QDRANT_API_KEY"],
    collection_name="opportunities_summary",
)

### Retrieve Opportunity Details

In [12]:
# Limit opportunity count
opportunity_count = 100

# Path to the CSV file
csv_file_path = "data/Opportunities/NIH_Funding_Opportunities_20250221.csv"

# Load the CSV data into a DataFrame
df = pandas.read_csv(csv_file_path)

# Extract the html files 
counter = 0

for index, row in df.iterrows():
    output_file = "data/Opportunities/" + row['Document_Number'] + ".html"
    # Construct the curl command
    curl_command = [
        "curl",
        "-X", "GET",  # or "POST", "PUT", etc.
        row['URL'],
        "-o", output_file
    ]
    
    # Execute the curl command
    try:
        result = subprocess.run(curl_command, check=True, text=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        print("An error occurred:", e)
        
    counter += 1
    
    # Limit to opportunity count
    if counter > opportunity_count:
        break

### Create HTML splitter

In [13]:
headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")]

html_splitter = HTMLSectionSplitter(
    headers_to_split_on=headers_to_split_on
)

### Clean up HTML files and split (for the golden chunks)

In [20]:
from bs4 import BeautifulSoup

# Path to the directory containing HTML files
directory_path = "data/Opportunities"

def clean_html(html_raw):
    soup = BeautifulSoup(html_raw.replace("\n", "").replace("\t", ""), 'html.parser')
    # Remove div tags (could use some more work)
    for div_tag in soup.find_all('div'):
        div_tag.replaceWithChildren()
    # Remove script tags 
    for script in soup.find_all('script'):
        script.decompose()
    # Remove title tags
    for title in soup.find_all('title'):
        opportunity_title = title.string
        title.decompose()
    # Remove meta tags 
    for meta in soup.find_all('meta'):
        meta.decompose()
    # Add metadata to header tags 
    for header in soup.find_all(['h1', 'h2', 'h3']):
        if header.string is not None:
            header.string.replace_with(f"{opportunity_title} {header.string}")
    # Return clean file
    return soup.prettify() 

# Initialize opportunities document list
opportunities = []

# Iterate over each file in the directory
for filename in os.listdir(directory_path):
    # Check if the file is an HTML file
    if fnmatch.fnmatch(filename, '*.html'):
        file_path = os.path.join(directory_path, filename)
        #Read html file
        with open(file_path, 'r') as f:
            html_raw = f.read()
        #Add cleaned and chunked html file to opportunities list
        opportunities.extend(html_splitter.split_text(clean_html(html_raw)))

# Add metadata to page content
for row in opportunities:
    if 'Header 1' in row.metadata:
        row.page_content = row.metadata['Header 1'] + ' ' + row.page_content
    elif 'Header 2' in row.metadata:
        row.page_content = row.metadata['Header 2'] + ' ' + row.page_content
    elif 'Header 3' in row.metadata:
        row.page_content = row.metadata['Header 3'] + ' ' + row.page_content

# Remove #TITLE# rows
opportunities = [doc for doc in opportunities if doc.metadata.get('Header 1') != '#TITLE#']

len(opportunities)
pprint.pprint(opportunities[0:10])

### Clean Up HTML Files

In [1]:
def clean_up_html_files(directory_path):
    # Iterate over each file in the directory
    for filename in os.listdir(directory_path):
        # Check if the file is an HTML file
        if fnmatch.fnmatch(filename, '*.html'):
            file_path = os.path.join(directory_path, filename)
            # Delete the HTML file
            os.remove(file_path)
            print(f"Deleted file: {file_path}")

In [4]:
# Path to the directory containing HTML files
directory_path = "data/Opportunities"

clean_up_html_files(directory_path)

Deleted file: data/Opportunities/PAR-25-377.html
Deleted file: data/Opportunities/PAR-25-336.html
Deleted file: data/Opportunities/PAR-25-048.html
Deleted file: data/Opportunities/RFA-CA-25-004.html
Deleted file: data/Opportunities/PAR-25-101.html
Deleted file: data/Opportunities/PA-25-303.html
Deleted file: data/Opportunities/PAR-25-316.html
Deleted file: data/Opportunities/RFA-AI-24-080.html
Deleted file: data/Opportunities/PAR-25-068.html
Deleted file: data/Opportunities/RFA-AI-24-079.html
Deleted file: data/Opportunities/PAR-25-091.html
Deleted file: data/Opportunities/PAR-25-357.html
Deleted file: data/Opportunities/PAR-25-029.html
Deleted file: data/Opportunities/RFA-DK-26-006.html
Deleted file: data/Opportunities/RFA-DK-26-007.html
Deleted file: data/Opportunities/PAR-25-028.html
Deleted file: data/Opportunities/PAR-25-090.html
Deleted file: data/Opportunities/PAS-25-236.html
Deleted file: data/Opportunities/PAR-25-317.html
Deleted file: data/Opportunities/PA-25-302.html
Deleted

### Load Opportunities into Qdrant

In [None]:
finetune_embeddings = HuggingFaceEmbeddings(model_name="christinemahler/aie5-midterm")
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

url = "https://e788c0ea-f5df-4d96-85ac-350da677aadf.us-west-2-0.aws.cloud.qdrant.io"

opportunities_qdrant = QdrantVectorStore.from_documents(
    opportunities,
    embedding_model,
    url=url,
    prefer_grpc=False,
    api_key=os.environ["QDRANT_API_KEY"],
    collection_name="opportunities",
)

### Load Projects
Reference: https://python.langchain.com/v0.2/docs/how_to/document_loader_json/

In [10]:
loader = JSONLoader(
    file_path='data/projects_data.json',
    jq_schema='.results[]',
    text_content=False)

projects = loader.load()

pprint.pprint(projects)

[Document(metadata={'source': '/Users/christinemahler/Desktop/AIE5/Midterm/data/projects_data.json', 'seq_num': 1}, page_content='{"appl_id": 10985653, "subproject_id": null, "fiscal_year": 2024, "project_num": "1K23HD113825-01A1", "organization": {"org_name": "UNIVERSITY OF UTAH", "city": null, "country": null, "org_city": "SALT LAKE CITY", "org_country": "UNITED STATES", "org_state": "UT", "org_state_name": null, "dept_type": "PEDIATRICS", "fips_country_code": null, "org_duns": ["009095365"], "org_ueis": ["LL8GLEVH6MG3"], "primary_duns": "009095365", "primary_uei": "LL8GLEVH6MG3", "org_fips": "US", "org_ipf_code": "514002", "org_zipcode": "841129049", "external_org_id": 514002}, "award_type": "1", "activity_code": "K23", "award_amount": 167455, "is_active": true, "project_num_split": {"appl_type_code": "1", "activity_code": "K23", "ic_code": "HD", "serial_num": "113825", "support_year": "01", "full_support_year": "01A1", "suffix_code": "A1"}, "principal_investigators": [{"profile_id"

### JSON Splitter (not sure if this is needed)

In [22]:
splitter = RecursiveJsonSplitter(max_chunk_size=300)

# Recursively split json data - If you need to access/manipulate the smaller json chunks
json_chunks = splitter.split_json(json_data=data)

for chunk in json_chunks[:3]:
    print(chunk)

NameError: name 'data' is not defined

### Load Projects into Vector Store

In [12]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

projects_qdrant = QdrantVectorStore.from_documents(
    projects,
    embeddings,
    url=url,
    prefer_grpc=False,
    api_key=os.environ["QDRANT_API_KEY"],
    collection_name="projects",
)

ResponseHandlingException: timed out

### Ragas Evaluation

In [42]:
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [44]:
generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(opportunities, testset_size=10)

Applying HeadlinesExtractor:  48%|████▊     | 40/84 [02:26<03:44,  5.11s/it]unable to apply transformation: Error code: 429 - {'error': {'message': 'Request too large for gpt-4o in organization org-fG4XzykR5yuSt2Wp6ZlK7gAJ on tokens per min (TPM): Limit 30000, Requested 43020. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Applying HeadlinesExtractor:  86%|████████▌ | 72/84 [05:58<02:01, 10.13s/it]unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-fG4XzykR5yuSt2Wp6ZlK7gAJ on tokens per min (TPM): Limit 30000, Used 29600, Requested 8077. Please try again in 15.354s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Applying HeadlineSplitter:   0%|          | 0/132 [00:00<?

In [45]:
dataset.to_pandas()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What is the AADCRC program and how does it con...,[Objectives and Scope The objective of this NO...,The AADCRC program aims to improve the underst...,single_hop_specifc_query_synthesizer
1,What role does a Scientific Core play in the A...,[AADCRC components Administrative Core (requir...,A Scientific Core in the AADCRC structure is o...,single_hop_specifc_query_synthesizer
2,Wht r the eligiblity critera for Alaskaa Nativ...,[RFA-AI-24-079: Asthma and Allergic Diseases C...,Alaska Native and Native Hawaiian Serving Inst...,single_hop_specifc_query_synthesizer
3,Wht are the eligibilty critera for NIH IPF num...,[Eligible Individuals (Program Director/Princi...,"Only one application per institution, identifi...",single_hop_specifc_query_synthesizer
4,What is the role of the Clinical Core in suppo...,"[Administrative Core: required, 1 Data Steward...",The Clinical Core is optional and can have a m...,single_hop_specifc_query_synthesizer
5,What is the role of the Data and Safety Monito...,"[When involving human subjects research, clini...","For all clinical trials, NIAID will provide a ...",single_hop_specifc_query_synthesizer
6,Wht is the role of ASSIST in the clinical core...,[of the Overall component. Note : Specific det...,ASSIST screens will show an asterisk for the P...,single_hop_specifc_query_synthesizer
7,What go in appendix?,"[alternative approaches to be implemented, if ...",Only limited items are allowed in the Appendix...,single_hop_specifc_query_synthesizer
8,Wht is NIAID's role in clinical trials?,[Apply- Application Guide must be followed. De...,"For all clinical trials, NIAID will provide a ...",single_hop_specifc_query_synthesizer
9,What is System for Award Management and why it...,"[protocol development, the plan for study impl...",The System for Award Management (SAM) is a req...,single_hop_specifc_query_synthesizer


In [46]:
dataset.upload()

Testset uploaded! View at https://app.ragas.io/dashboard/alignment/testset/0842bbec-e388-4a45-8e5f-faf4950361f9


'https://app.ragas.io/dashboard/alignment/testset/0842bbec-e388-4a45-8e5f-faf4950361f9'

In [47]:
for test_row in dataset:
  response = graph.invoke({"question" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

In [48]:
evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

In [49]:
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

In [50]:
custom_run_config = RunConfig(timeout=360)

result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)
result

Evaluating:  12%|█▏        | 7/60 [00:14<02:06,  2.39s/it]Exception raised in Job[20]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating:  15%|█▌        | 9/60 [00:36<05:41,  6.70s/it]Exception raised in Job[14]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating:  22%|██▏       | 13/60 [00:41<01:56,  2.47s/it]Exception raised in Job[26]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating:  38%|███▊      | 23/60 [02:02<03:53,  6.32s/it]Exception raised in Job[38]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting r

{'context_recall': 0.5000, 'faithfulness': 0.3056, 'factual_correctness': 0.6250, 'answer_relevancy': 0.3755, 'context_entity_recall': 0.3567, 'noise_sensitivity_relevant': 0.0667}