In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta

# ---------------------------------------------------------------------------
# Settings for the synthetic MSL-note data set
# ---------------------------------------------------------------------------
num_records = 1000                  # number of notes to generate
start_date = datetime(2024, 1, 1)   # first day of the sampling window
end_date = datetime(2024, 12, 31)   # last day of the sampling window

# Name pools. Note that `doctor_names` actually holds MSL names ("MSL X", ...).
hcp_names = [f"Dr. {letter}" for letter in "ABCDEFGHIJ"]
doctor_names = [f"MSL {letter}" for letter in "XYZPQR"]
hcp_specialties = [
    "Oncologist",
    "Pulmonologist",
    "Thoracic Surgeon",
    "Radiologist",
    "Pathologist",
]

# Per-quarter sentiment pools. Each pool has 100 entries, so the counts below
# read directly as percentages: (negative, neutral, positive).
# A currently disabled alternative keeps Q1/Q2 as they are and uses
# Q3 = (50, 30, 20) and Q4 = (30, 30, 40).
custom_sentiment_distribution = {
    quarter: ["Negative"] * n_negative + ["Neutral"] * n_neutral + ["Positive"] * n_positive
    for quarter, (n_negative, n_neutral, n_positive) in {
        "Q1": (70, 20, 10),
        "Q2": (60, 25, 15),
        "Q3": (20, 30, 50),
        "Q4": (10, 15, 75),
    }.items()
}

# Note templates; the single "{}" placeholder is filled with the HCP name.
negative_templates = [
    "{} expressed hesitation about switching patients to 2L treatment due to severe toxicities experienced during 1L therapy.",
    "{} mentioned NSCLC patients on chemotherapy experiencing issues like neutropenia, anemia, GI toxicity, and pneumonitis, but remaining on 1L therapy too long.",
    "{} mentioned NSCLC patients on immunotherapy experiencing issues like neutropenia, anemia, GI toxicity, and pneumonitis, but remaining on 1L therapy too long.",
    "{} raised concerns about unclear sequencing strategies, delaying necessary transitions to 2L.",
    "{} noted lack of confidence in 2L treatment options due to limited real-world evidence in the community.",
    "{} reported that many patients remain on 1L therapy longer than necessary due to limited options in 2L treatment.",
    "{} indicated hesitation in moving patients to 2L therapy because of adverse effects experienced during 1L treatment.",
    "{} reported that they are reluctant to switch to 2L therapy due to lack of clinical understanding and experience.",
    "{} indicated an increase in patient hospitalization and hence reduced quality of life.",
]

positive_templates = [
    "{} shared positive experiences with new clinical trials focusing on safety profiles enrolling more patients.",
    "{} noted a significant improvement in sentiment towards upcoming 2L options due to emerging clinical evidence.",
    "{} mentioned ongoing discussions focusing on sequencing strategies and toxicity management showing promising results.",
    "{} expressed interest in new 2L options demonstrating better safety and tolerability for patients.",
    "{} highlighted progress in trials aimed at reducing toxicities experienced during earlier therapies.",
    "{} reported growing confidence in switching patients to 2L treatment due to improved data on safety profiles.",
    "{} noted increasing patient enrollments in clinical trials addressing issues related to severe toxicities.",
]

neutral_templates = [
    "{} discussed current treatment strategies for NSCLC patients facing severe toxicities.",
    "{} reviewed data on treatment discontinuation patterns and patient outcomes in NSCLC.",
    "{} raised concerns about patients discontinuing therapy without transitioning to better-tolerated 2L options.",
    "{} provided general feedback on managing toxicities like neutropenia and fatigue in NSCLC patients.",
    "{} highlighted clinical challenges associated with sequencing strategies in NSCLC treatment.",
    "{} indicated ongoing debates around when to transition patients from 1L to 2L therapy.",
    "{} discussed monitoring patients with severe toxicities during 1L treatment to optimize outcomes.",
]

# Values that do not change between iterations are prepared once up front.
date_span_days = (end_date - start_date).days
templates_by_sentiment = {
    "Negative": negative_templates,
    "Positive": positive_templates,
}
closing_remarks = ['Discussed during recent meeting.', 'Follow-up planned.', 'Awaiting more evidence.']
regions = ["North America", "Europe", "Asia", "South America"]

# One dict per simulated interaction. Everything is sampled with replacement,
# so identical notes can occur.
unique_custom_records = []
for _ in range(num_records):
    # Pick a random day in the window; its calendar quarter selects the
    # sentiment pool to draw from.
    random_date = start_date + timedelta(days=random.randint(0, date_span_days))
    quarter = f"Q{((random_date.month - 1) // 3) + 1}"
    sentiment = random.choice(custom_sentiment_distribution[quarter])

    hcp_name = random.choice(hcp_names)
    hcp_specialty = random.choice(hcp_specialties)
    msl_name = random.choice(doctor_names)

    # Any sentiment other than Negative/Positive falls back to the neutral templates.
    template = random.choice(templates_by_sentiment.get(sentiment, neutral_templates))
    closing_remark = random.choice(closing_remarks)
    note = f"{template.format(hcp_name)} ({closing_remark})"

    unique_custom_records.append({
        "Date": random_date.strftime("%Y-%m-%d"),
        "MSL Note": note,
        "Therapy Area": "Non-small cell lung cancer (NSCLC)",
        "Sentiment": sentiment,
        "HCP Name": hcp_name,
        "HCP Specialty": hcp_specialty,
        "MSL Name": msl_name,
        "Region": random.choice(regions),
    })

# Pool of closing sentences used to make the generated notes more varied.
additional_phrases = [
    "This remains a major concern among oncologists.",
    "The current situation is far from ideal.",
    "This issue needs more attention from the medical community.",
    "Oncologists are actively looking for better options.",
    "The lack of real-world evidence is problematic.",
    "There is a need for clearer treatment guidelines.",
    "Newer approaches are being considered by experts.",
    "Clinical data is awaited to support better decisions.",
    "Efforts are ongoing to improve patient outcomes.",
    "The safety profile of new therapies is a key discussion point.",
]

# Each note has a 70% chance of getting one randomly chosen closing sentence
# appended (separated by a single space); the remaining notes are left as is.
for record in unique_custom_records:
    if random.random() >= 0.7:
        continue
    record["MSL Note"] = " ".join((record["MSL Note"], random.choice(additional_phrases)))

# Build the DataFrame and turn the "YYYY-MM-DD" strings in "Date" into pandas datetimes.
df_unique_updated = pd.DataFrame(unique_custom_records).assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)

# Report how many distinct notes were generated. Nothing above enforces
# uniqueness, so this can be lower than the record count (the recorded run
# printed 911).
num_unique_notes_updated = df_unique_updated["MSL Note"].nunique()
print("Total unique notes: ",num_unique_notes_updated)

Total unique notes:  911


In [2]:
import pandas as pd
import uuid
import os

# Directory that receives one text file per MSL note.
output_directory = 'input'

# Create the directory when it is missing. exist_ok=True replaces the separate
# existence check and is safe if the directory appears in between.
os.makedirs(output_directory, exist_ok=True)

# Only the note text is needed, so iterate that column directly instead of
# building a full row object for every record with iterrows().
for msl_note in df_unique_updated['MSL Note']:
    # A random UUID in the file name gives every row its own file, including
    # rows whose note text is identical.
    unique_filename = f"MSL_Note_{uuid.uuid4().hex}.txt"

    # Full path of the text file inside the output directory.
    file_path = os.path.join(output_directory, unique_filename)

    # Write the MSL note with an explicit encoding so the result does not
    # depend on the platform's locale default.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(msl_note)

    print(f" File '{file_path}' created successfully")

 File 'input/MSL_Note_fb453856a8994904a690ccf6f018901e.txt' created successfully
 File 'input/MSL_Note_041ac886a2da470b93f652e753102e27.txt' created successfully
 File 'input/MSL_Note_bfdb23fb2573401bbedad4d61c048062.txt' created successfully
 File 'input/MSL_Note_98973de1bf9f4d99aab39b27416d0850.txt' created successfully
 File 'input/MSL_Note_0ece0f2eab7945f592270b3e3e215a38.txt' created successfully
 File 'input/MSL_Note_cc0db9f0cdad4fb7a2f8a4ad7aec79fc.txt' created successfully
 File 'input/MSL_Note_6c35d3b6f695438a8c1ba2d27c766075.txt' created successfully
 File 'input/MSL_Note_50dab4df3c5147a6994957b3ff6b67bd.txt' created successfully
 File 'input/MSL_Note_27014b8f470c4f8abe06d391dc049256.txt' created successfully
 File 'input/MSL_Note_9c222d92e88f490ca38b8e11f9d34cd0.txt' created successfully
 File 'input/MSL_Note_698f151a4de34f7588f0fb43f76ddcdc.txt' created successfully
 File 'input/MSL_Note_78b03e643054430cbf14737a52a736d6.txt' created successfully
 File 'input/MSL_Note_e55413

In [None]:
# Show the generated DataFrame (a bare expression on the last line of a notebook cell is rendered as its output).
df_unique_updated

In [None]:
# Install tiktoken into the environment of the running kernel. The explicit
# %pip magic does not depend on IPython's automagic setting.
%pip install tiktoken

In [2]:
import os
import yaml
import pandas as pd
import tiktoken
import asyncio
from rich import print
from graphrag.query.indexer_adapters import (
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.input.loaders.dfs import store_entity_semantic_embeddings
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.local_search.mixed_context import LocalSearchMixedContext
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.vector_stores.lancedb import LanceDBVectorStore

def load_config(config_path="settings.yaml"):
    """Load a YAML configuration file and return its parsed contents.

    Args:
        config_path: Path of the YAML file to read. Defaults to
            ``settings.yaml`` relative to the current working directory.

    Returns:
        The object produced by ``yaml.safe_load`` for the document. The rest
        of this notebook indexes it like a nested dict, e.g.
        ``config['llm']['api_key']``.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Read as UTF-8 explicitly; the default encoding of open() depends on the
    # platform locale and can mis-decode non-ASCII values.
    with open(config_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)

# Access configuration
config = load_config()

# 1. Setup LLM
# Chat-model settings are read from config['llm'].
llm_section = config['llm']
api_key = llm_section['api_key']
llm_model = llm_section['model']
api_base = llm_section['api_base']
api_version = llm_section['api_version']
# Embedding settings are read from config['embeddings']['llm'].
embedding_section = config['embeddings']['llm']
embedding_model = embedding_section['model']
deployment_name = llm_section['deployment_name']

# Connection arguments passed unchanged to both the chat client and the embedder.
azure_client_kwargs = {
    "api_key": api_key,
    "api_base": api_base,
    "api_version": api_version,
    "api_type": OpenaiApiType.AzureOpenAI,
    "max_retries": 20,
}

llm = ChatOpenAI(
    model=llm_model,
    deployment_name=deployment_name,
    **azure_client_kwargs,
)
token_encoder = tiktoken.get_encoding("cl100k_base")
embedding_deployment_name = embedding_section['deployment_name']
text_embedder = OpenAIEmbedding(
    model=embedding_model,
    deployment_name=embedding_deployment_name,
    **azure_client_kwargs,
)

# 2. Load the Context
INPUT_DIR = "output"
# Base names (without ".parquet") of the tables read from INPUT_DIR.
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
TEXT_UNIT_TABLE = "create_final_text_units"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"
COMMUNITY_LEVEL = 0


def _read_table(table_name):
    """Read ``<INPUT_DIR>/<table_name>.parquet`` into a DataFrame."""
    return pd.read_parquet(f"{INPUT_DIR}/{table_name}.parquet")


# NOTE(review): the recorded output of the cell containing this code is
# `KeyError: 'name'` with no traceback, so the failing statement is unknown.
# One possibility is a column-name mismatch between these parquet files and
# the installed graphrag version (a later cell installs graphrag==0.2.1);
# the column printouts below may help confirm.

# Entities are built from two tables: the nodes table and the entities table.
entity_df = _read_table(ENTITY_TABLE)
print(entity_df.columns)
entity_embedding_df = _read_table(ENTITY_EMBEDDING_TABLE)
print(entity_embedding_df.columns)
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# Connect to the LanceDB store at LANCEDB_URI and hand the entities to it.
description_embedding_store = LanceDBVectorStore(collection_name="entity_description_embeddings")
description_embedding_store.connect(db_uri=LANCEDB_URI)
store_entity_semantic_embeddings(entities=entities, vectorstore=description_embedding_store)

# Relationships.
relationship_df = _read_table(RELATIONSHIP_TABLE)
print(relationship_df.columns)
relationships = read_indexer_relationships(relationship_df)

# Community reports (the reader also receives the nodes table and the community level).
report_df = _read_table(COMMUNITY_REPORT_TABLE)
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

# Text units.
text_unit_df = _read_table(TEXT_UNIT_TABLE)
text_units = read_indexer_text_units(text_unit_df)

# 3. Build the Context
# Combine everything loaded above, plus the embedder and tokenizer, into the
# context builder used by the local search.
context_builder = LocalSearchMixedContext(
    entities=entities,
    relationships=relationships,
    community_reports=reports,
    text_units=text_units,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

# 4. Setup Local Search
# Passed to LocalSearch as `context_builder_params`.
local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "max_tokens": 12_000,
}
# Passed to LocalSearch as `llm_params`.
llm_params = {
    "max_tokens": 2_000,
    "temperature": 0.0,
}
search_engine = LocalSearch(
    llm=llm,
    llm_params=llm_params,
    context_builder=context_builder,
    context_builder_params=local_context_params,
    token_encoder=token_encoder,
    response_type="multiple paragraphs",
)

# Debug aid: dump the attributes of the engine's context builder.
print("search engine: \n")
import pprint
context_builder_attributes = vars(search_engine.context_builder)
pprint.pprint(context_builder_attributes)



# 5. Run Local Search
async def run_search(query: str):
    """Await ``search_engine.asearch(query)`` on the module-level engine and return its result."""
    return await search_engine.asearch(query)

if __name__ == "__main__":
    query = "Who is Twemlow?"
    result = await run_search(query)
    generated_response = result.response
    print("Response: \n",result.response)
    print("Entities: \n",result.context_data["entities"].head())
    print("Relationships: \n",result.context_data["relationships"].head())
    print("Reports: \n",result.context_data["reports"].head())
    sources = result.context_data["sources"]
    # print("Sources: \n",result.context_data["sources"])
    if "claims" in result.context_data:
        print(result.context_data["claims"].head())

KeyError: 'name'

In [None]:
# Install graphrag 0.2.1 into the environment of the running kernel. The
# explicit %pip magic does not depend on IPython's automagic setting.
%pip install graphrag==0.2.1

In [None]:
# List the names defined by the installed graphrag `dfs` loader module
# (presumably to check that `store_entity_semantic_embeddings` exists in this version — confirm).
import graphrag.query.input.loaders.dfs as dfs
print(dir(dfs))