In [1]:
import os

import nest_asyncio
import pandas as pd
import textract
from dotenv import load_dotenv
from lightrag import LightRAG, QueryParam
from lightrag.llm import gpt_4o_mini_complete, gpt_4o_complete

# --- Environment ------------------------------------------------------------
# Load environment variables (notably OPENAI_API_KEY) from a local .env file.
load_dotenv()

# Fail early with a readable message when the key is absent. Re-assigning
# os.environ from os.getenv() raises an opaque TypeError in that case and does
# nothing useful when the key is present.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

# Needed when running inside a Jupyter notebook, to handle the async nature of
# rag.insert() on top of the notebook's own event loop.
nest_asyncio.apply()

# --- Configuration ----------------------------------------------------------
WORKING_DIR = "./data/Affiliations/"
SOURCE_CSV = os.path.join(WORKING_DIR, "Affiliations_sheet.csv")
COMPRESSED_CSV = os.path.join(WORKING_DIR, "Affiliations_sheet_compressed.csv")
MAX_ROWS = 100  # number of leading rows kept in the reduced sample

# makedirs (not mkdir) so the missing parent "./data" is created as well;
# exist_ok makes the cell safe to re-run.
os.makedirs(WORKING_DIR, exist_ok=True)

# --- Build the reduced input file and extract its text ----------------------
# Keep only the first MAX_ROWS rows so the text handed to LightRAG stays small.
# To index the full sheet instead, point file_path at SOURCE_CSV.
df = pd.read_csv(SOURCE_CSV)
df.head(MAX_ROWS).to_csv(COMPRESSED_CSV, index=False)
file_path = COMPRESSED_CSV

# textract returns the file content as bytes; decode before passing it on.
text_content = textract.process(file_path)

# --- LightRAG instance ------------------------------------------------------
rag = LightRAG(
    working_dir=WORKING_DIR,
    # gpt_4o_complete can be swapped in here as a stronger model.
    llm_model_func=gpt_4o_mini_complete,
    # Presumably caps concurrent LLM calls at one -- TODO confirm against LightRAG docs.
    llm_model_max_async=1,
)

# Indexing step, left disabled as in the original cell. Uncomment to insert the
# extracted text into the LightRAG store under WORKING_DIR:
# rag.insert(text_content.decode("utf-8"))


  from .autonotebook import tqdm as notebook_tqdm
INFO:lightrag:Logger initialized for working directory: ./data/Affiliations/
DEBUG:lightrag:LightRAG init with param:
  working_dir = ./data/Affiliations/,
  chunk_token_size = 1200,
  chunk_overlap_token_size = 100,
  tiktoken_model_name = gpt-4o-mini,
  entity_extract_max_gleaning = 1,
  entity_summary_to_max_tokens = 500,
  node_embedding_algorithm = node2vec,
  node2vec_params = {'dimensions': 1536, 'num_walks': 10, 'walk_length': 40, 'window_size': 2, 'iterations': 3, 'random_seed': 3},
  embedding_func = {'embedding_dim': 1536, 'max_token_size': 8192, 'func': <function openai_embedding at 0x13cda0f70>},
  embedding_batch_num = 32,
  embedding_func_max_async = 16,
  llm_model_func = <function gpt_4o_mini_complete at 0x13cda0940>,
  llm_model_name = meta-llama/Llama-3.2-1B-Instruct,
  llm_model_max_token_size = 32768,
  llm_model_max_async = 1,
  llm_model_kwargs = {},
  key_string_value_json_storage_cls = <class 'lightrag.storage.J

In [9]:
# Show only the beginning of the extracted text. Displaying the whole sheet
# bloats the notebook and puts every person's name and profile URL in the output.
PREVIEW_CHARS = 500
text_content.decode("utf-8")[:PREVIEW_CHARS]

"Name\tTitle\tOrganization\tscholars_school_name\tappointment_type\taffiliation_type\tscholars_department_name\tscholars_division_name\tprofile_url\tstart_date\tlast_name\tmiddle_name\tfirst_name\tid\n. Shaheryar\tStudent\tEconomics\tTrinity College of Arts & Sciences\tStudent\tNon-Faculty\tEconomics\t\thttps://scholars.duke.edu/profile/1205358\t\tShaheryar\t\t.\t1205358\nA. Brad Murray\tProfessor of Geomorphology and Coastal Processes\tEarth and Climate Sciences\tNicholas School of the Environment\tFaculty - Primary Appointment\tFaculty\tEarth and Climate Sciences\t\thttps://scholars.duke.edu/profile/0200795\t01-03-2011\tMurray\t\tA. Brad\t200795\nA. Craig Burnside\tProfessor of Economics\tEconomics\tTrinity College of Arts & Sciences\tFaculty - Primary Appointment\tFaculty\tEconomics\t\thttps://scholars.duke.edu/profile/0331721\t01-07-2004\tBurnside\tCraig\tA.\t331721\nA. Eugene Washington\tChancellor Emeritus\tSchool of Medicine\tSchool of Medicine\tFaculty - Administrative Appointm

In [3]:
# Ask the question in "naive" mode and print the generated answer.
naive_params = QueryParam(mode="naive")
naive_answer = rag.query("Tell me about the professors?", param=naive_params)
print(naive_answer)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:lightrag:Truncate 6 to 3 chunks


Duke University hosts a diverse array of professors across various departments and schools, contributing significantly to their fields of expertise. Among them, Aatif Mairaj Husain stands out as a prominent figure in Neurology. He holds the title of Professor of Neurology with specializations in Epilepsy and Sleep. His tenure at Duke began on January 1, 2012, and he has further distinguished himself by serving as the Director of the Neurodiagnostic Lab and as Division Chief in Epilepsy, Sleep, and Neurophysiology. Additionally, he is a member of the Duke Clinical Research Institute.

Another esteemed professor is A. Brad Murray, a specialist in Geomorphology and Coastal Processes, based in the Earth and Climate Sciences department. He has been with Duke since March 1, 2011, and plays an integral role in educating students about environmental sciences.

In the realm of Economics, A. Craig Burnside has been a key faculty member since July 1, 2004. His contributions to the field are found

In [4]:
# Ask the same question in "local" mode and print the generated answer.
local_params = QueryParam(mode="local")
local_answer = rag.query("Tell me about the professors?", param=local_params)
print(local_answer)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:lightrag:Local query uses 60 entites, 54 relations, 3 text units
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


### Overview of Professors at Duke University

Duke University is home to a diverse group of professors across various disciplines, primarily situated within its multiple schools and research institutes. The professors at Duke not only engage in teaching but also contribute significantly to research, fostering a vibrant academic environment.

### Professors in Medicine and Health Sciences

1. **Husain Mairaj Aatif** - *Professor of Neurology*
   - Specializes in **Epilepsy and Sleep**. He is also active in clinical research through his involvement with the **Duke Clinical Research Institute**.

2. **Aaron Timothy Vose** - *Medical Instructor in the Department of Medicine*
   - Focuses on **Pulmonary, Allergy, and Critical Care Medicine**. 

3. **A. Eugene Washington** - *Chancellor Emeritus of the School of Medicine*
   - Holds various faculty roles and is known for his extensive contributions to medical education and research.

4. **Adam I. Perlman** - *Adjunct Associate Professor in 

In [5]:
# Ask the same question in "global" mode and print the generated answer.
global_params = QueryParam(mode="global")
global_answer = rag.query("Tell me about the professors?", param=global_params)
print(global_answer)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:lightrag:Global query uses 75 entites, 60 relations, 3 text units
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


### Faculty Overview at Duke University

Duke University is home to a diverse and accomplished faculty across various disciplines, reflecting a rich academic environment focused on research, education, and community engagement. This overview highlights several notable professors from different departments and schools within the university.

#### Faculty in the School of Medicine

1. **Husain Mairaj Aatif**:
   - **Position**: Professor of Neurology
   - **Specializations**: Epilepsy and Sleep.
   - Husain Mairaj Aatif holds significant roles, including membership at the Duke Clinical Research Institute, enhancing his contributions to neurology.

2. **Aaron Feiger**:
   - **Position**: Assistant Professor of Psychiatry and Behavioral Sciences
   - **Focus**: Adult Psychiatry & Psychology.
   - His work emphasizes practical application and research in mental health.

3. **Adam I Perlman**:
   - **Position**: Adjunct Associate Professor in General Internal Medicine.
   - Perlman’s contrib

In [8]:
# Ask for professors plus their IDs in "hybrid" mode and print the answer.
# NOTE(review): in the recorded output the returned IDs (e.g. 3 for
# A. Craig Burnside) do not match the sheet's id column (331721 for the same
# person) -- verify IDs against the source data before relying on them.
hybrid_params = QueryParam(mode="hybrid")
hybrid_answer = rag.query(
    "Tell me about the professors, give me their ID as well?",
    param=hybrid_params,
)
print(hybrid_answer)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:lightrag:Local query uses 60 entites, 36 relations, 3 text units
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:lightrag:Global query uses 76 entites, 60 relations, 3 text units
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


### Professors at Duke University

Duke University has a diverse faculty with numerous professors across various disciplines. Here's a selection of professors, including their IDs and areas of expertise:

#### Academic Faculty Members

1. **A. Craig Burnside**
   - **ID:** 3
   - **Position:** Professor of Economics
   - **Department:** Economics, Trinity College of Arts & Sciences

2. **Adam I Perlman**
   - **ID:** 27
   - **Position:** Adjunct Associate Professor
   - **Department:** Medicine, General Internal Medicine, School of Medicine

3. **Aaron D. Franklin**
   - **ID:** 30
   - **Position:** Associate Dean for Faculty Affairs and Professor
   - **Departments:** Electrical and Computer Engineering, Chemistry, Pratt School of Engineering

4. **Abigail Hannah Melnick**
   - **ID:** 40
   - **Position:** Assistant Professor of Anesthesiology
   - **Department:** School of Medicine

5. **Aabhu Benmamoun**
   - **ID:** 29
   - **Position:** Director of the Linguistics Program and P

In [7]:
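# NOTE: This entire cell is disabled (every line below is commented out).
# It is an optional export script: it converts the GraphML graph stored in
# WORKING_DIR to JSON and loads the nodes and edges into Neo4j in batches.
# The Cypher queries call APOC procedures, and the connection settings are read
# from the neo4j_uri / neo4j_user / neo4j_password environment variables.
# import os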
# import json
# from lightrag.utils import xml_to_json
# from neo4j import GraphDatabase

# from dotenv import load_dotenv


# # Load environment variables from .env file
# load_dotenv()



# # Constants
# WORKING_DIR = "./data/Affiliations/"
# BATCH_SIZE_NODES = 500
# BATCH_SIZE_EDGES = 100

# # Neo4j connection credentials
# NEO4J_URI = os.getenv("neo4j_uri")
# NEO4J_USERNAME = os.getenv("neo4j_user")
# NEO4J_PASSWORD = os.getenv("neo4j_password")

# def convert_xml_to_json(xml_path, output_path):
#     """Converts XML file to JSON and saves the output."""
#     if not os.path.exists(xml_path):
#         print(f"Error: File not found - {xml_path}")
#         return None

#     json_data = xml_to_json(xml_path)
#     if json_data:
#         with open(output_path, 'w', encoding='utf-8') as f:
#             json.dump(json_data, f, ensure_ascii=False, indent=2)
#         print(f"JSON file created: {output_path}")
#         return json_data
#     else:
#         print("Failed to create JSON data")
#         return None

# def process_in_batches(tx, query, data, batch_size):
#     """Process data in batches and execute the given query."""
#     for i in range(0, len(data), batch_size):
#         batch = data[i:i + batch_size]
#         tx.run(query, {"nodes": batch} if "nodes" in query else {"edges": batch})

# def main():
#     # Paths
#     xml_file = os.path.join(WORKING_DIR, 'graph_chunk_entity_relation.graphml')
#     json_file = os.path.join(WORKING_DIR, 'graph_data.json')

#     # Convert XML to JSON
#     json_data = convert_xml_to_json(xml_file, json_file)
#     if json_data is None:
#         return

#     # Load nodes and edges
#     nodes = json_data.get('nodes', [])
#     edges = json_data.get('edges', [])

#     # Neo4j queries
#     create_nodes_query = """
#     UNWIND $nodes AS node
#     MERGE (e:Entity {id: node.id})
#     SET e.entity_type = node.entity_type,
#         e.description = node.description,
#         e.source_id = node.source_id,
#         e.displayName = node.id
#     REMOVE e:Entity
#     WITH e, node
#     CALL apoc.create.addLabels(e, [node.entity_type]) YIELD node AS labeledNode
#     RETURN count(*)
#     """

#     create_edges_query = """
#     UNWIND $edges AS edge
#     MATCH (source {id: edge.source})
#     MATCH (target {id: edge.target})
#     WITH source, target, edge,
#          CASE
#             WHEN edge.keywords CONTAINS 'lead' THEN 'lead'
#             WHEN edge.keywords CONTAINS 'participate' THEN 'participate'
#             WHEN edge.keywords CONTAINS 'uses' THEN 'uses'
#             WHEN edge.keywords CONTAINS 'located' THEN 'located'
#             WHEN edge.keywords CONTAINS 'occurs' THEN 'occurs'
#            ELSE REPLACE(SPLIT(edge.keywords, ',')[0], '\"', '')
#          END AS relType
#     CALL apoc.create.relationship(source, relType, {
#       weight: edge.weight,
#       description: edge.description,
#       keywords: edge.keywords,
#       source_id: edge.source_id
#     }, target) YIELD rel
#     RETURN count(*)
#     """

#     set_displayname_and_labels_query = """
#     MATCH (n)
#     SET n.displayName = n.id
#     WITH n
#     CALL apoc.create.setLabels(n, [n.entity_type]) YIELD node
#     RETURN count(*)
#     """

#     # Create a Neo4j driver
#     driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

#     try:
#         # Execute queries in batches
#         with driver.session() as session:
#             # Insert nodes in batches
#             session.execute_write(process_in_batches, create_nodes_query, nodes, BATCH_SIZE_NODES)

#             # Insert edges in batches
#             session.execute_write(process_in_batches, create_edges_query, edges, BATCH_SIZE_EDGES)

#             # Set displayName and labels
#             session.run(set_displayname_and_labels_query)

#     except Exception as e:
#         print(f"Error occurred: {e}")

#     finally:
#         driver.close()

# if __name__ == "__main__":
#     main()