In [2]:
!pip install langchain
!pip install langchain_community
!pip install openai
!pip install tiktoken
!pip install sentence_transformers

Collecting langchain
  Downloading langchain-0.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_core-0.3.2-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.125-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4.0,>=0.3.0->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting httpx<1,>=0.23.0 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.7-cp310-cp310-ma

In [3]:
!pip install sentence-transformers qdrant_client


Collecting qdrant_client
  Downloading qdrant_client-1.11.2-py3-none-any.whl.metadata (10 kB)
Collecting grpcio-tools>=1.41.0 (from qdrant_client)
  Downloading grpcio_tools-1.66.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.3 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant_client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-tools>=1.41.0->qdrant_client)
  Downloading protobuf-5.28.2-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting grpcio>=1.41.0 (from qdrant_client)
  Downloading grpcio-1.66.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting h2<5,>=3 (from httpx[http2]>=0.20.0->qdrant_client)
  Downloading h2-4.1.0-py3-none-any.whl.metadata (3.6 kB)
Collecting hyperframe<7,>=6.0 (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant_client)
  Downloading hyperframe-6.0.1-py3-none-any.whl.metadata (2.7 kB)
Collecting hpack<5,>=4.0 (fro

In [4]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams

# Load a pre-trained model for sentence embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Example original dataset with conceptually related terms
original_data = [
    {"data_id": 1, "description": "alcohol abuse", "category": "Substance Use Disorder"},
    {"data_id": 2, "description": "nicotine addiction", "category": "Substance Use Disorder"},

    {"data_id": 3, "description": "heart attack", "category": "Cardiovascular Health"},
    {"data_id": 4, "description": "cardiomyopathy", "category": "Cardiovascular Health"},

    {"data_id": 5, "description": "diabetes", "category": "Diabetes"},
    {"data_id": 6, "description": "hypertension", "category": "Diabetes"},

    {"data_id": 7, "description": "depression", "category": "Mental Health"},
    {"data_id": 8, "description": "anxiety", "category": "Mental Health"},

    {"data_id": 9, "description": "arthritis", "category": "Arthritis"},
    {"data_id": 10, "description": "joint pain", "category": "Arthritis"},

    {"data_id": 11, "description": "asthma", "category": "Asthma"},
    {"data_id": 12, "description": "chronic asthma", "category": "Asthma"},

    {"data_id": 13, "description": "chronic kidney disease", "category": "Chronic Kidney Disease"},
    {"data_id": 14, "description": "kidney stones", "category": "Chronic Kidney Disease"},

    {"data_id": 15, "description": "chronic respiratory disease", "category": "Chronic Respiratory Disease"},
    {"data_id": 16, "description": "pneumonia", "category": "Chronic Respiratory Disease"},

    {"data_id": 17, "description": "chronic liver disease", "category": "Chronic Liver Disease"},
    {"data_id": 18, "description": "gallstones", "category": "Chronic Liver Disease"},

    {"data_id": 19, "description": "chronic kidney disease", "category": "Chronic Kidney Disease"},
    {"data_id": 20, "description": "kidney stones", "category": "Chronic Kidney Disease"},

    {"data_id": 21, "description": "lack of transportation", "category": "Socioeconomic Determinants of Health"},
    {"data_id": 22, "description": "lack of healthcare access", "category": "Socioeconomic Determinants of Health"},

    {"data_id": 23, "description": "lack of exercise", "category": "Socioeconomic Determinants of Health"},
    {"data_id": 24, "description": "lack of physical activity", "category": "Socioeconomic Determinants of Health"}

    # Add more entries...
]

# Initialize Qdrant client
client = QdrantClient(":memory:")

# Define the vector dimension
vector_dimension = model.get_sentence_embedding_dimension()

# Create the collection before inserting points
client.recreate_collection(
    collection_name="data_mapping",
    vectors_config=VectorParams(size=vector_dimension, distance="Cosine")
)

# Create embeddings and store them in Qdrant
for data in original_data:
    embedding = model.encode(data["description"]).tolist()  # Convert to list for Qdrant
    point = PointStruct(
        id=data["data_id"],
        vector=embedding,
        payload={"category": data["category"]}
    )
    client.upsert(
        collection_name="data_mapping",
        points=[point]
    )


  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  client.recreate_collection(


In [5]:
# Example new dataset with conceptually related descriptions
new_data = [
    {"data_id": 101, "description": "cocaine dependence"},
    {"data_id": 102, "description": "heroin use"},
    {"data_id": 103, "description": "alcohol addiction"},
    {"data_id": 104, "description": "travel to clinic"},
    {"data_id": 105, "description": "excess sugar"},
    # Add more entries...
]

# Embed new dataset and query Qdrant for the closest match
for data in new_data:
    new_embedding = model.encode(data["description"]).tolist()

    # Search in Qdrant for the most similar embedding
    result = client.search(
        collection_name="data_mapping",
        query_vector=new_embedding,
        limit=1  # Get the closest match
    )

    if result:
        closest_match = result[0]
        data["category"] = closest_match.payload["category"]
        print(f"Assigned category for '{data['description']}': {data['category']}")


Assigned category for 'cocaine dependence': Substance Use Disorder
Assigned category for 'heroin use': Substance Use Disorder
Assigned category for 'alcohol addiction': Substance Use Disorder
Assigned category for 'travel to clinic': Socioeconomic Determinants of Health
Assigned category for 'excess sugar': Diabetes


In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Assuming we have embeddings of original and new data
def compute_similarity(embedding1, embedding2):
    return cosine_similarity([embedding1], [embedding2])[0][0]

# Thresholds for similarity categories
def categorize_similarity(similarity_score):
    if similarity_score > 0.9:
        return "equivalent"
    elif similarity_score > 0.7:
        return "related"
    elif similarity_score > 0.4:
        return "not-so-related"
    else:
        return "not related at all"


In [7]:
!pip install langchain_openai

Collecting langchain_openai
  Downloading langchain_openai-0.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading langchain_openai-0.2.0-py3-none-any.whl (51 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.5/51.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain_openai
Successfully installed langchain_openai-0.2.0


In [8]:
from langchain_openai import ChatOpenAI
from google.colab import userdata

# userdata.get('secretName')
# Initialize the ChatOpenAI model (GPT-4)
llm = ChatOpenAI(model="gpt-4", openai_api_key=userdata.get('OPENAI_API_KEY'))

# Function to categorize the relationship between two descriptions
def llm_categorize_relationship(description1, description2):
    prompt = f"""
    Given the following two descriptions:
    1. "{description1}"
    2. "{description2}"

    Categorize the relationship between these descriptions into one of the following categories:

    - Equivalent: The descriptions refer to the same concept or are nearly identical in meaning.
    - Related: The descriptions refer to closely related concepts, even though they are not exactly the same.
    - Not-so-related: The descriptions share some commonality but refer to largely different concepts or contexts.
    - Not related at all: The descriptions refer to completely different concepts or contexts with no meaningful connection.

    Based on the content of the descriptions and the provided criteria, return the most appropriate category.
    """

    # Prepare the input for the chat model
    input_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]

    # Invoke the model with the input messages
    response = llm.invoke(input_messages)

    # Since response is an AIMessage object, access the 'content' attribute directly
    result_category = response.content.strip()

    # Format the output as "description1 - description2 - result_category"
    return f"{description1} - {description2} - {result_category}"

# Example descriptions
description1 = "alcohol abuse"
description2 = "addictive personality"

# Ask LLM to categorize the relationship between the descriptions
structured_output = llm_categorize_relationship(description1, description2)
print(structured_output)


alcohol abuse - addictive personality - Related


In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from langchain_openai import ChatOpenAI

# Initialize the ChatOpenAI model (GPT-4)
llm = ChatOpenAI(model="gpt-4", openai_api_key=userdata.get('OPENAI_API_KEY'))

# Function to compute cosine similarity between two embeddings
def compute_similarity(embedding1, embedding2):
    return cosine_similarity([embedding1], [embedding2])[0][0]

# Function to categorize the relationship based on the similarity score
def categorize_similarity(similarity_score):
    if similarity_score > 0.9:
        return "equivalent"
    elif similarity_score > 0.7:
        return "related"
    elif similarity_score > 0.4:
        return "not-so-related"
    else:
        return "not related at all"

# Function to categorize the relationship between two descriptions using LLM and similarity score
def llm_categorize_relationship(description1, description2, similarity_score):
    # Categorize based on the similarity score first
    result_category = categorize_similarity(similarity_score)

    # Optionally, you can ask the LLM to refine the categorization using the context of the descriptions
    prompt = f"""
    Given the following two descriptions:
    1. "{description1}"
    2. "{description2}"

    Based on the similarity score of {similarity_score}, and the following category criteria:

    - Equivalent: The descriptions refer to the same concept or are nearly identical in meaning.
    - Related: The descriptions refer to closely related concepts, even though they are not exactly the same.
    - Not-so-related: The descriptions share some commonality but refer to largely different concepts or contexts.
    - Not related at all: The descriptions refer to completely different concepts or contexts with no meaningful connection.

    The current categorization based on the similarity score is "{result_category}". Refine or confirm this categorization based on the content of the descriptions.
    """

    # Prepare the input for the chat model
    input_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]

    # Invoke the model with the input messages
    response = llm.invoke(input_messages)

    # Get the LLM's refined category
    refined_category = response.content.strip()

    # Return the structured output: description1 - description2 - result_category - similarity_score
    return f"{description1} - {description2} - {refined_category} - Similarity score: {similarity_score:.2f}"

# Example usage
# Example embeddings (replace these with actual embeddings from your embedding model)
embedding1 = np.random.rand(768)  # Simulating a 768-dimensional embedding for description1
embedding2 = np.random.rand(768)  # Simulating a 768-dimensional embedding for description2

# Descriptions
description1 = "alcohol abuse"
description2 = "anxiety disorder"

# Compute similarity score
similarity_score = compute_similarity(embedding1, embedding2)

# Ask LLM to categorize the relationship and generate the structured output
structured_output = llm_categorize_relationship(description1, description2, similarity_score)
print(structured_output)


alcohol abuse - anxiety disorder - The categorization of "related" is accurate. While "alcohol abuse" and "anxiety disorder" are not the same, they are closely related concepts in the field of mental health. Often, individuals suffering from anxiety disorders may turn to substances such as alcohol as a form of self-medication, leading to abuse. Conversely, chronic alcohol abuse can also lead to the development of anxiety disorders. Therefore, these two descriptions are indeed related. - Similarity score: 0.74
