In [42]:
# Load libraries
import pinecone
from tqdm.autonotebook import tqdm
from openai import OpenAI
import pandas as pd
import numpy as np
import re
import os
from transformers import pipeline
import textwrap

In [43]:
# Read your OpenAI and Pinecone keys from the environment
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') 
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENV = os.getenv('PINECONE_API_ENV')

In [3]:
# this code is generated by the Domino Code Assist toolbar button
import domino_code_assist as dca
dca.init()

In [44]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)
index_name = "medical-qa-search"
index = pinecone.Index(index_name)

In [45]:
# Read the medical diagnostic data into a dataframe

df = pd.read_csv("/mnt/code/data/disease_components.csv")
df.head()

Unnamed: 0,name,link,Symptoms,Overview,Causes,Risk factors,diagnosis
0,Acanthosis nigricans,https://www.mayoclinic.org/diseases-conditions...,"[""Skin changes are the only signs of acanthosi...","['', 'Acanthosis nigricans is a skin condition...",['Acanthosis nigricans has been associated wit...,"['Acanthosis nigricans risk factors include:',...",['Acanthosis nigricans is typically detected d...
1,Achalasia,https://www.mayoclinic.org/diseases-conditions...,['Achalasia symptoms generally appear graduall...,['Achalasia is a rare disorder that makes it d...,['The exact cause of achalasia is poorly under...,,['Achalasia can be overlooked or misdiagnosed ...
2,Achilles tendon rupture,https://www.mayoclinic.org/diseases-conditions...,"[""Although it's possible to have no signs or s...","['', 'The Achilles tendon is a strong fibrous ...",['Your Achilles tendon helps you point your fo...,['Factors that may increase your risk of Achil...,"['During the physical exam, your doctor will i..."
3,Acute coronary syndrome,https://www.mayoclinic.org/diseases-conditions...,['The signs and symptoms of acute coronary syn...,['Acute coronary syndrome is a term used to de...,['Acute coronary syndrome usually results from...,['The risk factors for acute coronary syndrome...,['If you have signs or symptoms associated wit...
4,Adenomyosis,https://www.mayoclinic.org/diseases-conditions...,"['Sometimes, adenomyosis causes no signs or sy...","['', 'With adenomyosis, the same tissue that l...","[""The cause of adenomyosis isn't known. There ...","['Risk factors for adenomyosis include:', 'Mos...",['Some other uterine conditions can cause sign...


In [46]:
# Data tansformation with DCA - removing NaN values from data
df_sym = df.loc[df["Symptoms"].notna()]
df_ovr = df_sym.loc[df_sym["Overview"].notna()]
df_causes = df_ovr.loc[df_ovr["Causes"].notna()]
df_rf = df_causes.loc[df_causes["Risk factors"].notna()]
df_ppNaN = df_rf.loc[df_rf["diagnosis"].notna()]
df_ppNaN

Unnamed: 0,name,link,Symptoms,Overview,Causes,Risk factors,diagnosis
0,Acanthosis nigricans,https://www.mayoclinic.org/diseases-conditions...,"[""Skin changes are the only signs of acanthosi...","['', 'Acanthosis nigricans is a skin condition...",['Acanthosis nigricans has been associated wit...,"['Acanthosis nigricans risk factors include:',...",['Acanthosis nigricans is typically detected d...
2,Achilles tendon rupture,https://www.mayoclinic.org/diseases-conditions...,"[""Although it's possible to have no signs or s...","['', 'The Achilles tendon is a strong fibrous ...",['Your Achilles tendon helps you point your fo...,['Factors that may increase your risk of Achil...,"['During the physical exam, your doctor will i..."
3,Acute coronary syndrome,https://www.mayoclinic.org/diseases-conditions...,['The signs and symptoms of acute coronary syn...,['Acute coronary syndrome is a term used to de...,['Acute coronary syndrome usually results from...,['The risk factors for acute coronary syndrome...,['If you have signs or symptoms associated wit...
4,Adenomyosis,https://www.mayoclinic.org/diseases-conditions...,"['Sometimes, adenomyosis causes no signs or sy...","['', 'With adenomyosis, the same tissue that l...","[""The cause of adenomyosis isn't known. There ...","['Risk factors for adenomyosis include:', 'Mos...",['Some other uterine conditions can cause sign...
5,Adjustment disorders,https://www.mayoclinic.org/diseases-conditions...,['Signs and symptoms depend on the type of adj...,['Adjustment disorders are stress-related cond...,['Adjustment disorders are caused by significa...,['Some things may make you more likely to have...,['Diagnosis of adjustment disorders is based o...
...,...,...,...,...,...,...,...
1176,Yellow fever,https://www.mayoclinic.org/diseases-conditions...,"[""During the first three to six days after you...",['Yellow fever is a viral infection spread by ...,['Yellow fever is caused by a virus that is sp...,['You may be at risk of the disease if you tra...,['Diagnosing yellow fever based on signs and s...
1177,"Yersinia pestis (See: Yersinia pestis, also kn...",https://www.mayoclinic.org/diseases-conditions...,['Plague is divided into three main types — bu...,"[""Plague is a serious bacterial infection that...","['The plague bacteria, Yersinia pestis, is tra...",['The risk of developing plague is very low. W...,"['If your doctor suspects plague, he or she ma..."
1179,Zika virus,https://www.mayoclinic.org/diseases-conditions...,['As many as 4 out of 5 people infected with t...,['The mosquito that carries the Zika virus is ...,['The Zika virus is transmitted primarily thro...,['Factors that put you at greater risk of deve...,['The mosquito that carries the Zika virus is ...
1180,Zollinger-Ellison syndrome,https://www.mayoclinic.org/diseases-conditions...,['Signs and symptoms of Zollinger-Ellison synd...,['Zollinger-Ellison syndrome is a rare conditi...,"['', 'The pancreas is a large organ that lies ...","[""If you have a blood relative, such as a sibl...",['Your doctor will base a diagnosis on the fol...


In [48]:
# Helper function to summarize large column to conform to Pinecone metadata upsert limits
# Initialize summarization pipeline
summary = pipeline('summarization')
def summarize_text(column_text):
  if len(column_text) > 10000:
    final_list = []
    chunks =  textwrap.wrap(column_text,1000)
    for chunk in chunks:
      final_list.append(summary(chunk)[0]['summary_text'])
    final_summary = " ".join(final_list)
    return final_summary
  else:
    return column_text

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [49]:
# Summarize large columns
df_ppNaN['Symptoms'] = df_ppNaN['Symptoms'].map(summarize_text)
df_ppNaN['Overview'] = df_ppNaN['Overview'].map(summarize_text)
df_ppNaN['Causes'] = df_ppNaN['Causes'].map(summarize_text)
df_ppNaN['Risk factors'] = df_ppNaN['Risk factors'].map(summarize_text)
df_ppNaN['diagnosis'] = df_ppNaN['diagnosis'].map(summarize_text)

Your max_length is set to 142, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Your max_length is set to 142, but your input_length is only 52. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. su

In [50]:

df_ppNaN.head()


Unnamed: 0,name,link,Symptoms,Overview,Causes,Risk factors,diagnosis
0,Acanthosis nigricans,https://www.mayoclinic.org/diseases-conditions...,"[""Skin changes are the only signs of acanthosi...","['', 'Acanthosis nigricans is a skin condition...",['Acanthosis nigricans has been associated wit...,"['Acanthosis nigricans risk factors include:',...",['Acanthosis nigricans is typically detected d...
2,Achilles tendon rupture,https://www.mayoclinic.org/diseases-conditions...,"[""Although it's possible to have no signs or s...","['', 'The Achilles tendon is a strong fibrous ...",['Your Achilles tendon helps you point your fo...,['Factors that may increase your risk of Achil...,"['During the physical exam, your doctor will i..."
3,Acute coronary syndrome,https://www.mayoclinic.org/diseases-conditions...,['The signs and symptoms of acute coronary syn...,['Acute coronary syndrome is a term used to de...,['Acute coronary syndrome usually results from...,['The risk factors for acute coronary syndrome...,['If you have signs or symptoms associated wit...
4,Adenomyosis,https://www.mayoclinic.org/diseases-conditions...,"['Sometimes, adenomyosis causes no signs or sy...","['', 'With adenomyosis, the same tissue that l...","[""The cause of adenomyosis isn't known. There ...","['Risk factors for adenomyosis include:', 'Mos...",['Some other uterine conditions can cause sign...
5,Adjustment disorders,https://www.mayoclinic.org/diseases-conditions...,['Signs and symptoms depend on the type of adj...,['Adjustment disorders are stress-related cond...,['Adjustment disorders are caused by significa...,['Some things may make you more likely to have...,['Diagnosis of adjustment disorders is based o...


In [53]:
# Batch for embedding and upsert into Pinecone  
embeddings = []
content = []
BATCH_SIZE = 32
for batch_start in range(0, len(df_ppNaN), BATCH_SIZE):
    batch_end = batch_start + BATCH_SIZE
    #get batch id
    batch_ids  = [str(batch_start) for batch_start in range(batch_start,batch_end)]
    symptoms_batch = df_ppNaN['Symptoms'].values.tolist()[batch_start:batch_end]
    link_batch = df_ppNaN['link'].values.tolist()[batch_start:batch_end]
    name_batch = df_ppNaN['name'].values.tolist()[batch_start:batch_end]
    overview_batch = df_ppNaN['Overview'].values.tolist()[batch_start:batch_end]
    causes_batch = df_ppNaN['Causes'].values.tolist()[batch_start:batch_end]
    risk_factors_batch = df_ppNaN['Risk factors'].values.tolist()[batch_start:batch_end]
    diagnosis_batch = df_ppNaN['diagnosis'].values.tolist()[batch_start:batch_end]
    batch_content = [{'symptoms':symptoms,'overview':overview,'causes':causes,'riskf':risk_factors,'diagnosis':diagnosis,'name':name,'url':link} for symptoms,overview,causes,risk_factors,diagnosis,name,link in zip(symptoms_batch,overview_batch,causes_batch,risk_factors_batch,diagnosis_batch,name_batch,link_batch)]
    print(f"Batch {batch_start} to {batch_end-1}")
    response = client.embeddings.create(model=model, input=symptoms_batch)
    for i, be in enumerate(response.data):
        assert i == be.index  # double check embeddings are in same order as input
    batch_embeddings = [e.embedding for e in response.data]
    embeddings.extend(batch_embeddings)
    content.extend(batch_content)
    #upsert to pinecone
    to_upsert = zip(batch_ids,batch_embeddings,batch_content)
    index.upsert(vectors=list(to_upsert))
    
index.describe_index_stats()    
dfUpsert = pd.DataFrame({"content": content, "embedding": embeddings})

Batch 0 to 31
Batch 32 to 63
Batch 64 to 95
Batch 96 to 127
Batch 128 to 159
Batch 160 to 191
Batch 192 to 223
Batch 224 to 255
Batch 256 to 287
Batch 288 to 319
Batch 320 to 351
Batch 352 to 383
Batch 384 to 415
Batch 416 to 447
Batch 448 to 479
Batch 480 to 511
Batch 512 to 543
Batch 544 to 575
Batch 576 to 607
Batch 608 to 639
Batch 640 to 671
Batch 672 to 703
Batch 704 to 735
Batch 736 to 767
Batch 768 to 799
Batch 800 to 831
Batch 832 to 863
Batch 864 to 895
Batch 896 to 927


In [54]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00909,
 'namespaces': {'': {'vector_count': 909}},
 'total_vector_count': 909}

In [55]:
dfUpsert.head()

Unnamed: 0,content,embedding
0,"{'symptoms': '[""Skin changes are the only sign...","[0.0010588630102574825, -0.008330042473971844,..."
1,"{'symptoms': '[""Although it's possible to have...","[-0.0017860711086541414, -0.007644194643944502..."
2,{'symptoms': '['The signs and symptoms of acut...,"[-0.013818983919918537, -0.004287558142095804,..."
3,"{'symptoms': '['Sometimes, adenomyosis causes ...","[-0.02348223328590393, -0.026248667389154434, ..."
4,{'symptoms': '['Signs and symptoms depend on t...,"[-0.01782934181392193, 0.006055957637727261, 0..."
