In [3]:
# Load libraries
import pinecone
from tqdm.autonotebook import tqdm
#from openai import OpenAI
import openai
import pandas as pd
import numpy as np
import re
import os
from transformers import pipeline
import textwrap
import warnings
warnings.filterwarnings('ignore')

In [15]:
# Read your OpenAI and Pinecone keys from the environment
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') 
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENV = os.getenv('PINECONE_API_ENV')

openai.api_key = OPENAI_API_KEY

In [5]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

index_name = "medical-qa-search"
index = pinecone.Index(index_name)

In [6]:
# Read the medical diagnostic data into a dataframe

df = pd.read_csv("/mnt/code/data/disease_components.csv")
df.head()

Unnamed: 0,name,link,Symptoms,Overview,Causes,Risk factors,diagnosis
0,Acanthosis nigricans,https://www.mayoclinic.org/diseases-conditions...,"[""Skin changes are the only signs of acanthosi...","['', 'Acanthosis nigricans is a skin condition...",['Acanthosis nigricans has been associated wit...,"['Acanthosis nigricans risk factors include:',...",['Acanthosis nigricans is typically detected d...
1,Achalasia,https://www.mayoclinic.org/diseases-conditions...,['Achalasia symptoms generally appear graduall...,['Achalasia is a rare disorder that makes it d...,['The exact cause of achalasia is poorly under...,,['Achalasia can be overlooked or misdiagnosed ...
2,Achilles tendon rupture,https://www.mayoclinic.org/diseases-conditions...,"[""Although it's possible to have no signs or s...","['', 'The Achilles tendon is a strong fibrous ...",['Your Achilles tendon helps you point your fo...,['Factors that may increase your risk of Achil...,"['During the physical exam, your doctor will i..."
3,Acute coronary syndrome,https://www.mayoclinic.org/diseases-conditions...,['The signs and symptoms of acute coronary syn...,['Acute coronary syndrome is a term used to de...,['Acute coronary syndrome usually results from...,['The risk factors for acute coronary syndrome...,['If you have signs or symptoms associated wit...
4,Adenomyosis,https://www.mayoclinic.org/diseases-conditions...,"['Sometimes, adenomyosis causes no signs or sy...","['', 'With adenomyosis, the same tissue that l...","[""The cause of adenomyosis isn't known. There ...","['Risk factors for adenomyosis include:', 'Mos...",['Some other uterine conditions can cause sign...


In [7]:
dfFillNaN = df.fillna("none")
dfFillNaN.head()

Unnamed: 0,name,link,Symptoms,Overview,Causes,Risk factors,diagnosis
0,Acanthosis nigricans,https://www.mayoclinic.org/diseases-conditions...,"[""Skin changes are the only signs of acanthosi...","['', 'Acanthosis nigricans is a skin condition...",['Acanthosis nigricans has been associated wit...,"['Acanthosis nigricans risk factors include:',...",['Acanthosis nigricans is typically detected d...
1,Achalasia,https://www.mayoclinic.org/diseases-conditions...,['Achalasia symptoms generally appear graduall...,['Achalasia is a rare disorder that makes it d...,['The exact cause of achalasia is poorly under...,none,['Achalasia can be overlooked or misdiagnosed ...
2,Achilles tendon rupture,https://www.mayoclinic.org/diseases-conditions...,"[""Although it's possible to have no signs or s...","['', 'The Achilles tendon is a strong fibrous ...",['Your Achilles tendon helps you point your fo...,['Factors that may increase your risk of Achil...,"['During the physical exam, your doctor will i..."
3,Acute coronary syndrome,https://www.mayoclinic.org/diseases-conditions...,['The signs and symptoms of acute coronary syn...,['Acute coronary syndrome is a term used to de...,['Acute coronary syndrome usually results from...,['The risk factors for acute coronary syndrome...,['If you have signs or symptoms associated wit...
4,Adenomyosis,https://www.mayoclinic.org/diseases-conditions...,"['Sometimes, adenomyosis causes no signs or sy...","['', 'With adenomyosis, the same tissue that l...","[""The cause of adenomyosis isn't known. There ...","['Risk factors for adenomyosis include:', 'Mos...",['Some other uterine conditions can cause sign...


In [8]:
find = ["\[","\]","\'","\"","\,"]
replace = ['','','','','']

dfCleaned = dfFillNaN.replace(find, replace, regex=True)
dfCleaned.head()

Unnamed: 0,name,link,Symptoms,Overview,Causes,Risk factors,diagnosis
0,Acanthosis nigricans,https://www.mayoclinic.org/diseases-conditions...,Skin changes are the only signs of acanthosis ...,Acanthosis nigricans is a skin condition that...,Acanthosis nigricans has been associated with:...,Acanthosis nigricans risk factors include: ...,Acanthosis nigricans is typically detected dur...
1,Achalasia,https://www.mayoclinic.org/diseases-conditions...,Achalasia symptoms generally appear gradually ...,Achalasia is a rare disorder that makes it dif...,The exact cause of achalasia is poorly underst...,none,Achalasia can be overlooked or misdiagnosed be...
2,Achilles tendon rupture,https://www.mayoclinic.org/diseases-conditions...,Although its possible to have no signs or symp...,The Achilles tendon is a strong fibrous cord ...,Your Achilles tendon helps you point your foot...,Factors that may increase your risk of Achille...,During the physical exam your doctor will insp...
3,Acute coronary syndrome,https://www.mayoclinic.org/diseases-conditions...,The signs and symptoms of acute coronary syndr...,Acute coronary syndrome is a term used to desc...,Acute coronary syndrome usually results from t...,The risk factors for acute coronary syndrome a...,If you have signs or symptoms associated with ...
4,Adenomyosis,https://www.mayoclinic.org/diseases-conditions...,Sometimes adenomyosis causes no signs or sympt...,With adenomyosis the same tissue that lines t...,The cause of adenomyosis isnt known. There hav...,Risk factors for adenomyosis include: Most cas...,Some other uterine conditions can cause signs ...


In [9]:
# Helper function to summarize large column to conform to Pinecone metadata upsert limits
# Initialize summarization pipeline
summary = pipeline('summarization')
def summarize_text(column_text):
  if len(column_text) > 10000:
    final_list = []
    chunks =  textwrap.wrap(column_text,1000)
    for chunk in chunks:
      final_list.append(summary(chunk)[0]['summary_text'])
    final_summary = " ".join(final_list)
    return final_summary
  else:
    return column_text

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [10]:
# Summarize large columns
dfCleaned['Symptoms'] = dfCleaned['Symptoms'].map(summarize_text)
dfCleaned['Overview'] = dfCleaned['Overview'].map(summarize_text)
dfCleaned['Causes'] = dfCleaned['Causes'].map(summarize_text)
dfCleaned['Risk factors'] = dfCleaned['Risk factors'].map(summarize_text)
dfCleaned['diagnosis'] = dfCleaned['diagnosis'].map(summarize_text)

Your max_length is set to 142, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)
Your max_length is set to 142, but your input_length is only 8. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


In [11]:

dfCleaned.head()


Unnamed: 0,name,link,Symptoms,Overview,Causes,Risk factors,diagnosis
0,Acanthosis nigricans,https://www.mayoclinic.org/diseases-conditions...,Skin changes are the only signs of acanthosis ...,Acanthosis nigricans is a skin condition that...,Acanthosis nigricans has been associated with:...,Acanthosis nigricans risk factors include: ...,Acanthosis nigricans is typically detected dur...
1,Achalasia,https://www.mayoclinic.org/diseases-conditions...,Achalasia symptoms generally appear gradually ...,Achalasia is a rare disorder that makes it dif...,The exact cause of achalasia is poorly underst...,none,Achalasia can be overlooked or misdiagnosed be...
2,Achilles tendon rupture,https://www.mayoclinic.org/diseases-conditions...,Although its possible to have no signs or symp...,The Achilles tendon is a strong fibrous cord ...,Your Achilles tendon helps you point your foot...,Factors that may increase your risk of Achille...,During the physical exam your doctor will insp...
3,Acute coronary syndrome,https://www.mayoclinic.org/diseases-conditions...,The signs and symptoms of acute coronary syndr...,Acute coronary syndrome is a term used to desc...,Acute coronary syndrome usually results from t...,The risk factors for acute coronary syndrome a...,If you have signs or symptoms associated with ...
4,Adenomyosis,https://www.mayoclinic.org/diseases-conditions...,Sometimes adenomyosis causes no signs or sympt...,With adenomyosis the same tissue that lines t...,The cause of adenomyosis isnt known. There hav...,Risk factors for adenomyosis include: Most cas...,Some other uterine conditions can cause signs ...


In [23]:
# Batch for embedding and upsert into Pinecone  
model = "text-embedding-ada-002"
embeddings = []
texts = []
BATCH_SIZE = 32
for batch_start in range(0, len(dfCleaned), BATCH_SIZE):
    batch_end = batch_start + BATCH_SIZE
    #get batch id
    batch_ids  = [str(batch_start) for batch_start in range(batch_start,batch_end)]
    symptoms_batch = dfCleaned['Symptoms'].values.tolist()[batch_start:batch_end]
    link_batch = dfCleaned['link'].values.tolist()[batch_start:batch_end]
    name_batch = dfCleaned['name'].values.tolist()[batch_start:batch_end]
    overview_batch = dfCleaned['Overview'].values.tolist()[batch_start:batch_end]
    causes_batch = dfCleaned['Causes'].values.tolist()[batch_start:batch_end]
    risk_factors_batch = dfCleaned['Risk factors'].values.tolist()[batch_start:batch_end]
    diagnosis_batch = dfCleaned['diagnosis'].values.tolist()[batch_start:batch_end]
    batch_text = [{'name':name,'symptoms':symptoms,'overview':overview,'causes':causes,'riskf':risk_factors,'diagnosis':diagnosis,'link':link} for name,symptoms,overview,causes,risk_factors,diagnosis,link in zip(name_batch,symptoms_batch,overview_batch,causes_batch,risk_factors_batch,diagnosis_batch,link_batch)]
    print(f"Batch {batch_start} to {batch_end-1}")
    #response = client.embeddings.create(model=model, input=batch_text)
    response = openai.Embedding.create(input=symptoms_batch,engine=model)
    for i, be in enumerate(response.data):
        assert i == be.index  # double check embeddings are in same order as input
    #batch_embeddings = [e.embedding for e in response.data]
    batch_embeddings = [ record['embedding'] for record in response['data']] 
    embeddings.extend(batch_embeddings)
    texts.extend(batch_text)
    #upsert to pinecone
    to_upsert = zip(batch_ids,batch_embeddings,batch_text)
    index.upsert(vectors=list(to_upsert))
    
index.describe_index_stats()    
dfUpsert = pd.DataFrame({"text": texts, "embedding": embeddings})

Batch 0 to 31
Batch 32 to 63
Batch 64 to 95
Batch 96 to 127
Batch 128 to 159
Batch 160 to 191
Batch 192 to 223
Batch 224 to 255
Batch 256 to 287
Batch 288 to 319
Batch 320 to 351
Batch 352 to 383
Batch 384 to 415
Batch 416 to 447
Batch 448 to 479
Batch 480 to 511
Batch 512 to 543
Batch 544 to 575
Batch 576 to 607
Batch 608 to 639
Batch 640 to 671
Batch 672 to 703
Batch 704 to 735
Batch 736 to 767
Batch 768 to 799
Batch 800 to 831
Batch 832 to 863
Batch 864 to 895
Batch 896 to 927
Batch 928 to 959
Batch 960 to 991
Batch 992 to 1023
Batch 1024 to 1055
Batch 1056 to 1087
Batch 1088 to 1119
Batch 1120 to 1151
Batch 1152 to 1183


In [24]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.01152,
 'namespaces': {'': {'vector_count': 1152}},
 'total_vector_count': 1152}

In [25]:
dfUpsert.head()

Unnamed: 0,text,embedding
0,"{'name': 'Acanthosis nigricans', 'symptoms': '...","[0.0018646994139999151, -0.005439898930490017,..."
1,"{'name': 'Achalasia', 'symptoms': 'Achalasia s...","[0.007332390174269676, 0.009587530046701431, 0..."
2,"{'name': 'Achilles tendon rupture', 'symptoms'...","[-0.004021851345896721, -0.006759998854249716,..."
3,"{'name': 'Acute coronary syndrome', 'symptoms'...","[-0.014695934019982815, -0.007088254205882549,..."
4,"{'name': 'Adenomyosis', 'symptoms': 'Sometimes...","[-0.024685872718691826, -0.024969616904854774,..."
