In [None]:
# Load libraries
import pinecone
from tqdm.autonotebook import tqdm

import openai
import pandas as pd
import numpy as np
import re
import os
from transformers import pipeline
import textwrap
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Pull the OpenAI and Pinecone credentials from environment variables.
# Each lookup yields None when its variable is not set.

OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_ENV = os.environ.get('PINECONE_API_ENV')

# Hand the key to the openai module, which a later cell calls for embeddings.
openai.api_key = OPENAI_API_KEY

In [None]:
# Connect to Pinecone and open a handle to the index used by this notebook

index_name = "medical-qa-search"

pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
index = pinecone.Index(index_name)

In [None]:
# Read the medical diagnostic data into a dataframe.
# Later cells read these columns from it: "name", "link", "Symptoms",
# "Overview", "Causes", "Risk factors" and "diagnosis".

df = pd.read_csv("/mnt/code/data/disease_components.csv")
df.head()

In [None]:
# Replace every missing value with the literal string "none"; the result is
# bound to a new name and the cleaning cell that follows works on it.
dfFillNaN = df.fillna("none")
dfFillNaN.head()

In [None]:
# Strip list-literal punctuation -- [ ] ' " , -- from every string cell.
# Each entry of `find` is a regular expression. Raw strings are used where a
# backslash is needed so it reaches the regex engine unchanged (a plain "\["
# is an invalid string escape and triggers a warning on recent Python
# versions); the quote and comma characters need no escaping at all.
find = [r"\[", r"\]", "'", '"', ","]
# One empty replacement per pattern, i.e. every match is deleted.
replace = [''] * len(find)

dfCleaned = dfFillNaN.replace(find, replace, regex=True)
dfCleaned.head()

In [None]:
# Data transformation with DCA - drop rows holding NaN in any of the key text
# columns, one column per step so every intermediate frame stays inspectable
df_sym = df.dropna(subset=["Symptoms"])
df_ovr = df_sym.dropna(subset=["Overview"])
df_causes = df_ovr.dropna(subset=["Causes"])
df_rf = df_causes.dropna(subset=["Risk factors"])
df_ppNaN = df_rf.dropna(subset=["diagnosis"])
df_ppNaN

In [None]:
# Summarization pipeline used by summarize_text (defined next) to shorten large
# column text so it conforms to Pinecone metadata upsert limits.
# No model is passed, so whichever model the library selects for the
# 'summarization' task is used -- NOTE(review): pin a model if reproducibility matters.
summary = pipeline('summarization')
def summarize_text(column_text, max_chars=10000, chunk_chars=1000):
    """Shorten oversized text by summarizing it chunk by chunk.

    Text of at most ``max_chars`` characters is returned unchanged. Longer
    text is wrapped into pieces of at most ``chunk_chars`` characters, each
    piece is passed to the module-level ``summary`` pipeline, and the
    resulting summaries are joined with single spaces.

    Args:
        column_text: Cell value to process. Non-string values (for example
            NaN or None from missing data) are returned untouched.
        max_chars: Length above which the text is summarized.
        chunk_chars: Maximum width of each piece handed to the summarizer.

    Returns:
        The original value, or the space-joined chunk summaries.
    """
    # Missing cells can arrive as NaN/None rather than str; there is nothing
    # to summarize, so return them as-is instead of failing on len().
    if not isinstance(column_text, str):
        return column_text
    if len(column_text) <= max_chars:
        return column_text
    chunks = textwrap.wrap(column_text, chunk_chars)
    return " ".join(summary(chunk)[0]['summary_text'] for chunk in chunks)

In [None]:
# Run summarize_text over each long free-text column, overwriting the column
for text_column in ('Symptoms', 'Overview', 'Causes', 'Risk factors', 'diagnosis'):
    dfCleaned[text_column] = dfCleaned[text_column].map(summarize_text)

In [None]:

# Preview the cleaned frame after the summarization step, before embedding
dfCleaned.head()


In [None]:
# Batch for embedding and upsert into Pinecone
model = "text-embedding-ada-002"
BATCH_SIZE = 32

# Pinecone metadata key -> dfCleaned column it is read from (order preserved).
metadata_columns = {
    'name': 'name',
    'symptoms': 'Symptoms',
    'overview': 'Overview',
    'causes': 'Causes',
    'riskf': 'Risk factors',
    'diagnosis': 'diagnosis',
    'link': 'link',
}

embeddings = []
texts = []

# Convert each column to a plain list once instead of on every iteration.
num_rows = len(dfCleaned)
symptoms_all = dfCleaned['Symptoms'].values.tolist()
metadata_values = {
    key: dfCleaned[column].values.tolist()
    for key, column in metadata_columns.items()
}

for batch_start in range(0, num_rows, BATCH_SIZE):
    # Clamp the final batch so the ids and the log line never run past the data.
    batch_end = min(batch_start + BATCH_SIZE, num_rows)
    # Vector ids are the row positions, as strings.
    batch_ids = [str(row) for row in range(batch_start, batch_end)]
    symptoms_batch = symptoms_all[batch_start:batch_end]
    batch_text = [
        {key: values[row] for key, values in metadata_values.items()}
        for row in range(batch_start, batch_end)
    ]
    print(f"Batch {batch_start} to {batch_end-1}")
    # Only the symptoms text is embedded; the other fields travel as metadata.
    response = openai.Embedding.create(input=symptoms_batch, engine=model)
    for i, record in enumerate(response['data']):
        assert i == record['index']  # double check embeddings are in same order as input
    batch_embeddings = [record['embedding'] for record in response['data']]
    # zip() below would silently drop rows on a length mismatch, so check first.
    assert len(batch_embeddings) == len(batch_ids)
    embeddings.extend(batch_embeddings)
    texts.extend(batch_text)
    # upsert to pinecone as (id, vector, metadata) tuples
    to_upsert = list(zip(batch_ids, batch_embeddings, batch_text))
    index.upsert(vectors=to_upsert)

# Keep a local copy of everything that was sent, one row per upserted vector.
dfUpsert = pd.DataFrame({"text": texts, "embedding": embeddings})

In [None]:
# Display the index statistics as this cell's output, after the upsert loop
index.describe_index_stats()

In [None]:
# Preview the local record of upserted metadata ("text") and vectors ("embedding")
dfUpsert.head()