In [1]:
! pip install datasets

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 KB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 KB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting fsspec[http]<=2024.2.0,>=2023.1.0
  Downloading fsspec-2024.2.0-py3-none-any.whl (170 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.9/170.9 KB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting dill<0.3.9,>=0.3.0
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 KB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=12.0.0
  Downloading pyarrow-15.0.2-cp310-cp310-manylinux_2_28_x86_64.whl (38.3 MB)
[2K

In [7]:
from datasets import load_dataset
import pandas as pd

# Load only the "train" split of the "medmcqa" dataset via the `datasets` library.
dataset = load_dataset("medmcqa", split="train")


In [4]:
import pprint as pp
# Pretty-print the Dataset summary: its feature (column) names and the number of rows.
pp.pprint(dataset)

Dataset({
    features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
    num_rows: 182822
})


In [5]:
# Inspect the first record: integer indexing returns one row as a dict keyed by feature name.
dataset[0]

{'id': 'e9ad821a-c438-4965-9f77-760819dfa155',
 'question': 'Chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma',
 'opa': 'Hyperplasia',
 'opb': 'Hyperophy',
 'opc': 'Atrophy',
 'opd': 'Dyplasia',
 'cop': 2,
 'choice_type': 'single',
 'exp': 'Chronic urethral obstruction because of urinary calculi, prostatic hyperophy, tumors, normal pregnancy, tumors, uterine prolapse or functional disorders cause hydronephrosis which by definition is used to describe dilatation of renal pelvis and calculus associated with progressive atrophy of the kidney due to obstruction to the outflow of urine Refer Robbins 7yh/9,1012,9/e. P950',
 'subject_name': 'Anatomy',
 'topic_name': 'Urinary tract'}

In [8]:
# Work on the data as a pandas DataFrame so missing values can be inspected and removed.
df = pd.DataFrame(dataset)

# `exp` and `topic_name` are the columns that contain missing values. Drop every row
# that is missing EITHER of them (dropna's default how="any"), in a single pass.
df = df.dropna(subset=["exp", "topic_name"])

# Only a cell's last expression is rendered, so the per-column null counts are
# computed once, after cleaning. Every count is expected to be zero.
df.isna().sum()

id              0
question        0
opa             0
opb             0
opc             0
opd             0
cop             0
choice_type     0
exp             0
subject_name    0
topic_name      0
dtype: int64

In [10]:
# Preview the first five rows of the cleaned frame. The row labels are not contiguous
# because dropna() removed rows without renumbering the index.
df.head()

Unnamed: 0,id,question,opa,opb,opc,opd,cop,choice_type,exp,subject_name,topic_name
0,e9ad821a-c438-4965-9f77-760819dfa155,Chronic urethral obstruction due to benign pri...,Hyperplasia,Hyperophy,Atrophy,Dyplasia,2,single,Chronic urethral obstruction because of urinar...,Anatomy,Urinary tract
1,e3d3c4e1-4fb2-45e7-9f88-247cc8f373b3,Which vitamin is supplied from only animal sou...,Vitamin C,Vitamin B7,Vitamin B12,Vitamin D,2,single,Ans. (c) Vitamin B12 Ref: Harrison's 19th ed. ...,Biochemistry,Vitamins and Minerals
2,5c38bea6-787a-44a9-b2df-88f4218ab914,All of the following are surgical options for ...,Adjustable gastric banding,Biliopancreatic diversion,Duodenal Switch,Roux en Y Duodenal By pass,3,multi,"Ans. is 'd' i.e., Roux en Y Duodenal Bypass Ba...",Surgery,Surgical Treatment Obesity
7,f3bf8583-231b-4b7a-828c-179b0f9ccdd9,Per rectum examination is not a useful test fo...,Anal fissure,Hemorrhoid,Pilonidal sinus,Rectal ulcer,2,single,PILONIDAL SINUS/DISEASE (Jeep Bottom; Driver's...,Surgery,Urology
10,b3d1eb6c-3909-4011-ad10-d55538b81456,Naglers reaction is shown by,Clostridium tetani,Clostridium botulinum,Clostridium perfringens,Clostridium septicum,2,single,Nagler's reaction - when Clostridium perfringe...,Microbiology,Bacteriology


In [9]:
# Convert the pre-processed DataFrame back into a Dataset.
# dropna() left gaps in the pandas index, so it is no longer a plain RangeIndex;
# preserve_index=False keeps that index from being stored as an extra
# `__index_level_0__` column in the resulting Dataset.
dataset = dataset.from_pandas(df, preserve_index=False)

# Instantiate a Chroma client and obtain a named collection to hold the documents.
# An embedding function can be passed when creating the collection; none is given
# here, so Chroma falls back to its default embedding function.
# get_or_create_collection keeps this cell safe to re-run: create_collection raises
# when a collection with the same name already exists.
import chromadb
client = chromadb.Client()
collection = client.get_or_create_collection("MedicalQA")

In [11]:
# Store the expert-explanation ("exp") text of the first N_DOCS questions in the collection.
N_DOCS = 10  # number of leading rows to index; raise it to index more of the dataset

# Slice the rows first so only N_DOCS rows are materialized, instead of reading the
# entire "exp" column and then discarding all but the first few entries.
documents = dataset[:N_DOCS]["exp"]

collection.add(
    # Every entry needs a string ID; derive the IDs from the documents actually
    # retrieved so the three lists always have the same length.
    ids=[str(i) for i in range(len(documents))],
    documents=documents,
    # Metadata: additional information stored alongside each document.
    metadatas=[{"type": "exp"} for _ in documents],
)

/home/ubuntu/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:59<00:00, 1.40MiB/s]


In [12]:
# peek() returns the first items in the collection (here all 10) as a dict keyed by
# field ('ids', 'embeddings', ...), not as a list of items.
# NOTE(review): the full embedding vectors are included, so the output is very long.
collection.peek()

{'ids': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
 'embeddings': [[0.01779644936323166,
   0.0227197352796793,
   0.07299382239580154,
   -0.0252540186047554,
   -0.05387658625841141,
   -0.05940738320350647,
   -0.02634953148663044,
   0.07464782148599625,
   0.02873389795422554,
   -0.026618773117661476,
   -0.0490587055683136,
   0.029021862894296646,
   0.037506792694330215,
   0.008624416776001453,
   -0.06284338980913162,
   -0.05127063766121864,
   -0.0997946485877037,
   -0.0007943143718875945,
   -0.011782648041844368,
   0.006948122754693031,
   0.10289480537176132,
   0.053156785666942596,
   -0.006918002385646105,
   0.0018158932216465473,
   -0.08616229891777039,
   -0.04031231254339218,
   0.041093695908784866,
   0.008172915317118168,
   0.028126630932092667,
   0.028244545683264732,
   -0.0507260262966156,
   0.035944774746894836,
   -0.11390071362257004,
   -0.04947488754987717,
   0.012679235078394413,
   0.05435965955257416,
   -0.05475325882434845,
   0.04