In [1]:
from IPython.core.display import HTML
HTML("""
<style>
figcaption {
  color: #4e181b;
  font-style: italic;
  font-size: 16px;
  padding: 0px;
  text-align: center;
}
</style>
<center 
">
<img width="50%" src="SemanticSearch.png?w=200">
<br/>
<a href="https://www.kaggle.com/oluwadaunsid" style="color: white;
background-color: #2b4b82;
border-radius: 25px;
padding: 1rem 1.5rem;
text-decoration: none;
">@daunsid</a>
</center>

""")

In [1]:
# import data preprocessing libraries

import os
import re

import pickle
import pyarrow
import pandas as pd
import numpy as np
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import (
    AutoModel, AutoConfig, 
    AutoTokenizer, logging,
)

from sentence_transformers import util

In [40]:
CONFIG = {
    "model_name": "sentence-transformers/all-MiniLM-L6-v2",# "distilbert-base-uncased",
    "model_name_path":"../information_retrieval/weights/model",
    "device": 'cuda' if torch.cuda.is_available() else 'cpu',
    "max_length": 256,
}

config = AutoConfig.from_pretrained(CONFIG["model_name_path"])
#model = AutoModel.from_pretrained(CONFIG["model_name"], config=config)
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name_path"])

In [3]:
# set the path to the dataset to the variable name 'IMG_PATH'
DATA_PATH = '../data/raw/Drugs Master List.csv'


drugs_df = pd.read_csv(DATA_PATH)
print("DRUGS DATA SHAPE", drugs_df.shape)

DRUGS DATA SHAPE (999, 17)


In [4]:
fields = ['drug_name', 'medical_condition', 'side_effects', 'generic_name',
            'drug_classes', 'brand_names', 'activity', 'rx_otc',
            'pregnancy_category', 'csa', 'alcohol', 'related_drugs']

In [6]:
# 

sample1 = drugs_df[drugs_df['drug_name']=='doxycycline']
print(f"drug_name :{sample1['drug_name'][0]}\n\n \
side_effects:{sample1['side_effects'][0]}\n\n ")

drug_name :doxycycline

 side_effects:(hives, difficult breathing, swelling in your face or throat) or a severe skin reaction (fever, sore throat, burning in your eyes, skin pain, red or purple skin rash that spreads and causes blistering and peeling). Seek medical treatment if you have a serious drug reaction that can affect many parts of your body. Symptoms may include: skin rash, fever, swollen glands, flu-like symptoms, muscle aches, severe weakness, unusual bruising, or yellowing of your skin or eyes. This reaction may occur several weeks after you began using doxycycline. Doxycycline may cause serious side effects. Call your doctor at once if you have: severe stomach pain, diarrhea that is watery or bloody; throat irritation, trouble swallowing; chest pain, irregular heart rhythm, feeling short of breath; little or no urination; low white blood cell counts - fever, chills, swollen glands, body aches, weakness, pale skin, easy bruising or bleeding; severe headaches, ringing in you

In [5]:
# let see the different features availbale in our data

features = {i:col for i, col in enumerate(drugs_df.columns)}
print(f"Number of Features: {len(features)}")


Number of Features: 17


In [6]:
def to_string(series):
    sentence = ''
    for words in series:
        sentence += words+"\n"
    sentence = sentence.strip()
    return sentence

class PreprocessPipeLine:
    
    def __call__(self, df):
        """
        drop redundant features not necesaary for retrieval system
        `data_df.drop(list of columns to drop, axis=1)`
        """
        df = df.drop(['rating','no_of_reviews',
                      'drug_link','medical_condition_url',
                      'medical_condition_description'],
                     axis=1)
        # replace null values with the string unknown
        df = df.fillna('unknown')
        
        df['related_drugs'] = df['related_drugs'].apply(lambda z: self.remove_url_char(z))
        
        # explode `side_effects` column
        df['side_effects'] = df['side_effects'].apply(lambda z: z.split('.'))
        df = df.explode('side_effects', ignore_index=True)
        
        # drop rows with empty side_effects
        df['string_length'] = df['side_effects'].apply(lambda z: len(z))
        df = df[df['string_length']>0]
        df.index = [i for i in range(len(df))]
        df = df.drop('string_length', axis=1)
        return df
    
    def remove_url_char(self, feature):
        # clean related drugs:
            #remove unwanted url links from 
            #remove characters '|' and spaces
        url_cleaner = re.compile(r":|https://\S+|www\.\S+")
        feature = url_cleaner.sub(r'', feature)
        feature = feature.strip().replace(r'  | ', ', ')
        return feature

In [7]:
class DrugsInformation(torch.utils.data.Dataset):
    def __init__(self, drugs):
        #super(DrugsInformation, self).__init__()
        self.drugs = drugs
        
    def __getitem__(self, idx):
        if isinstance(self.drugs, pd.DataFrame):
            drug = self.drugs.loc[idx]
                    #drug_info = self.drugs.loc[idx]
        else:
            drug = self.drugs[idx]
        
        drug= to_string(drug)   
        return drug
    def __len__(self):
        return len(self.drugs)


In [8]:
preprocess = PreprocessPipeLine()

drugs_df = preprocess(drugs_df)
drugs_data = DrugsInformation(drugs_df)

In [41]:
# save the data in parquet format

def save_data(data:pd.DataFrame, dataset_file:str):
    if not os.path.exists(dataset_file):
        data.to_parquet(dataset_file,
                        engine='pyarrow',
                        index=False)
        print(f'data successfully saved in {dataset_file}')
save_data(drugs_df, '../data/processed/drug.parquet')

data successfully saved in ../data/processed/drug.parquet


In [53]:
data_00 = drugs_data[6934]

In [54]:
print(data_00)
ttxts=tokenizer(data_00)
tokenizer.decode(ttxts['input_ids'])

Histafed
Colds & Flu
 Common side effects of Histafed may include: dizziness, drowsiness; dry mouth, nose, or throat; constipation ; blurred vision; or feeling restless or excited (especially in children)
pseudoephedrine and triprolidine
Upper respiratory combinations
Aphedrid, A-Phedrin, Aprodine, Vi-Sudo
0%
Rx/OTC
C
N
X
unknown


'[CLS] histafed colds & flu common side effects of histafed may include : dizziness, drowsiness ; dry mouth, nose, or throat ; constipation ; blurred vision ; or feeling restless or excited ( especially in children ) pseudoephedrine and triprolidine upper respiratory combinations aphedrid, a - phedrin, aprodine, vi - sudo 0 % rx / otc c n x unknown [SEP]'

In [9]:
dataloader = torch.utils.data.DataLoader(drugs_data, batch_size=100)

In [10]:
def get_tokenizer(texts):
    tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name_path"])
    encoded_texts = tokenizer(
        texts,add_special_tokens=True,padding='max_length',
        max_length=int(CONFIG["max_length"]),
        truncation=True,
        return_tensors='pt',
        #return_attention_mask=True
    )                          
    return encoded_texts

In [11]:
class Encoder(nn.Module):
    def __init__(self, model_name_path):
        super(Encoder, self).__init__()
        self.config = AutoConfig.from_pretrained(model_name_path)
        self.model = AutoModel.from_pretrained(model_name_path, from_tf=True, config=self.config)

    def forward(self, **encoded_inputs):
        out = self.model(**encoded_inputs)
        
        return out

In [12]:

    
class MeanPooling(nn.Module) :
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, model_output, attention_mask) -> torch.tensor :

        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask =  torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

def drugs_information_retrieval(texts, embeddings):
    inputs = to_string(texts)
    inputs = get_tokenizer(texts)
    
    prediction = get_embeddings(inputs).detach().cpu()
    #outputs = F.cosine_similarity(prediction, embeddings)
    embeddings = embeddings.detach().cpu()
    outputs = util.semantic_search(prediction, embeddings, top_k=1)[0][0]
    
    return outputs
    

In [13]:

def get_embeddings(encoded_inputs):
    encoded_inputs = {k: v.to(CONFIG["device"]) for k, v in encoded_inputs.items()}
    with torch.no_grad():
        model = Encoder(CONFIG["model_name_path"]).to(CONFIG["device"])
        model_output = model(**encoded_inputs)
        
    pooler = MeanPooling()
    embeddings = pooler(model_output, encoded_inputs['attention_mask'])
    embeddings = F.normalize(embeddings)
    return embeddings

In [14]:
#import warnings
#warnings.filterwarnings("ignore")

list_embeddings = []
for dl in dataloader:
    enc_inputs = get_tokenizer(dl)
    embedding = get_embeddings(enc_inputs)
    
    list_embeddings.append(embedding)


All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your 

All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your 

All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your 

In [37]:
embeddings = torch.cat(list_embeddings)

In [38]:
EMBD_PATH = '../information_retrieval/weights/embd/embeddings.pickle'
with open(EMBD_PATH, 'wb') as f: 
    pickle.dump(embeddings, f)
    f.close()    

In [44]:
PROCESS_DATA_PATH="../data/processed/drug.parquet"
def get_DI(idx):
    information = DrugsInformation(pd.read_parquet(PROCESS_DATA_PATH))[idx]
    return information
tx = "I need information about Histafed"

output = drugs_information_retrieval(tx, embeddings)
idx, score = output['corpus_id'], output['score']
drug_info = get_DI(idx).split('\n')
results = {field:info
           for field, info in zip(fields, drug_info)
          }

All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


In [45]:
results

{'drug_name': 'Histafed',
 'medical_condition': 'Colds & Flu',
 'side_effects': ' Common side effects of Histafed may include: dizziness, drowsiness; dry mouth, nose, or throat; constipation ; blurred vision; or feeling restless or excited (especially in children)',
 'generic_name': 'pseudoephedrine and triprolidine',
 'drug_classes': 'Upper respiratory combinations',
 'brand_names': 'Aphedrid, A-Phedrin, Aprodine, Vi-Sudo',
 'activity': '0%',
 'rx_otc': 'Rx/OTC',
 'pregnancy_category': 'C',
 'csa': 'N',
 'alcohol': 'X',
 'related_drugs': 'unknown'}