In [16]:
from IPython.core.display import HTML
HTML("""
<style>
figcaption {
  color: #4e181b;
  font-style: italic;
  font-size: 16px;
  padding: 0px;
  text-align: center;
}
</style>
<center 
">
<img width="50%" src="SemanticSearch.png?w=200">

<a href="https://www.kaggle.com/oluwadaunsid" style="color: white;
background-color: #2b4b82;
border-radius: 25px;
padding: 1rem 1.5rem;
text-decoration: none;
">@daunsid</a>
</center>

""")

In [17]:
# import data preprocessing libraries
import PIL
import os
import re


import pyarrow
import pandas as pd
import numpy as np
import seaborn as sns

import torch
import torch.nn as nn
import transformers
from transformers import (
    AutoModel, AutoConfig, 
    AutoTokenizer, logging,
    AdamW, get_linear_schedule_with_warmup,
    DataCollatorWithPadding,
    Trainer, TrainingArguments
)

from wordcloud import WordCloud

In [19]:
CONFIG = {
    "model_name": "sentence-transformers/all-MiniLM-L6-v2",# "distilbert-base-uncased",
    "model_name_path":"../information_retrieval/weights",
    "device": 'cuda' if torch.cuda.is_available() else 'cpu',
    "max_length": 512,
}

config = AutoConfig.from_pretrained(CONFIG["model_name_path"])
#model = AutoModel.from_pretrained(CONFIG["model_name"], config=config)
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name_path"])

In [91]:
# set the path to the dataset to the variable name 'IMG_PATH'
DATA_PATH = '../data/raw/Drugs Master List.csv'

drugs_df = pd.read_csv(DATA_PATH)
print("DRUGS DATA SHAPE", drugs_df.shape)

DRUGS DATA SHAPE (999, 17)


In [92]:
# let see the different features availbale in our data

features = {i:col for i, col in enumerate(drugs_df.columns)}

"""
drop redundant features not necesaary for retrieval system
`data_df.drop(list of columns to drop, axis=1)`
"""
drugs_df = drugs_df.drop(['rating','no_of_reviews',
                        'drug_link','medical_condition_url',
                        'medical_condition_description'],
                       axis=1)
# 
drugs_df = drugs_df.fillna('unknown')

In [7]:
# 

sample1 = drugs_df[drugs_df['drug_name']=='doxycycline']
print(f"drug_name :{sample1['drug_name'][0]}\n\n \
side_effects:{sample1['side_effects'][0]}\n\n ")

drug_name :doxycycline

 side_effects:(hives, difficult breathing, swelling in your face or throat) or a severe skin reaction (fever, sore throat, burning in your eyes, skin pain, red or purple skin rash that spreads and causes blistering and peeling). Seek medical treatment if you have a serious drug reaction that can affect many parts of your body. Symptoms may include: skin rash, fever, swollen glands, flu-like symptoms, muscle aches, severe weakness, unusual bruising, or yellowing of your skin or eyes. This reaction may occur several weeks after you began using doxycycline. Doxycycline may cause serious side effects. Call your doctor at once if you have: severe stomach pain, diarrhea that is watery or bloody; throat irritation, trouble swallowing; chest pain, irregular heart rhythm, feeling short of breath; little or no urination; low white blood cell counts - fever, chills, swollen glands, body aches, weakness, pale skin, easy bruising or bleeding; severe headaches, ringing in you

In [None]:
def to_string(series):
    sentence = ''
    for word in series:
        sentence += word+' '
    sentence = sentence.strip()
    return sentence

In [None]:
class DrugsInformation(torch.utils.data.Dataset):
    def __init__(self, drugs, tokenizer, transforms=None, inference=False):
        self.drugs = drugs
        self.tokenizer = tokenizer
        self.transforms = transform
        self.max_seq_length = int(config["max_length"])
        self.inference = inference
        
    def __getitem__(self, idx):
        
        if not self.inference:
            drugs_info = self.drugs.apply(self.transforms)
            
        if isinstance(self.drugs, pd.DataFrame):
            drugs_info = self.drugs.loc[idx]
            drug_info = to_string(drugs_info)
        else:
            drug_info = self.drugs[idx]
            
        tokens = self.tokenizer(
            drug_info,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_seq_length,
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True
        )
        result = {
            'input_ids': tokens['input_ids'].to(device).squeeze(),
            'attention_mask': tokens['attention_mask'].to(device).squeeze()
        }
        return result

    def __len__(self):
        return len(self.drugs)


In [93]:
def preprocess(data_df):
    
    
    # clean related drugs
    #remove unwanted url links from 
    feature = 'related_drugs'
    url_cleaner = re.compile(r":|https://\S+|www\.\S+")
    data_df[feature] = url_cleaner.sub(r'', data_df[feature])
    
    data_df[feature] = data_df[feature].strip().replace(r'  | ', ', ')

In [94]:
drugs_df.apply(preprocess).info()

KeyError: 'related_drugs'

In [None]:
# data cleaning

In [10]:
# Its is seen that some of the columns contains NAN Values
#Droping the null values will significantly reduce the number of data samples to 147 
print(len(drugs_df.dropna()))

#The second option is to fill null values with 1) additional data 2) generic information
#We will be replacing null values with a generic values of 'unknown' 



147


In [11]:
# clean "related_drugs" feature column 




In [12]:
drugs_df['related_drugs'][900]

'amoxicillin, prednisone, ciprofloxacin, azithromycin, fluticasone nasal, montelukast, cetirizine, clindamycin, Augmentin, promethazine'

In [44]:
# save the data in parquet format

def save_data(data:pd.DataFrame, dataset_file:str):
    if not os.path.exists(dataset_file):
        data.to_parquet(dataset_file,
                        engine='pyarrow',
                        index=False)
        print(f'data successfully saved in {dataset_file}')
save_data(drugs_df, '../data/processed/drug.parquet')

data successfully saved in ../data/processed/drug.parquet


In [4]:
#from sentence_transformers import SentenceTransformer, util
#model = SentenceTransformer('all-MiniLM-L12-v2')

from transformers import AutoTokenizer, TFAutoModel, AutoModel

#model_ckpt = "sentence-transformers/all-MiniLM-L6-v2"
#tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
#model = TFAutoModel.from_pretrained(model_ckpt, from_pt=True)

In [40]:
#help(TFAutoModel.from_pretrained)

In [48]:
tokenizer.save_pretrained("../information_retrieval/weights")
model.save_pretrained("../information_retrieval/weights")

In [2]:
#help(TFAutoModel.from_pretrained)