### Download the data

In [1]:
# Directory where the raw archive is downloaded and extracted.
# Set the DRUGSCOM_DATA_DIR environment variable to use another location;
# the default is the original path, so existing setups keep working.
import os

data_dir = os.environ.get(
    "DRUGSCOM_DATA_DIR", "/home/jovyan/Works/Practice/dataset/drugscom"
)

In [None]:

!wget -P $data_dir "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"

--2025-05-08 22:40:52--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘/home/jovyan/Works/Practice/dataset/drugscom/drugsCom_raw.zip’

drugsCom_raw.zip        [         <=>        ]  41.00M  22.1MB/s    in 1.9s    

2025-05-08 22:40:58 (22.1 MB/s) - ‘/home/jovyan/Works/Practice/dataset/drugscom/drugsCom_raw.zip’ saved [42989872]



In [2]:
!unzip "$data_dir/drugsCom_raw.zip" -d "$data_dir"


Archive:  /home/jovyan/Works/Practice/dataset/drugscom/drugsCom_raw.zip
  inflating: /home/jovyan/Works/Practice/dataset/drugscom/drugsComTest_raw.tsv  
  inflating: /home/jovyan/Works/Practice/dataset/drugscom/drugsComTrain_raw.tsv  


In [2]:
# Read both extracted TSV files from the data directory into one DatasetDict

from datasets import load_dataset

train_path = f"{data_dir}/drugsComTrain_raw.tsv"
test_path = f"{data_dir}/drugsComTest_raw.tsv"
data_files = {"train": train_path, "test": test_path}

# The files are tab-separated, so the generic "csv" loader gets an explicit delimiter
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [3]:
# Reproducible random sample: shuffle the training split with a fixed seed,
# keep the first 1000 rows, and peek at two of them
shuffled_train = drug_dataset["train"].shuffle(seed=42)
drug_sample = shuffled_train.select(range(1000))
drug_sample[:2]

{'Unnamed: 0': [87571, 178045],
 'drugName': ['Naproxen', 'Duloxetine'],
 'condition': ['Gout, Acute', 'ibromyalgia'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."'],
 'rating': [9.0, 3.0],
 'date': ['September 2, 2015', 'November 7, 2011'],
 'usefulCount': [36, 13]}

### Some data visualization

In [4]:
# For every split, compare the number of rows with the number of distinct
# values in the "Unnamed: 0" column to see whether it identifies rows uniquely.
for split, dataset_split in drug_dataset.items():
    unique_ids = dataset_split.unique("Unnamed: 0")
    total_rows, total_unique_ids = len(dataset_split), len(unique_ids)

    print(f"\nSplit: {split}")
    print(f"Total rows: {total_rows}")
    print(f"Unique 'Unnamed: 0' values: {total_unique_ids}")


Split: train
Total rows: 161297
Unique 'Unnamed: 0' values: 161297

Split: test
Total rows: 53766
Unique 'Unnamed: 0' values: 53766


In [3]:
# Rename the "Unnamed: 0" column to "patient_id" in every split
drug_dataset = drug_dataset.rename_column("Unnamed: 0", "patient_id")

drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [4]:
drug_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'Left Ventricular Dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27}

In [5]:
# Number of distinct drugs and conditions in the training split
train_split = drug_dataset["train"]

unique_drugs = len(train_split.unique("drugName"))
print(f"Unique drugs count {unique_drugs}")

unique_conditions = len(train_split.unique("condition"))
print(f"Unique conditions: {unique_conditions}")

Unique drugs count 3436
Unique conditions: 885


In [6]:
# Normalise the 'condition' column to lower case

def lower_condition(example):
    """Return a column update holding the example's condition in lower case.

    Args:
        example: Mapping with a "condition" entry whose value provides
            ``.lower()``. A value of None raises AttributeError, so rows
            without a condition have to be removed before mapping.

    Returns:
        A one-key dict ``{"condition": <lower-cased value>}``.
    """
    condition = example["condition"]
    return {"condition": condition.lower()}

In [7]:
drug_dataset = drug_dataset.map(lower_condition)

Map:   0%|          | 0/161297 [00:00<?, ? examples/s]

AttributeError: 'NoneType' object has no attribute 'lower'

In [8]:
# Predicate for dropping rows whose 'condition' is missing
def filter_nones(x):
    """Tell whether a row carries a condition value.

    Args:
        x: Mapping with a "condition" entry.

    Returns:
        False when ``x["condition"]`` is None, True for any other value.
    """
    condition = x["condition"]
    if condition is None:
        return False
    return True

In [9]:
drug_dataset = drug_dataset.filter(filter_nones)

In [10]:
drug_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'Left Ventricular Dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27}

In [11]:
# Now apply the lower function
drug_dataset = drug_dataset.map(lower_condition)

In [12]:
drug_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27}

### Creating a new column

In [13]:
# Approach 1
def review_length(example):
    """Count the whitespace-separated words of an example's review.

    Args:
        example: Mapping with a "review" string.

    Returns:
        A one-key dict ``{"review_length": <number of words>}``.
    """
    words = example["review"].split()
    return {"review_length": len(words)}

In [14]:
drug_dataset = drug_dataset.map(review_length)

In [15]:
drug_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [16]:
# sort column
drug_dataset = drug_dataset.sort("review_length")
drug_dataset["train"][:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [17]:
# Filter dataset based on review length
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)

In [18]:
drug_dataset.num_rows

{'train': 138514, 'test': 46108}

In [19]:
# sort based on review length in reverse
drug_dataset =  drug_dataset.sort("review_length", reverse=True)

In [20]:
drug_dataset["train"][0]

{'patient_id': 121004,
 'drugName': 'Venlafaxine',
 'condition': 'migraine',
 'review': '"Two and a half months ago I was prescribed Venlafaxine to help prevent chronic migraines.\r\nIt did help the migraines (reduced them by almost half), but with it came a host of side effects that were far worse than the problem I was trying to get rid of.\r\nHaving now come off of the stuff, I would not recommend anyone ever use Venlafaxine unless they suffer from extreme / suicidal depression. I mean extreme in the most emphatic sense of the word. \r\nBefore trying Venlafaxine, I was a writer. While on Venlafaxine, I could barely write or speak or communicate at all. More than that, I just didn&#039;t want to. Not normal for a usually outgoing extrovert.\r\nNow, I&#039;m beginning to write again - but my ability to speak and converse with others has deteriorated by about 95%. Writing these words is taking forever; keeping up in conversation with even one person is impossible, and I barely see the 

### Performance enhancements

In [21]:
from transformers import AutoTokenizer

# No use_fast argument is given here, so AutoTokenizer applies its default choice
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the "review" field with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a "review" entry (a batch of reviews when
            used with ``map(..., batched=True)``).

    Returns:
        Whatever ``tokenizer`` returns for the reviews with truncation enabled.
    """
    reviews = examples["review"]
    return tokenizer(reviews, truncation=True)

In [24]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 17min 43s, sys: 11.7 s, total: 17min 54s
Wall time: 36 s


In [20]:
# By default, AutoTokenizer uses the fast, Rust-based tokenizer implementation.
# Passing use_fast=False swaps in the slow tokenizer instead.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)


def tokenize_function(examples):
    """Tokenize the "review" field with the currently bound ``tokenizer``.

    The function looks up the notebook-level ``tokenizer`` at call time, so
    after this cell it uses the slow tokenizer created above.

    Args:
        examples: Mapping with a "review" entry.

    Returns:
        The tokenizer output for the reviews with truncation enabled.
    """
    encoded = tokenizer(examples["review"], truncation=True)
    return encoded

In [32]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 2min 36s, sys: 354 ms, total: 2min 36s
Wall time: 2min 36s


#### Even with the slow tokenizer, a performance gain is still possible using multiprocessing

In [None]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True, num_proc=8)

Map (num_proc=8):   0%|          | 0/138514 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 1.29 s, sys: 459 ms, total: 1.75 s
Wall time: 31.3 s


In [34]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True, num_proc=2)

Map (num_proc=2):   0%|          | 0/138514 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 1.85 s, sys: 316 ms, total: 2.16 s
Wall time: 1min 44s


#### Dealing with tokenized inputs

In [22]:
def tokenize_and_split(examples):
    """Tokenize reviews with a 128-token limit, keeping the overflowing tokens.

    Because ``return_overflowing_tokens=True`` is requested, the result can
    contain more rows than the input batch had reviews.

    Args:
        examples: Mapping with a "review" entry.

    Returns:
        The output of the notebook-level ``tokenizer`` for the reviews.
    """
    tokenizer_options = dict(
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    return tokenizer(examples["review"], **tokenizer_options)

In [23]:
result = tokenize_and_split(drug_dataset["train"][0])
# [len(inp) for inp in result["input_ids"]]
# {k: len(v) for k, v in result.items()}

In [24]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

ArrowInvalid: Column 8 named input_ids expected length 1000 but got length 2646

In [26]:
# Work around the column-length mismatch by removing all original columns,
# so only the columns returned by tokenize_and_split remain
original_columns = drug_dataset["train"].column_names
tokenized_dataset = drug_dataset.map(
    tokenize_and_split,
    batched=True,
    remove_columns=original_columns,
)

In [38]:
len(tokenized_dataset["train"]), len(drug_dataset["train"])

(212993, 138514)

#### Technique to keep the old columns

In [27]:
def tokenize_and_split(examples):
    """Tokenize reviews with overflow and repeat the input columns to match.

    Args:
        examples: Mapping of column name to a list of values for the batch;
            must include "review".

    Returns:
        The tokenizer output without "overflow_to_sample_mapping", extended
        with every input column re-indexed so that all columns have the same
        number of rows.
    """
    encoded = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )

    # Each entry of the mapping is used as an index into the input batch's columns
    source_rows = encoded.pop("overflow_to_sample_mapping")
    for column, column_values in examples.items():
        encoded[column] = [column_values[row] for row in source_rows]
    return encoded

In [28]:
tokenized_dataset =  drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 212993
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 70952
    })
})

#### Interoperability with pandas

In [58]:
drug_dataset.set_format("pandas")

In [60]:
drug_dataset["train"]

Dataset({
    features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
    num_rows: 160398
})

In [61]:
# create a pandas DataFrame from the dataset
train_df = drug_dataset["train"][:]

In [62]:
# Count how often each condition occurs in the training frame.
# rename_axis + reset_index(name=...) label both columns explicitly, so the
# result always has "condition" and "frequency" columns, however the installed
# pandas version names the output of value_counts().
frequencies = (
    train_df["condition"]
    .value_counts()
    .rename_axis("condition")
    .reset_index(name="frequency")
)

In [63]:
frequencies.head()

Unnamed: 0,condition,frequency
0,birth control,28788
1,depression,9069
2,pain,6145
3,anxiety,5904
4,acne,5588


In [67]:
# Creating new dataset from the dataframe
from datasets import Dataset
freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

Dataset({
    features: ['condition', 'frequency'],
    num_rows: 884
})

In [121]:
# Mean rating per drug, turned back into a frame with drugName as a regular column
mean_rating_by_drug = train_df[["drugName", "rating"]].groupby("drugName")["rating"].mean()
avg_rating_per_drug = mean_rating_by_drug.to_frame().reset_index()

avg_rating_per_drug

Unnamed: 0,drugName,rating
0,A + D Cracked Skin Relief,10.000000
1,A / B Otic,10.000000
2,Abacavir / dolutegravir / lamivudine,8.211538
3,Abacavir / lamivudine / zidovudine,9.000000
4,Abatacept,7.157895
...,...,...
3426,Zyvox,9.000000
3427,ZzzQuil,2.500000
3428,depo-subQ provera 104,1.000000
3429,ella,6.980392


In [123]:
rating_dataset = Dataset.from_pandas(avg_rating_per_drug)
rating_dataset[:3]

{'drugName': ['A + D Cracked Skin Relief',
  'A / B Otic',
  'Abacavir / dolutegravir / lamivudine'],
 'rating': [10.0, 10.0, 8.211538461538462]}

#### Creating a validation set

In [16]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 53471
    })
})

In [19]:
# Split the training data 80/20 with a fixed seed
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)

# train_test_split calls its second split "test"; store it as "validation" instead
validation_split = drug_dataset_clean.pop("test")
drug_dataset_clean["validation"] = validation_split

# Attach the original "test" split so the DatasetDict holds all three splits
drug_dataset_clean["test"] = drug_dataset["test"]

In [20]:
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 128318
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 32080
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 53471
    })
})

In [22]:
# Persist all splits to a folder inside the data directory

save_path = f"{data_dir}/drug-reviews"
drug_dataset_clean.save_to_disk(save_path)

Saving the dataset (0/1 shards):   0%|          | 0/128318 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/32080 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/53471 [00:00<?, ? examples/s]

In [23]:
# Reload the saved splits from the local machine

from datasets import load_from_disk

saved_path = f"{data_dir}/drug-reviews"
drug_dataset_clean = load_from_disk(saved_path)

In [24]:
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 128318
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 32080
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 53471
    })
})

In [26]:
drug_dataset_clean["train"][0]

{'patient_id': 36220,
 'drugName': 'Imiquimod',
 'condition': 'human papilloma virus',
 'review': '"I have had HPV for YEARS! Its a horrible, relentless and unflattering disease and it will not go away. Aldara seems to do absolutely nothing, and the doctor that has seen my private parts more than any sexual partner (solely due to the HPV) doesn&#039;t really seem to know how the virus works. I go for freezing once every couple of months or so and they just grow back. With the Aldara it just seems to be all for nothing and sets your hopes up to be dashed. I really wish I was as fortunate as the other reviewers on this site but I can&#039;t say that Aldara works at all! :-("',
 'rating': 1.0,
 'date': 'September 13, 2016',
 'usefulCount': 6,
 'review_length': 112}