In [1]:
# !wget -P ../data/drugs-com "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
# !unzip ../data/drugs-com/drugsCom_raw.zip -d ../data/drugs-com

In [76]:
import html
import multiprocessing
from datasets import Dataset
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer

In [3]:
# Leave one CPU core free for the kernel/OS, but never go below a single worker.
num_cores = multiprocessing.cpu_count()
num_cores_avail = num_cores - 1 if num_cores > 1 else 1

# Load dataset

In [4]:
data_dir_path = "../data/drugs-com"

In [5]:
# Map each split name to its tab-separated source file inside the data directory.
data_files = {
    "train": f"{data_dir_path}/drugsComTrain_raw.tsv",
    "test": f"{data_dir_path}/drugsComTest_raw.tsv",
}
# The files are TSV, so use the generic "csv" loader with a tab delimiter.
drug_dataset = load_dataset(
    "csv",
    data_files=data_files,
    delimiter="\t",
)

In [6]:
drug_dataset["train"]

Dataset({
    features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
    num_rows: 161297
})

## Take a random sample to get a feel for the data

In [7]:
# Shuffle with a fixed seed, then keep the first 1,000 rows as the sample.
shuffled_train = drug_dataset["train"].shuffle(seed=42)
drug_sample = shuffled_train.select(range(1000))
# Peek at the first three sampled rows.
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

## Quality checks and data cleaning

In [8]:
# Is "Unnamed: 0" a unique identifier? It is when, in every split, the number
# of distinct values in that column equals the number of rows.
for split_name, split_ds in drug_dataset.items():
    assert len(split_ds) == len(split_ds.unique("Unnamed: 0"))

In [9]:
# The unnamed CSV index column was checked for uniqueness above; give it a meaningful name.
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0",
    new_column_name="patient_id",
)

In [10]:
# Lowercase every column name; the mapping is built from the train split's columns.
lowercase_names = {name: name.lower() for name in drug_dataset["train"].column_names}
drug_dataset = drug_dataset.rename_columns(lowercase_names)

In [11]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount'],
        num_rows: 53766
    })
})

In [12]:
len(drug_dataset["train"].unique("drugname"))

3436

## Clean labels

In [13]:
def lowercase_condition(example, key):
    """Return `{key: [...]}` holding the lowercased items of `example[key]`.

    Every item of `example[key]` has `.lower()` called on it, so the column
    is expected to contain only strings.
    """
    lowered = []
    for value in example[key]:
        lowered.append(value.lower())
    return {key: lowered}

def filter_nones(example):
    """Return a keep-mask for the batch: True where "condition" is not None."""
    keep_mask = []
    for condition in example["condition"]:
        keep_mask.append(condition is not None)
    return keep_mask

# Drop rows without a condition first, so the lowercasing step never sees None.
drug_dataset = drug_dataset.filter(
    filter_nones,
    batched=True,
    num_proc=num_cores_avail,
)
# fn_kwargs forwards key="condition" to lowercase_condition for every batch.
drug_dataset = drug_dataset.map(
    lowercase_condition,
    fn_kwargs={"key": "condition"},
    batched=True,
    num_proc=num_cores_avail,
)

In [14]:
drug_dataset["train"]["condition"][:3]

['left ventricular dysfunction', 'adhd', 'birth control']

In [15]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount'],
        num_rows: 53471
    })
})

## Clean data

### Get rough estimate of review lengths

In [16]:
# Rough per-review word count: the number of pieces obtained by splitting on a single space.
# NOTE(review): split(' ') does not split on "\r\n" or tabs and yields empty pieces
# for consecutive spaces, so this is only an approximation. A whitespace-aware
# split() would change the counts (and the filtering below) — confirm before switching.
drug_dataset = drug_dataset.map(
    lambda batch: {
        "review_length_est": [len(review.split(' ')) for review in batch["review"]]
    },
    num_proc=num_cores_avail,
    batched=True,
)

In [17]:
drug_dataset["train"][0]

{'patient_id': 206461,
 'drugname': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulcount': 27,
 'review_length_est': 17}

In [18]:
drug_dataset["train"].sort("review_length_est")[:3]["review_length_est"]

[1, 1, 1]

In [19]:
drug_dataset["train"].sort("review_length_est", reverse=True)[:3]["review_length_est"]

[1857, 1222, 1107]

In [20]:
# Keep only reviews whose estimated length is above 30 words; shorter ones are dropped.
drug_dataset = drug_dataset.filter(
    lambda batch: [30 < n_words for n_words in batch["review_length_est"]],
    num_proc=num_cores_avail,
    batched=True,
)

In [21]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount', 'review_length_est'],
        num_rows: 138903
    })
    test: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount', 'review_length_est'],
        num_rows: 46246
    })
})

### Fix HTML character codes

In [22]:
# Sanity check: html.unescape should turn the &#039; character reference into an apostrophe.
text = "I&#039;m a transformer called BERT"
html.unescape(text)

"I'm a transformer called BERT"

In [23]:
# Decode HTML character references (such as &#039;) in every review, batch by batch.
drug_dataset = drug_dataset.map(
    lambda batch: {"review": list(map(html.unescape, batch["review"]))},
    num_proc=num_cores_avail,
    batched=True,
)

In [24]:
drug_dataset.column_names

{'train': ['patient_id',
  'drugname',
  'condition',
  'review',
  'rating',
  'date',
  'usefulcount',
  'review_length_est'],
 'test': ['patient_id',
  'drugname',
  'condition',
  'review',
  'rating',
  'date',
  'usefulcount',
  'review_length_est']}

# Tokenize

In [25]:
checkpoint = "bert-base-cased"

In [26]:
# Load two tokenizers for the same checkpoint: the default one, and one requested
# with use_fast=False (kept as slow_tokenizer for the slow-tokenizer experiment below).
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
slow_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)

In [27]:
def tokenize_function(tokenizer, examples):
    """Tokenize `examples["review"]` with the given tokenizer, with truncation enabled.

    Taking the tokenizer as an argument lets the same function be used with
    different tokenizers.
    """
    reviews = examples["review"]
    encoded = tokenizer(reviews, truncation=True)
    return encoded

## Single-processing

In [28]:
# tokenized_dataset = drug_dataset.map(lambda x: tokenize_function(tokenizer, x), batched=True)

In [29]:
# tokenized_dataset = drug_dataset.map(lambda x: tokenize_function(tokenizer, x), batched=False)

## Multi-processing

In [30]:
# Batched tokenization with the default tokenizer, spread over the available worker processes.
tokenized_dataset = drug_dataset.map(
    lambda batch: tokenize_function(tokenizer, batch),
    batched=True,
    num_proc=num_cores_avail,
)

In [31]:
# tokenized_dataset = drug_dataset.map(lambda x: tokenize_function(tokenizer, x), batched=False, num_proc=num_cores_avail)

# Try using slow tokenizer
Don't bother running the slow tokenizer in a single process, since it would take forever.

In [32]:
# tokenized_dataset = drug_dataset.map(lambda x: tokenize_function(slow_tokenizer, x), batched=True, num_proc=num_cores_avail)

# Use Tokenizer

In [33]:
def tokenize_and_split(examples, max_length=128):
    """Tokenize reviews, splitting long ones into several chunks.

    Args:
        examples: Mapping with a "review" entry (a single text or a batch of
            texts) that is passed to the module-level `tokenizer`.
        max_length: Maximum number of tokens per returned chunk. Defaults to
            128, the value this notebook uses.

    Returns:
        The tokenizer output. Because `return_overflowing_tokens=True` is
        set, text that does not fit into `max_length` tokens is returned as
        additional rows rather than being discarded, so the output can hold
        more rows than the input.
    """
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=max_length,
        return_overflowing_tokens=True,
    )

The tokenizer returns multiple "features" for this single example because the review overflows: it is longer than the maximum length, so it is split into several sequences.

In [34]:
# Tokenize a single training example and list the token count of each resulting chunk.
res = tokenize_and_split(drug_dataset["train"][0])
list(map(len, res.input_ids))

[128, 49]

## Handle mismatched length through removal

In [35]:
# Overflow makes the tokenizer return more rows than it received, so the original
# columns no longer match in length; drop all of them and keep only the tokenizer output.
original_columns = drug_dataset["train"].column_names
tokenized_dataset = drug_dataset.map(
    tokenize_and_split,
    remove_columns=original_columns,
    num_proc=num_cores_avail,
    batched=True,
)

In [36]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 207161
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 69014
    })
})

Compare dataset lengths: the dataset tokenized with overflow enabled should be longer, because long examples are split into multiple rows.

In [37]:
len(tokenized_dataset["train"]), len(drug_dataset["train"])

(207161, 138903)

## Handle mismatched length through repetition

In [38]:
def tokenize_and_split_rep(examples, max_length=128):
    """Tokenize a batch of reviews into chunks, repeating the source columns per chunk.

    Args:
        examples: Batch mapping each column name to a list of values; the
            "review" column is passed to the module-level `tokenizer`.
        max_length: Maximum number of tokens per returned chunk. Defaults to
            128, the value this notebook uses.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" removed and
        every column of `examples` added, its values repeated so that each
        chunk carries the values of the row it was cut from. All returned
        columns therefore have the same length.
    """
    res = tokenizer(
        examples["review"],
        truncation=True,
        max_length=max_length,
        return_overflowing_tokens=True,
    )
    # For each output chunk, the index of the input row it came from.
    sample_map = res.pop("overflow_to_sample_mapping")
    # Repeat each original value once per chunk produced from its row.
    for key, values in examples.items():
        res[key] = [values[i] for i in sample_map]
    return res

In [39]:
# No remove_columns needed here: tokenize_and_split_rep returns the original
# columns already repeated to match the number of tokenized chunks.
tokenized_dataset = drug_dataset.map(
    tokenize_and_split_rep,
    num_proc=num_cores_avail,
    batched=True,
)

In [40]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount', 'review_length_est', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 207161
    })
    test: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount', 'review_length_est', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 69014
    })
})

In [41]:
# Show the patient_id of the first two tokenized rows.
for row_idx in range(2):
    print(tokenized_dataset["train"][row_idx]["patient_id"])

95260
95260


In [42]:
# Show the token count of the first two tokenized rows.
for row_idx in range(2):
    print(len(tokenized_dataset["train"][row_idx]["input_ids"]))

128
49


# Data conversions

In [43]:
# Switch the output format in place (no reassignment): from here on, indexing the
# dataset yields pandas objects, until reset_format() is called further down.
drug_dataset.set_format("pandas")

In [44]:
drug_dataset["train"][:3]

Unnamed: 0,patient_id,drugname,condition,review,rating,date,usefulcount,review_length_est
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,133
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [45]:
# With the pandas format active, a full slice returns the whole train split as a DataFrame.
train_df = drug_dataset["train"][:]

In [46]:
train_df.head()

Unnamed: 0,patient_id,drugname,condition,review,rating,date,usefulcount,review_length_est
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,133
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89
3,35696,Buprenorphine / naloxone,opiate dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37,134
4,155963,Cialis,benign prostatic hyperplasia,"""2nd day on 5mg started to work with rock hard...",2.0,"November 28, 2015",43,70


In [47]:
# Count the reviews per condition, most frequent first.
# rename_axis / reset_index(name=...) name both columns explicitly, so the result
# has the columns ["condition", "count"] no matter how the installed pandas
# version labels the output of value_counts().
frequencies_df = (
    train_df["condition"]
        .value_counts()
        .rename_axis("condition")
        .reset_index(name="count")
)
frequencies_df.head()

Unnamed: 0,condition,count
0,birth control,27671
1,depression,8043
2,acne,5212
3,anxiety,5001
4,pain,4756


In [48]:
# Convert the pandas frequency table back into a Dataset and display its summary.
freq_dataset = Dataset.from_pandas(frequencies_df)
freq_dataset

Dataset({
    features: ['condition', 'count'],
    num_rows: 819
})

In [49]:
# NOTE(review): despite its name, this only selects the "condition" column; no
# average is computed here. Looks like an unfinished cell — TODO confirm the
# intended aggregation (the variable is not used later in this notebook).
avg_ratings_df = (
    train_df["condition"]
)

In [65]:
# Mean rating per drug, highest-rated first.
mean_rating_by_drug = train_df.groupby("drugname", as_index=False).agg({"rating": "mean"})
mean_rating_by_drug.sort_values(by="rating", ascending=False)

Unnamed: 0,drugname,rating
0,A + D Cracked Skin Relief,10.0
516,Carbinoxamine,10.0
2430,Ruconest,10.0
1615,Lonox,10.0
2431,Rufinamide,10.0
...,...,...
2961,Xiaflex,1.0
2960,Xgeva,1.0
269,Atovaquone,1.0
1161,FluMist Quadrivalent,1.0


In [66]:
# Undo the earlier set_format("pandas") so the dataset no longer returns pandas objects.
drug_dataset.reset_format()

# Make a validation dataset

In [73]:
# Split the original training data 80/20 with a fixed seed.
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# train_test_split names the held-out part "test"; move it to "validation" ...
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# ... so that "test" can hold the dataset's original test split.
drug_dataset_clean["test"] = drug_dataset["test"]

In [74]:
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount', 'review_length_est'],
        num_rows: 111122
    })
    validation: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount', 'review_length_est'],
        num_rows: 27781
    })
    test: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount', 'review_length_est'],
        num_rows: 46246
    })
})

# Arrow data format

## Save

In [75]:
# Save all three splits under the shared data directory. The path is derived
# from data_dir_path instead of repeating the directory literal.
drug_dataset_clean.save_to_disk(f"{data_dir_path}/drug-reviews-cleaned-split")

Saving the dataset (0/1 shards):   0%|          | 0/111122 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/27781 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/46246 [00:00<?, ? examples/s]

## Load

In [79]:
# Reload the saved splits. The path is derived from data_dir_path instead of
# repeating the directory literal.
drug_dataset_clean_loaded = load_from_disk(f"{data_dir_path}/drug-reviews-cleaned-split")

In [80]:
drug_dataset_clean_loaded

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount', 'review_length_est'],
        num_rows: 111122
    })
    validation: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount', 'review_length_est'],
        num_rows: 27781
    })
    test: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount', 'review_length_est'],
        num_rows: 46246
    })
})

# CSV/JSON data format

## Save

In [88]:
# One JSON Lines file per split, all under the shared data directory. The paths are
# derived from data_dir_path instead of repeating the directory literal.
json_dir_path = f"{data_dir_path}/drug-reviews-cleaned-split-json"
json_data_files = {
    split: f"{json_dir_path}/drug-reviews-{split}.jsonl"
    for split in drug_dataset_clean
}

In [89]:
json_data_files

{'train': '../data/drugs-com/drug-reviews-cleaned-split-json/drug-reviews-train.jsonl',
 'validation': '../data/drugs-com/drug-reviews-cleaned-split-json/drug-reviews-validation.jsonl',
 'test': '../data/drugs-com/drug-reviews-cleaned-split-json/drug-reviews-test.jsonl'}

In [90]:
# Write every split to its own file, one JSON object per dataset row.
for split in drug_dataset_clean:
    drug_dataset_clean[split].to_json(json_data_files[split])

Creating json from Arrow format:   0%|          | 0/112 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/47 [00:00<?, ?ba/s]

In [91]:
!head -n 1 "../data/drugs-com/drug-reviews-cleaned-split-json/drug-reviews-train.jsonl"

{"patient_id":99950,"drugname":"Magnesium sulfate \/ potassium sulfate \/ sodium sulfate","condition":"bowel preparation","review":"\"I had no problems with Suprep. I took the first dose and it worked about 1.5 hours later. The taste was not bad at all. No nausea, no cramps, just a little bit of dizziness and it made my eyes red. I was completely cleared out for my procedure.\"","rating":8.0,"date":"December 22, 2015","usefulcount":17,"review_length_est":49}


## Load

In [93]:
# Load the exported files back with the "json" loader; the keys of json_data_files become the split names.
drug_dataset_clean_loaded = load_dataset("json", data_files=json_data_files)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [94]:
drug_dataset_clean_loaded

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount', 'review_length_est'],
        num_rows: 111122
    })
    validation: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount', 'review_length_est'],
        num_rows: 27781
    })
    test: Dataset({
        features: ['patient_id', 'drugname', 'condition', 'review', 'rating', 'date', 'usefulcount', 'review_length_est'],
        num_rows: 46246
    })
})