In [1]:
# !wget -P ../data/drugs-com "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
# !unzip ../data/drugs-com/drugsCom_raw.zip -d ../data/drugs-com

In [32]:
import html
import multiprocessing
from datasets import load_dataset
from transformers import AutoTokenizer

In [13]:
num_cores = multiprocessing.cpu_count()
num_cores_avail = max(1, num_cores - 1)

# Load dataset

In [4]:
data_dir_path = "../data/drugs-com"

In [15]:
data_files = {"train": f"{data_dir_path}/drugsComTrain_raw.tsv", "test": f"{data_dir_path}/drugsComTest_raw.tsv"}
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

In [16]:
drug_dataset["train"]

Dataset({
    features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
    num_rows: 161297
})

## Take a random sample to get a feel for the data

In [17]:
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

## Quality checks and data cleaning

In [18]:
# Is Unnamed: 0 a unique identifier?
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

In [19]:
drug_dataset = drug_dataset.rename_column("Unnamed: 0", new_column_name="patient_id")

In [20]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [21]:
len(drug_dataset["train"].unique("drugName"))

3436

## Clean labels

In [22]:
def lowercase_condition(example):
    return {"condition": [x.lower() for x in example["condition"]]}

def filter_nones(example):
    return [x is not None for x in example["condition"]]

drug_dataset = (
    drug_dataset
        .filter(filter_nones, batched=True, num_proc=num_cores_avail)
        .map(lowercase_condition, batched=True, num_proc=num_cores_avail)
)

Filter (num_proc=15):   0%|          | 0/161297 [00:00<?, ? examples/s]

Filter (num_proc=15):   0%|          | 0/53766 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/160398 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/53471 [00:00<?, ? examples/s]

In [23]:
drug_dataset["train"]["condition"][:3]

['left ventricular dysfunction', 'adhd', 'birth control']

## Clean data

### Get rough estimate of review lengths

In [24]:
drug_dataset = drug_dataset.map(
    lambda x: {"review_length_est": [len(r.split(' ')) for r in x["review"]]},
    batched=True,
    num_proc=num_cores_avail
)

Map (num_proc=15):   0%|          | 0/160398 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/53471 [00:00<?, ? examples/s]

In [25]:
drug_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length_est': 17}

In [26]:
drug_dataset["train"].sort("review_length_est")[:3]["review_length_est"]

[1, 1, 1]

In [27]:
drug_dataset["train"].sort("review_length_est", reverse=True)[:3]["review_length_est"]

[1857, 1222, 1107]

In [28]:
drug_dataset = drug_dataset.filter(
    lambda x: [rle > 30 for rle in x["review_length_est"]],
    batched=True,
    num_proc=num_cores_avail
)

Filter (num_proc=15):   0%|          | 0/160398 [00:00<?, ? examples/s]

Filter (num_proc=15):   0%|          | 0/53471 [00:00<?, ? examples/s]

In [29]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length_est'],
        num_rows: 138903
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length_est'],
        num_rows: 46246
    })
})

### Fix HTML character codes

In [30]:
text = "I&#039;m a transformer called BERT"
html.unescape(text)

"I'm a transformer called BERT"

In [31]:
# drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})
drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(r) for r in x["review"]]},
    batched=True,
    num_proc=num_cores_avail
)

Map (num_proc=15):   0%|          | 0/138903 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/46246 [00:00<?, ? examples/s]

# Tokenize

In [33]:
checkpoint = "bert-base-cased"

In [42]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
slow_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)

In [40]:
def tokenize_function(tokenizer, examples):
    return tokenizer(examples["review"], truncation=True)

## Single-processing

In [41]:
tokenized_dataset = drug_dataset.map(lambda x: tokenize_function(tokenizer, x), batched=True)

Map:   0%|          | 0/138903 [00:00<?, ? examples/s]

Map:   0%|          | 0/46246 [00:00<?, ? examples/s]

In [43]:
tokenized_dataset = drug_dataset.map(lambda x: tokenize_function(tokenizer, x), batched=False)

Map:   0%|          | 0/138903 [00:00<?, ? examples/s]

Map:   0%|          | 0/46246 [00:00<?, ? examples/s]

## Multi-processing

In [44]:
tokenized_dataset = drug_dataset.map(lambda x: tokenize_function(tokenizer, x), batched=True, num_proc=num_cores_avail)

Map (num_proc=15):   0%|          | 0/138903 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/46246 [00:00<?, ? examples/s]

In [45]:
tokenized_dataset = drug_dataset.map(lambda x: tokenize_function(tokenizer, x), batched=False, num_proc=num_cores_avail)

Map (num_proc=15):   0%|          | 0/138903 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/46246 [00:00<?, ? examples/s]

# Try using slow tokenizer
Don't bother doing it single-threaded, since it would take forever

In [46]:
tokenized_dataset = drug_dataset.map(lambda x: tokenize_function(slow_tokenizer, x), batched=True, num_proc=num_cores_avail)

Map (num_proc=15):   0%|          | 0/138903 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/46246 [00:00<?, ? examples/s]

# Use Tokenizer

In [47]:
def tokenize_and_split(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True
    )

Multiple "features" for this example since there's overflow (sequences longer than max length)

In [49]:
res = tokenize_and_split(drug_dataset["train"][0])
[len(inp) for inp in res.input_ids]

[128, 49]

## Handle mismatched length through removal

In [51]:
tokenized_dataset = drug_dataset.map(
    tokenize_and_split,
    batched=True,
    num_proc=num_cores_avail,
    remove_columns=drug_dataset["train"].column_names
)

Map (num_proc=15):   0%|          | 0/138903 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/46246 [00:00<?, ? examples/s]

In [52]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 207161
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 69014
    })
})

Compare dataset lengths -- the dataset that allows for overflow should be longer (examples split up into multiple examples)

In [53]:
len(tokenized_dataset["train"]), len(drug_dataset["train"])

(207161, 138903)

## Handle mismatched length through repetition

In [56]:
def tokenize_and_split_rep(examples):
    res = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True
    )
    # Extract mapping between new and old indices
    sample_map = res.pop("overflow_to_sample_mapping")
    # Duplicate entries for individuals
    for key, values in examples.items():
        res[key] = [values[i] for i in sample_map]
    return res

In [58]:
tokenized_dataset = drug_dataset.map(
    tokenize_and_split_rep,
    batched=True,
    num_proc=num_cores_avail
)

Map (num_proc=15):   0%|          | 0/138903 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/46246 [00:00<?, ? examples/s]

In [59]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length_est', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 207161
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length_est', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 69014
    })
})

In [67]:
print(tokenized_dataset["train"][0]["patient_id"])
print(tokenized_dataset["train"][1]["patient_id"])

95260
95260


In [69]:
print(len(tokenized_dataset["train"][0]["input_ids"]))
print(len(tokenized_dataset["train"][1]["input_ids"]))

128
49
