# Data Cleaning: Slicing and Dicing Data

Link: https://huggingface.co/learn/llm-course/chapter5/3

## Slicing and dicing data

In [1]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

--2025-12-28 18:13:23--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘drugsCom_raw.zip’

drugsCom_raw.zip        [          <=>       ]  41.00M  2.39MB/s    in 29s     

2025-12-28 18:13:54 (1.41 MB/s) - ‘drugsCom_raw.zip’ saved [42989872]

Archive:  drugsCom_raw.zip
  inflating: drugsComTest_raw.tsv    
  inflating: drugsComTrain_raw.tsv   


In [10]:
from datasets import load_dataset

data_files = {
    'train': "drugsComTrain_raw.tsv",
     "test": "drugsComTest_raw.tsv"
}

drug_dataset = load_dataset(
    'csv',
    data_files=data_files,
    delimiter='\t'
)

In [3]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [11]:
# creating a random sample
drug_sample = drug_dataset['train'].shuffle(seed=42).select(range(1000))

drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [6]:
drug_dataset.keys()

dict_keys(['train', 'test'])

In [7]:
drug_dataset['train'].unique('Unnamed: 0')

[206461,
 95260,
 92703,
 138000,
 35696,
 155963,
 165907,
 102654,
 74811,
 48928,
 29607,
 75612,
 191290,
 221320,
 98494,
 81890,
 48188,
 219869,
 212077,
 119705,
 12372,
 231466,
 227020,
 41928,
 213649,
 51215,
 206180,
 78563,
 132258,
 27339,
 51452,
 96233,
 204999,
 214453,
 71188,
 80520,
 125343,
 93678,
 60678,
 206444,
 221934,
 39795,
 173398,
 12056,
 121333,
 111409,
 111474,
 188061,
 146502,
 153093,
 156544,
 135645,
 69629,
 96906,
 215018,
 102449,
 60455,
 87285,
 225508,
 106703,
 131704,
 192806,
 69488,
 107449,
 60156,
 88659,
 24139,
 131909,
 202903,
 85162,
 12559,
 83734,
 168378,
 172031,
 144224,
 109866,
 216434,
 9116,
 111410,
 64089,
 217014,
 171349,
 60050,
 131041,
 17083,
 186984,
 202401,
 189138,
 156143,
 56141,
 43085,
 187720,
 7337,
 79467,
 147133,
 45237,
 102810,
 60280,
 10677,
 196244,
 19966,
 229524,
 180062,
 126595,
 208641,
 220696,
 122541,
 163567,
 57623,
 137538,
 156303,
 34093,
 116910,
 166891,
 160750,
 65646,
 61002,

In [8]:
# check for uniqueness of patient ID: 'Unnamed: 0' column
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

In [12]:
# renaming columns
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0",
    new_column_name='patient_id'
)

drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [13]:
len(drug_dataset['train'].unique('drugName')), len(drug_dataset['train'].unique('condition')) 

(3436, 885)

In [14]:
# normalising the casing of condition column 
def lowercase_condition(example):
    return {
        'condition': example['condition'].lower()
    }

drug_dataset.map(lowercase_condition)

Map:   0%|          | 0/161297 [00:00<?, ? examples/s]

AttributeError: 'NoneType' object has no attribute 'lower'

Column condition has None value. This is not interpreted as String type, hence it throws an error.

In [15]:
# removing rows containing condition = None
def filter_nones(x):
    return x['condition'] is not None

In [16]:
(lambda x: x+x)(4)

8

In [17]:
(lambda x,y: x*y)(5,3)

15

In [18]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [19]:
drug_dataset = drug_dataset.filter(lambda x: x['condition'] is not None)
drug_dataset

Filter:   0%|          | 0/161297 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53766 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53471
    })
})

In [20]:
# normalising the casing of condition column 
drug_dataset.map(lowercase_condition)

drug_dataset['train']['condition'][:5]

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

['Left Ventricular Dysfunction',
 'ADHD',
 'Birth Control',
 'Birth Control',
 'Opiate Dependence']

## Adding a New Column

In [21]:
# working with text data such as review, add an additional column stating length of corresponding text
def compute_review_length(example):
    return {
        'review_length': len(example['review'].split())
    }

In [22]:
drug_dataset = drug_dataset.map(compute_review_length)

drug_dataset['train'][0]

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'Left Ventricular Dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [23]:
# sorting according to review length
drug_dataset['train'].sort('review_length')[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['Hepatitis C', 'ADHD', 'Birth Control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [24]:
drug_dataset['train'].sort('review_length')[-3:]

{'patient_id': [216072, 181160, 121004],
 'drugName': ['Copper', 'Prozac', 'Venlafaxine'],
 'condition': ['Birth Control', 'Obsessive Compulsive Disorde', 'Migraine'],
 'review': ['"My Complicated experience with the insertion of the copper IUD. It was &quot;one of the most difficult &amp; Complicated IUD insertions I&#039;ve had in a very long time&quot; quoting the words of my Gynecologist MD. Now I have not been sexually active for over a year and a half (by choice) &amp; I&#039;ve never had kids so that for one was a concern for my doctor since she said i might be very tight &amp; feel pain. Anywhom I am 23 and recently decided to date again &amp; wanted to have a convinient birth control that won&#039;t affect my weight or cause acne since acne has recently been a new battle for me, so my first step was hitting the Internet for options. After reading many reviews for multiple birth controls I decided on the copper para guard which is for 10-12 years. Now I went to my doctor and ha

In [25]:
# filtering reviews that are less than 30 words long
drug_dataset = drug_dataset.filter(lambda x: x['review_length'] > 30)
drug_dataset.num_rows

Filter:   0%|          | 0/160398 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53471 [00:00<?, ? examples/s]

{'train': 138514, 'test': 46108}

In [26]:
drug_dataset['train'].sort('review_length')[:3]

{'patient_id': [208641, 118552, 2448],
 'drugName': ['Amlodipine / olmesartan',
  'Amoxicillin / clarithromycin / lansoprazole',
  'Emend'],
 'condition': ['High Blood Pressure',
  'Helicobacter Pylori Infection',
  'Nausea/Vomiting, Postoperative'],
 'review': ['"My blood pressure has been around 160/100. Doctor prescribed Azor 40/10. Just 4 hrs later my reading showed 120/82. I was amazed. I am now on it daily. Thanks to Azor."',
  '"I had severe vomiting and diarhoea for 3 days caused by clarythromycin. After being treated for dehydration at the hospital, clarythormycin was replaced with doxycycline, and I have no problems since."',
  '"I always get nausea and vomiting with anesthesia even when taking other anti-nausea meds. Was given Emend prior to gallbladder removal. Woke up with absolutely no nausea. Worked great for me!"'],
 'rating': [10.0, 2.0, 10.0],
 'date': ['January 19, 2015', 'February 18, 2017', 'August 3, 2016'],
 'usefulCount': [10, 4, 1],
 'review_length': [31, 31, 3

In [27]:
# dealing with html character codes
import html

text = "I&#039;m a transformer called BERT"
html.unescape(text)

"I'm a transformer called BERT"

In [28]:
drug_dataset = drug_dataset.map(lambda x: {
    'review': html.unescape(x['review'])
})

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]