In [2]:
from datasets import load_dataset

# Map each split name to the tab-separated file that backs it.
data_files = {
    "train": "drugsComTrain_raw.tsv",
    "test": "drugsComTest_raw.tsv",
}
# The files are TSV, so use the "csv" loader with a tab ("\t") delimiter.
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
drug_dataset

In [2]:
# A good habit in any kind of data analysis is to draw a small random sample
# to get a quick feel for the type of data you are working with.
drug_sample = (
    drug_dataset["train"]
    .shuffle(seed=42)  # fixed seed so the sample is reproducible
    .select(range(1000))
)
# Peek at the first few examples
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [4]:
# Rename the "Unnamed: 0" column to "patient_id" in every split.
drug_dataset = drug_dataset.rename_column("Unnamed: 0", "patient_id")
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [8]:
# Try lowercasing the condition label of the first training example.
drug_dataset["train"][0]["condition"].lower()

'left ventricular dysfunction'

In [6]:
# Normalize all condition labels with Dataset.map()
def lowercase_condition(example):
    """Return a column update with the example's ``condition`` lowercased.

    Args:
        example: Mapping with a ``"condition"`` entry that supports ``.lower()``.

    Returns:
        A dict ``{"condition": <lowercased value>}``.

    Raises:
        AttributeError: If ``example["condition"]`` is ``None``, which has
            no ``.lower()`` method.
    """
    condition = example["condition"]
    return dict(condition=condition.lower())

# The unfiltered `condition` column contains None entries, and None has no
# .lower() method, so this call fails. The failure is the point of this cell:
# catch it and show the message so that "Restart & Run All" can continue
# past this demonstration instead of aborting here.
try:
    drug_dataset.map(lowercase_condition)
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

  0%|          | 0/161297 [00:00<?, ?ex/s]

AttributeError: 'NoneType' object has no attribute 'lower'

In [10]:
# Drop the examples whose condition is None
drug_dataset = drug_dataset.filter(
    lambda example: example["condition"] is not None
)

  0%|          | 0/162 [00:00<?, ?ba/s]

  0%|          | 0/54 [00:00<?, ?ba/s]

In [11]:
# Apply lowercase_condition to every example; this works now that the
# examples with a None condition have been filtered out.
drug_dataset = drug_dataset.map(lowercase_condition)
# Check that lowercasing worked
drug_dataset["train"]["condition"][:3]

  0%|          | 0/160398 [00:00<?, ?ex/s]

  0%|          | 0/53471 [00:00<?, ?ex/s]

['left ventricular dysfunction', 'adhd', 'birth control']

In [12]:
def compute_review_length(example):
    """Count the whitespace-separated words in the example's ``review``.

    Args:
        example: Mapping with a string ``"review"`` entry.

    Returns:
        A dict ``{"review_length": <number of words>}``.
    """
    words = example["review"].split()
    return {"review_length": len(words)}

# Add a `review_length` column (word count of `review`) to every example.
drug_dataset = drug_dataset.map(compute_review_length)
# Inspect the first training example
drug_dataset["train"][0]

  0%|          | 0/160398 [00:00<?, ?ex/s]

  0%|          | 0/53471 [00:00<?, ?ex/s]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [13]:
# Sort by the new column with Dataset.sort(), then look at what the
# reviews at the extreme end of the length range look like.
drug_dataset["train"].sort("review_length")[:3]

{'patient_id': [103488, 23627, 20558],
 'drugName': ['Loestrin 21 1 / 20', 'Chlorzoxazone', 'Nucynta'],
 'condition': ['birth control', 'muscle spasm', 'pain'],
 'review': ['"Excellent."', '"useless"', '"ok"'],
 'rating': [10.0, 1.0, 6.0],
 'date': ['November 4, 2008', 'March 24, 2017', 'August 20, 2016'],
 'usefulCount': [5, 2, 10],
 'review_length': [1, 1, 1]}

In [14]:
# Use Dataset.filter() to keep only reviews longer than 30 words.
# Note the strict comparison: reviews of exactly 30 words are dropped too.
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

  0%|          | 0/161 [00:00<?, ?ba/s]

  0%|          | 0/54 [00:00<?, ?ba/s]

{'train': 138514, 'test': 46108}


In [17]:
# Use Dataset.map() to unescape the HTML character references
# (e.g. "&#039;" -> "'") in every review of the corpus.
import html  # stdlib; imported here because no earlier visible cell imports it

drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

  0%|          | 0/138514 [00:00<?, ?ex/s]

  0%|          | 0/46108 [00:00<?, ?ex/s]

In [18]:
# Show the first three training examples when sorted by review_length, to
# confirm the length filter and the HTML unescaping took effect.
drug_dataset["train"].sort("review_length")[:3]

{'patient_id': [63851, 110702, 125327],
 'drugName': ['Sertraline', 'Duragesic', 'Dulcolax'],
 'condition': ['depression', 'pain', 'constipation'],
 'review': ['"I\'ve been on Serdep 50mg for nearly 3 weeks and feeling so much better. I\'ve been on Lorien before, but didn\'t help. My emotional status and irritability is so much better."',
  '"This patch has been the most effective in treating my spinal pain, however due to a change in work and traveling extensively the restrictions about getting a refill is hard. "',
  '"I had to take triple the amount to get any relief and it made me violently ill for two days.  It took three days before I could eat solid food again!"'],
 'rating': [9.0, 9.0, 10.0],
 'date': ['September 3, 2017', 'April 3, 2009', 'October 21, 2015'],
 'usefulCount': [21, 39, 9],
 'review_length': [31, 31, 31]}