<a href="https://colab.research.google.com/github/davidsolow/med-abbrev-mystery/blob/kiara/MeDAL_pre_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-processing of MeDAL Dataset for use in fine-tuning BERT models
### Steps:
1. Import data from Drive, read CSVs, convert to Pandas dataframes
2. Clean location and label columns
3. Select rows with location <= max_location and add abbreviation column
4. Convert labels to integers and make label dictionary
5. Mask abbreviations with '[MASK]'


In [None]:
# Mount Google Drive at /content/drive so that files stored there can be read.
# Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# pandas is used throughout the notebook but was never imported.
import pandas as pd

# Load the three MeDAL splits into DataFrames. The paths are relative, so they
# resolve against the current working directory.
train = pd.read_csv("drive/MyDrive/266Project/train-3.csv")
test = pd.read_csv("drive/MyDrive/266Project/test.csv")
validation = pd.read_csv("drive/MyDrive/266Project/validation.csv")

In [None]:
#cleaning location and label columns
def clean_location(location):
  """Convert a bracketed number such as ``"[12]"`` into an int.

  The input is stringified first, so a bare number is accepted as well.
  Raises ValueError if what remains after removing the surrounding square
  brackets is not a valid integer literal.
  """
  without_brackets = str(location).strip("[]")
  return int(without_brackets)

def clean_label(label):
  """Return the label string without its surrounding brackets and single quotes.

  Only leading and trailing ``[``, ``]`` and ``'`` characters are removed;
  the same characters inside the label are left untouched.
  """
  wrapper_chars = "[]'"
  return label.strip(wrapper_chars)

# Normalise the raw 'location' and 'label' columns in place on every split.
column_cleaners = (('location', clean_location), ('label', clean_label))
for dataset in (train, test, validation):
  for column, cleaner in column_cleaners:
    dataset[column] = dataset[column].apply(cleaner)

In [None]:
# Filtering by the location of the abbreviation.
# Sequence-length budget; presumably the max length later passed to the
# tokenizer -- TODO confirm against the fine-tuning code.
max_length = 200
# Highest abbreviation index that is kept. 3 is subtracted to account for the
# [CLS] and [SEP] tokens that get added, plus the index offset.
# NOTE(review): 'location' indexes whitespace-separated words (see the
# text.split() lookup below), which may not line up with tokenizer tokens.
max_location = max_length - 3

def add_abbreviation_col(dataset):
    """Add an 'abbreviation' column holding the word found at each row's location.

    For every row, the 'text' column is split on whitespace and the word at
    index 'location' is stored in a new 'abbreviation' column.

    Args:
        dataset: DataFrame with a string 'text' column and an integer
            'location' column. It is modified in place.

    Returns:
        The same DataFrame object, for convenience.

    Raises:
        IndexError: if a row's location is beyond the number of words in
            its text.
    """
    # Iterating the two columns directly avoids building a Series per row
    # (as a row-wise apply does) and also works on an empty frame, where the
    # result is simply an empty column.
    dataset['abbreviation'] = [
        text.split()[location]
        for text, location in zip(dataset['text'], dataset['location'])
    ]
    return dataset

def clean_dataset(dataset):
    """Return a filtered copy of the dataset with an 'abbreviation' column.

    Rows whose 'location' is greater than the module-level max_location are
    dropped. The input DataFrame itself is left unchanged; callers must use
    the returned frame.
    """
    within_limit = dataset['location'] <= max_location
    kept_rows = dataset[within_limit].copy()
    add_abbreviation_col(kept_rows)
    return kept_rows

# clean_dataset returns a filtered copy and leaves its argument untouched, so
# the result has to be bound back to each split name; otherwise the filtering
# and the new 'abbreviation' column are silently discarded.
train = clean_dataset(train)
test = clean_dataset(test)
validation = clean_dataset(validation)


In [None]:
#converting labels to integers
def make_label_map(labels):
  """Map each distinct label to an integer id.

  Ids are assigned in order of first appearance, starting at 0.

  Args:
    labels: pandas Series of labels.

  Returns:
    dict mapping each unique label to its integer id.
  """
  # unique() is computed once; calling it on every iteration rescans the
  # whole Series each time.
  return {label: index for index, label in enumerate(labels.unique())}

label_map = make_label_map(train['label'])

# Keep only validation/test rows whose label appears in the train set: the
# label map is built from train alone, so any other label has no integer id.
# valid_labels was previously referenced without ever being defined.
valid_labels = set(label_map)
# .copy() makes each filtered frame independent, so the column assignment
# below does not act on a slice of the original frame.
validation = validation[validation['label'].isin(valid_labels)].copy()
test = test[test['label'].isin(valid_labels)].copy()

# Replace the string labels with their integer ids in every split.
for dataset in [train, test, validation]:
  dataset['label'] = dataset['label'].map(label_map)

In [None]:
#masking abbreviations
def mask_abbreviations(row):
  """Return the row's text, lower-cased, with the abbreviation replaced by [MASK].

  The text is split on whitespace, the word at index row['location'] is
  swapped for the literal '[MASK]', and the words are re-joined with single
  spaces.
  """
  words = row['text'].lower().split()
  words[row['location']] = '[MASK]'
  return ' '.join(words)

#applying function to datasets
# Overwrite the 'text' column of each split with its masked version.
for dataset in (train, validation, test):
  dataset['text'] = dataset.apply(mask_abbreviations, axis=1)