# Data Loading
### Access all dataset here: 
https://imperiallondon-my.sharepoint.com/:f:/g/personal/dlc19_ic_ac_uk/EgobSIgJFitCuMdL0Sg6KmABP7qqtibuOz1R1jIZDEX22Q?e=U5CfiK

In [5]:
##################
# All imports 
##################

#sys libs
import os
import sys
import random
import warnings
warnings.filterwarnings("ignore")

#data manupulation libs
import pandas as pd
import numpy as np
#from sklearn.model_selection import train_test_split

#from pandarallel import pandarallel

# Initialization
#pandarallel.initialize()

#string manupulation libs
import re
import string
from string import digits
import spacy
import nltk
from nltk.corpus import stopwords 
nltk.download('stopwords')

#torch libs
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim

import transformers

from transformers import BertTokenizer, AutoTokenizer, AutoModel
from datasets import load_dataset

# data manipulations
from pathlib import Path
import uuid
import pydicom

from PIL import Image

import cv2
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     /media/SharedUsers/mj719/home/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Preprocessing - done separately from data loader

In [6]:
df_train_temp = pd.read_csv('../../train_raw_reports.csv')
df_test_temp = pd.read_csv('../../test_raw_reports.csv')

#convetring study id to string as it doesn't work as an int
df_train_temp['study_id']=df_train_temp['study_id'].astype(str)
df_test_temp['study_id']=df_test_temp['study_id'].astype(str)

In [7]:
# display raw reports in a dataframe
df_train_temp.head()

Unnamed: 0,study_id,raw_report
0,50414267,FINAL REPORT\...
1,53189527,FINAL REPORT\...
2,53911762,FINAL REPORT\...
3,56699142,FINAL REPORT\...
4,57375967,FINAL REPORT\...


In [8]:
# preprocessing of the train dataset
def preprocessing(text):
        cleanedReport = re.sub(r'[^\w\s]','',text)            # remove punctuation (not word characters and whitespace)
        cleanedReport = re.sub('_', '', cleanedReport)        # remove __ in the report
        cleanedReport = re.sub(r'[\d-]', '', cleanedReport)   # remove numbers in the report 
        cleanedReport = re.sub('\n', '', cleanedReport)

        return cleanedReport   
    

def preprocessDataframe(df):
    
    i = 0
    
    for i in range(len(df)):
        
        preprocessedText = preprocessing(df.at[i, "raw_report"])
    
        df.at[i,'raw_report'] = preprocessedText
        i = i + 1 
    return df

In [9]:
# preprocess the raw reports in the dataframe 
df_train_preprocessed = preprocessDataframe(df_train_temp)
df_test_preprocessed = preprocessDataframe(df_test_temp)

In [11]:
df_train_preprocessed.head() # check if that worked

Unnamed: 0,study_id,raw_report
0,50414267,FINAL REPORT ...
1,53189527,FINAL REPORT ...
2,53911762,FINAL REPORT ...
3,56699142,FINAL REPORT ...
4,57375967,FINAL REPORT ...


In [12]:
df_train_preprocessed.to_csv('train_preprocessed.csv', index=False)
df_test_preprocessed.to_csv('test_preprocessed.csv', index=False)

### Data Loader
- Include: loading data, text preprocessing, words frequency check, tokenization, tokens-IDs-conversion 

In [13]:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
stop_words = set(stopwords.words('english'))

In [14]:

data_files = {"train": "train_preprocessed.csv", "test": "test_preprocessed.csv"}
reports_dataset = load_dataset("csv", data_files=data_files)

reports_dataset

Using custom data configuration default-e9caa07d38af213e


Downloading and preparing dataset csv/default to /media/SharedUsers/mj719/home/.cache/huggingface/datasets/csv/default-e9caa07d38af213e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /media/SharedUsers/mj719/home/.cache/huggingface/datasets/csv/default-e9caa07d38af213e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['study_id', 'raw_report'],
        num_rows: 222337
    })
    test: Dataset({
        features: ['study_id', 'raw_report'],
        num_rows: 3269
    })
})

In [15]:
#example of how to acess one report as a dataset
reports_dataset["train"]
#example of how to acess only written report 
#reports_dataset["train"]['raw_report'][0]

Dataset({
    features: ['study_id', 'raw_report'],
    num_rows: 222337
})

In [16]:
def tokenize_function(dataset):
    result = tokenizer(dataset["raw_report"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = reports_dataset.map(
    tokenize_function, batched=True, remove_columns=["raw_report", "study_id"]
)


chunk_size = 128

def group_texts(dataset):
    # Concatenate all texts
    concatenated_text = {k: sum(dataset[k], []) for k in dataset.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_text[list(dataset.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_text.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result


lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

  0%|          | 0/223 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/223 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 206331
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 3345
    })
})

In [17]:
for split, dataset in lm_datasets.items():
    dataset.to_json(f"final_dataset_chunked{split}.jsonl")

Creating json from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [21]:
def tokenize_function_padding(dataset):
    result = tokenizer(dataset["raw_report"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
        
    return result

# Use batched=True to activate fast multithreading!
tokenized_datasets = reports_dataset.map(
    tokenize_function_padding, batched=True, remove_columns=["raw_report", "study_id"])

tokenized_datasets



  0%|          | 0/223 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 222337
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 3269
    })
})

In [22]:
tokenized_datasets["train"]["input_ids"][0]

[101,
 1509,
 2592,
 8179,
 2229,
 185,
 1161,
 1105,
 2495,
 1204,
 12754,
 175,
 1114,
 1207,
 15415,
 1112,
 14375,
 1116,
 174,
 7501,
 1111,
 8974,
 5531,
 2229,
 185,
 1161,
 1105,
 11937,
 7577,
 3839,
 9505,
 1175,
 1110,
 1185,
 17811,
 20994,
 185,
 1513,
 12602,
 174,
 3101,
 17268,
 1137,
 185,
 1673,
 1818,
 12858,
 25632,
 20557,
 6873,
 5552,
 11769,
 7409,
 4233,
 1115,
 1211,
 2620,
 4248,
 15070,
 6719,
 1103,
 3621,
 2660,
 16418,
 2050,
 14196,
 27316,
 1110,
 2999,
 16973,
 1933,
 1166,
 1103,
 1286,
 13093,
 9046,
 1439,
 1103,
 7209,
 1103,
 3077,
 1181,
 3105,
 14701,
 1110,
 8362,
 16996,
 23822,
 1895,
 13306,
 19353,
 24211,
 1785,
 1104,
 1103,
 16530,
 1286,
 3971,
 1105,
 5001,
 10346,
 1132,
 2382,
 8351,
 1185,
 12104,
 3621,
 2660,
 16091,
 13505,
 7637,
 1616,
 1965,
 102]

In [23]:

def padding(dataset):
    
    #for train dataset

    
    num_items = len(dataset['input_ids']) # to get number of all items in train dataset
        
        
    if (len(dataset['input_ids'])) > 300:
        
        while(len(dataset['input_ids']) > 300):
            dataset['input_ids'].pop()
            dataset['token_type_ids'].pop()
            dataset['attention_mask'].pop()
            dataset['word_ids'].pop()

    while(len(dataset['input_ids']) < 301):

        dataset['input_ids'].append(0)
        dataset['token_type_ids'].append(0)
        dataset['attention_mask'].append(0)
        dataset['word_ids'].append(0)
 

    return dataset

padded_dataset = tokenized_datasets.map(padding, batched=False)
    

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [24]:
len(padded_dataset["test"]["input_ids"][10])

301

In [25]:
def check_function(dataset):

    lenReport = len(dataset['input_ids'])
    
    if (lenReport > 301):
        print('yes')
    return dataset

padded_dataset.map(check_function, batched = False)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 222337
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 3269
    })
})

In [26]:
for split, dataset in padded_dataset.items():
    dataset.to_json(f"final_dataset_padded_{split}.jsonl") 

Creating json from Arrow format:   0%|          | 0/23 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [27]:
data_files = {
    
    "train": "final_dataset_padded_train.jsonl",
    "test": "final_dataset_padded_test.jsonl"
}

reloaded_dataset = load_dataset('json', data_files = data_files)
reloaded_dataset['train']['input_ids'][0]

Using custom data configuration default-a475e9600a321d85


Downloading and preparing dataset json/default to /media/SharedUsers/mj719/home/.cache/huggingface/datasets/json/default-a475e9600a321d85/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /media/SharedUsers/mj719/home/.cache/huggingface/datasets/json/default-a475e9600a321d85/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

[101,
 1509,
 2592,
 8179,
 2229,
 185,
 1161,
 1105,
 2495,
 1204,
 12754,
 175,
 1114,
 1207,
 15415,
 1112,
 14375,
 1116,
 174,
 7501,
 1111,
 8974,
 5531,
 2229,
 185,
 1161,
 1105,
 11937,
 7577,
 3839,
 9505,
 1175,
 1110,
 1185,
 17811,
 20994,
 185,
 1513,
 12602,
 174,
 3101,
 17268,
 1137,
 185,
 1673,
 1818,
 12858,
 25632,
 20557,
 6873,
 5552,
 11769,
 7409,
 4233,
 1115,
 1211,
 2620,
 4248,
 15070,
 6719,
 1103,
 3621,
 2660,
 16418,
 2050,
 14196,
 27316,
 1110,
 2999,
 16973,
 1933,
 1166,
 1103,
 1286,
 13093,
 9046,
 1439,
 1103,
 7209,
 1103,
 3077,
 1181,
 3105,
 14701,
 1110,
 8362,
 16996,
 23822,
 1895,
 13306,
 19353,
 24211,
 1785,
 1104,
 1103,
 16530,
 1286,
 3971,
 1105,
 5001,
 10346,
 1132,
 2382,
 8351,
 1185,
 12104,
 3621,
 2660,
 16091,
 13505,
 7637,
 1616,
 1965,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0