# Data Loading
### Access all dataset here: 
https://imperiallondon-my.sharepoint.com/:f:/g/personal/dlc19_ic_ac_uk/EgobSIgJFitCuMdL0Sg6KmABP7qqtibuOz1R1jIZDEX22Q?e=U5CfiK

In [82]:
##################
# All imports 
##################

#sys libs
import os
import sys
import random
import warnings
warnings.filterwarnings("ignore")

#data manupulation libs
import pandas as pd
import numpy as np
#from sklearn.model_selection import train_test_split

#from pandarallel import pandarallel

# Initialization
#pandarallel.initialize()

#string manupulation libs
import re
import string
from string import digits
import spacy
import nltk
from nltk.corpus import stopwords 
nltk.download('stopwords')

#torch libs
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim

import transformers

from transformers import BertTokenizer, AutoTokenizer, AutoModel
from datasets import load_dataset

# data manipulations
from pathlib import Path
import uuid
import pydicom

from PIL import Image

import cv2
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     /media/SharedUsers/mj719/home/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Preprocessing - done separately from data loader

In [57]:
df_train_temp = pd.read_csv('../../train_raw_reports.csv')
df_test_temp = pd.read_csv('../../test_raw_reports.csv')

#convetring study id to string as it doesn't work as an int
df_train_temp['study_id']=df_train_temp['study_id'].astype(str)
df_test_temp['study_id']=df_test_temp['study_id'].astype(str)

In [58]:
# display raw reports in a dataframe
df_train_temp.head()

Unnamed: 0,study_id,raw_report
0,50414267,FINAL REPORT\...
1,53189527,FINAL REPORT\...
2,53911762,FINAL REPORT\...
3,56699142,FINAL REPORT\...
4,57375967,FINAL REPORT\...


In [59]:
# preprocessing of the train dataset
def preprocessing(text):
        cleanedReport = re.sub(r'[^\w\s]','',text)            # remove punctuation (not word characters and whitespace)
        cleanedReport = re.sub('_', '', cleanedReport)        # remove __ in the report
        cleanedReport = re.sub(r'[\d-]', '', cleanedReport)   # remove numbers in the report 
        cleanedReport = re.sub('\n', '', cleanedReport)

        return cleanedReport   
    

def preprocessDataframe(df):
    
    i = 0
    
    for i in range(len(df)):
        
        preprocessedText = preprocessing(df.at[i, "raw_report"])
    
        df.at[i,'raw_report'] = preprocessedText
        i = i + 1 
    return df

In [60]:
# preprocess the raw reports in the dataframe 
df_train_preprocessed = preprocessDataframe(df_train_temp)
df_test_preprocessed = preprocessDataframe(df_test_temp)

In [61]:
df_train_preprocessed.head() # check if that worked

Unnamed: 0,study_id,raw_report
0,50414267,FINAL REPORT ...
1,53189527,FINAL REPORT ...
2,53911762,FINAL REPORT ...
3,56699142,FINAL REPORT ...
4,57375967,FINAL REPORT ...


In [62]:
df_train_preprocessed.to_csv('train_preprocessed.csv', index=False)
df_test_preprocessed.to_csv('test_preprocessed.csv', index=False)

### Data Loader
- Include: loading data, text preprocessing, words frequency check, tokenization, tokens-IDs-conversion 

In [86]:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
stop_words = set(stopwords.words('english'))
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [64]:

data_files = {"train": "train_preprocessed.csv", "test": "test_preprocessed.csv"}
reports_dataset = load_dataset("csv", data_files=data_files)

reports_dataset

Using custom data configuration default-999a6f78add30544


Downloading and preparing dataset csv/default to /media/SharedUsers/mj719/home/.cache/huggingface/datasets/csv/default-999a6f78add30544/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /media/SharedUsers/mj719/home/.cache/huggingface/datasets/csv/default-999a6f78add30544/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['study_id', 'raw_report'],
        num_rows: 222337
    })
    test: Dataset({
        features: ['study_id', 'raw_report'],
        num_rows: 3269
    })
})

In [101]:
#example of how to acess one report as a dataset
reports_dataset["train"]
#example of how to acess only written report 
#reports_dataset["train"]['raw_report'][0]

Dataset({
    features: ['study_id', 'raw_report'],
    num_rows: 222337
})

In [67]:
def tokenize_function(dataset):
    result = tokenizer(dataset["raw_report"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = reports_dataset.map(
    tokenize_function, batched=True, remove_columns=["raw_report", "study_id"]
)


chunk_size = 128

def group_texts(dataset):
    # Concatenate all texts
    concatenated_text = {k: sum(dataset[k], []) for k in dataset.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_text[list(dataset.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_text.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result


lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Loading cached processed dataset at /media/SharedUsers/mj719/home/.cache/huggingface/datasets/csv/default-999a6f78add30544/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-7e822d87f42e973e.arrow
Loading cached processed dataset at /media/SharedUsers/mj719/home/.cache/huggingface/datasets/csv/default-999a6f78add30544/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-98005d92f143e4a9.arrow


  0%|          | 0/223 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 206331
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 3345
    })
})

In [70]:
for split, dataset in lm_datasets.items():
    dataset.to_csv(f"train_dataset_{split}.csv", index=None)

Creating CSV from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

512

In [102]:
def tokenize_function_padding(dataset):
    result = tokenizer(dataset["raw_report"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    
    return result

# Use batched=True to activate fast multithreading!
tokenized_datasets_padded = reports_dataset.map(
    tokenize_function_padding, batched=True, remove_columns=["raw_report", "study_id"])

        

Loading cached processed dataset at /media/SharedUsers/mj719/home/.cache/huggingface/datasets/csv/default-999a6f78add30544/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-4df61c33141bb878.arrow
Loading cached processed dataset at /media/SharedUsers/mj719/home/.cache/huggingface/datasets/csv/default-999a6f78add30544/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-986483f35e2862d4.arrow
