In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read in the download loop below.
CHUNK_SIZE = 40960
# Comma-separated entries of the form "<directory>:<url-encoded download URL>"; the download
# loop splits on ',' and ':' and URL-decodes the second part.
# NOTE(review): the URL carries X-Goog-Date / X-Goog-Expires / X-Goog-Signature query
# parameters, so it looks like a time-limited signed link — confirm it is still valid before
# relying on this cell.
DATA_SOURCE_MAPPING = 'medical-text:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4941176%2F8318937%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240506%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240506T153315Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D76781938fe31eed80ea65b95c4b0bc5c050d25771eddcb9a826a33da86984878f4db594f760515932e6291f4f781e9e788f124f9c973935c6ee23f793482d0b38fc4bf123ee2f03f5dc0427f572dff7f7c84c0ced6f72bf092e2f51dee08fca9a8eeede001596bb5fab98ac7de5f5bced29ce1a78beaf07e5746b6d7db4581d284e625045a4010c83cc4cd077debaaedcfa805164a3fffa5fc848173523c62174407aa952590cf59b1658de3ef6fca783ddde9e1294add8b03bcf295493aea053d403553d1ed7380ccfb75904ec29c8e47fa9fbb6e7ac9a355732a2a952479a13c249f105da5458df37094ae193c60fc34e3ffb3f2c0e81402c35bb00aa9528b'

# Root directory under which each data source is extracted (one sub-directory per source).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory that is created next to the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download each configured data source and unpack it below KAGGLE_INPUT_PATH.
# Every DATA_SOURCE_MAPPING entry has the form "<directory>:<url-encoded download URL>".
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so any further ':' stays part of the URL instead of
    # breaking the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # Content-Length is optional; without a usable value the progress bar simply
            # stays empty instead of the download aborting on int(None).
            try:
                total_bytes = int(total_length)
            except (TypeError, ValueError):
                total_bytes = 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            # read() returns b'' at end of stream, which terminates the iteration.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch reopens the file by name and would
            # otherwise risk reading a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; binding it to `tarfile` would replace the
                # imported module for the rest of the session.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import os
from datasets import load_dataset

# Specify the path to your dataset
dataset_path = '/kaggle/input/medical-text/data.csv'

# Fail fast with an explicit message when the CSV is missing (for example because the
# download cell did not succeed) instead of printing a hint and carrying on into the loader.
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset not found. Check the file path: {dataset_path}")
print("Dataset found.")

# Load the dataset; the rows of the CSV are read from the resulting 'train' split.
dataset = load_dataset('csv', data_files=dataset_path)['train']

Dataset found.


Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
from datasets import load_dataset, DatasetDict

# Tokenizer and encoder are loaded from the same pretrained checkpoint; change the name in
# one place to try a different model.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = TFAutoModel.from_pretrained(PRETRAINED_CHECKPOINT)

# Split the dataset into train (90%), validation and test sets (half of the remaining 10% each).
# A fixed seed makes both shuffled splits reproducible across kernel restarts.
SPLIT_SEED = 42
train_test_split = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
test_val_split = train_test_split['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
split_dataset = DatasetDict({
    'train': train_test_split['train'],
    # The held-out 10% is halved: its 'test' part is used for validation, its 'train' part
    # for the final test set.
    'validation': test_val_split['test'],
    'test': test_val_split['train']
})

# Tokenization and converting to TensorFlow format
def tokenize_and_format(batch):
    """Tokenize a batch of abstracts and attach the labels, all as TensorFlow tensors.

    Args:
        batch: mapping providing the 'medical_abstract' texts and the matching
            'condition_label' values.

    Returns:
        dict with one tensor per tokenizer output field plus a 'labels' tensor. Every
        sequence is padded or truncated to 512 tokens.
    """
    encoded = tokenizer(
        batch['medical_abstract'],
        padding='max_length',
        truncation=True,
        max_length=512,
    )
    features = {}
    for field_name, field_values in encoded.items():
        features[field_name] = tf.convert_to_tensor(field_values)
    features['labels'] = tf.convert_to_tensor(batch['condition_label'])
    return features

# Tokenize every split in batches, then expose only the columns the model consumes as
# TensorFlow tensors.
MODEL_COLUMNS = ['input_ids', 'attention_mask', 'labels']
split_dataset = split_dataset.map(tokenize_and_format, batched=True)
split_dataset.set_format(type='tensorflow', columns=MODEL_COLUMNS)

# Function to prepare inputs for BERT
def map_example_to_dict(input_ids, attention_mask, labels):
    """Regroup one example into a ``(features, labels)`` pair.

    Args:
        input_ids: value stored under the 'input_ids' key.
        attention_mask: value stored under the 'attention_mask' key.
        labels: value returned unchanged as the second element.

    Returns:
        Tuple of the feature dict (keys 'input_ids' and 'attention_mask') and ``labels``.
    """
    features = dict(input_ids=input_ids, attention_mask=attention_mask)
    return features, labels

# Prepare TensorFlow datasets
batch_size = 16
SHUFFLE_BUFFER_SIZE = 1000


def _make_tf_dataset(split_name, shuffle=False):
    """Build the batched tf.data pipeline for one split of ``split_dataset``.

    Args:
        split_name: key of the split to read ('train', 'validation' or 'test').
        shuffle: when True, shuffle the examples with a buffer of SHUFFLE_BUFFER_SIZE
            before batching.

    Returns:
        A tf.data.Dataset yielding ``(features, labels)`` batches of ``batch_size`` examples,
        with features shaped by ``map_example_to_dict``.
    """
    split = split_dataset[split_name]
    tf_dataset = tf.data.Dataset.from_tensor_slices(
        (split['input_ids'], split['attention_mask'], split['labels'])
    ).map(map_example_to_dict)
    if shuffle:
        tf_dataset = tf_dataset.shuffle(SHUFFLE_BUFFER_SIZE)
    return tf_dataset.batch(batch_size)


# Only the training data is shuffled; validation and test keep their order.
train_dataset = _make_tf_dataset('train', shuffle=True)
validation_dataset = _make_tf_dataset('validation')
test_dataset = _make_tf_dataset('test')

# Define the TensorFlow model
class BERTForClassification(tf.keras.Model):
    """Keras model that places a softmax classification layer on top of an encoder."""

    def __init__(self, bert_model, num_classes):
        """Store the encoder and create the output layer.

        Args:
            bert_model: encoder whose output exposes ``pooler_output`` (read in ``call``).
            num_classes: number of units of the softmax output layer.
        """
        super().__init__()
        self.bert = bert_model
        self.classifier = tf.keras.layers.Dense(units=num_classes, activation='softmax')

    def call(self, inputs):
        """Run the encoder on ``inputs`` and classify its pooled output.

        Returns:
            The softmax layer's output for the encoder's ``pooler_output``.
        """
        return self.classifier(self.bert(inputs).pooler_output)

# Build the classifier around the pretrained encoder
num_classes = 6  # Adjust based on the number of condition labels
classifier = BERTForClassification(model, num_classes=num_classes)

# Configure training: Adam with a small learning rate; the loss takes integer labels and the
# probabilities produced by the softmax output layer
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
classifier.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# Train for three epochs, validating after each one
history = classifier.fit(train_dataset, epochs=3, validation_data=validation_dataset)

# Report loss and accuracy on the held-out test set
test_results = classifier.evaluate(test_dataset)
test_loss, test_accuracy = test_results[0], test_results[1]
print(f'Test results - Loss: {test_loss} - Accuracy: {test_accuracy*100}%')


2024-05-06 14:49:55.921088: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-06 14:49:55.921207: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-06 14:49:56.051861: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Map:   0%|          | 0/10395 [00:00<?, ? examples/s]

Map:   0%|          | 0/578 [00:00<?, ? examples/s]

Map:   0%|          | 0/577 [00:00<?, ? examples/s]

Epoch 1/3


I0000 00:00:1715007101.081807     116 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/3
Epoch 3/3
Test results - Loss: 0.8626721501350403 - Accuracy: 59.965336322784424%
