In [1]:
import os
from pathlib import Path
data_dir = "pubmed-rct\PubMed_20k_RCT_numbers_replaced_with_at_sign"
os.listdir(data_dir)
filenames = [data_dir + "\\" + filename for filename in os.listdir(data_dir)]
filenames

['pubmed-rct\\PubMed_20k_RCT_numbers_replaced_with_at_sign\\dev.txt',
 'pubmed-rct\\PubMed_20k_RCT_numbers_replaced_with_at_sign\\test.txt',
 'pubmed-rct\\PubMed_20k_RCT_numbers_replaced_with_at_sign\\train.txt']

In [2]:
def preprocessor(data_address):
    with open (data_address) as f:
        text = f.readlines()
    preprocessed_data = []
    segment = []  # Temporary storage for the current segment
    total_lines = 0  # Total lines in the current segment

    for line in text:
        if line.startswith("###"):  # Ignore metadata lines
            continue
        elif line.startswith("\n"):  # End of a segment
            # Add total line count to all lines in the segment
            for entry in segment:
                entry["total_lines"] = total_lines
                preprocessed_data.append(entry)
            # Reset for the next segment
            segment = []
            total_lines = 0
            continue
        else:
            total_lines += 1
            line_parts = line.split("\t")
            segment.append({
                "line_number": total_lines,
                "target": line_parts[0],
                "text": line_parts[1],
            })
    return preprocessed_data

In [3]:
import pandas as pd
val_df = pd.DataFrame(preprocessor('pubmed-rct\\PubMed_20k_RCT\\dev.txt'))
train_df = pd.DataFrame(preprocessor('pubmed-rct\\PubMed_20k_RCT\\train.txt'))
test_df = pd.DataFrame(preprocessor('pubmed-rct\\PubMed_20k_RCT\\test.txt'))

In [4]:
train_df.head(5)

Unnamed: 0,line_number,target,text,total_lines
0,1,OBJECTIVE,To investigate the efficacy of 6 weeks of dail...,12
1,2,METHODS,A total of 125 patients with primary knee OA w...,12
2,3,METHODS,Outcome measures included pain reduction and i...,12
3,4,METHODS,Pain was assessed using the visual analog pain...,12
4,5,METHODS,Secondary outcome measures included the Wester...,12


In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_df["target"] = le.fit_transform(train_df["target"])
test_df["target"] = le.fit_transform(test_df["target"])
val_df["target"] = le.fit_transform(val_df["target"])
from datasets import Dataset
train_df = Dataset.from_pandas(train_df)

test_df = Dataset.from_pandas(test_df)

val_df = Dataset.from_pandas(val_df)


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
train_df_5000 = train_df.take(5000)

In [7]:
train_df_5000

Dataset({
    features: ['line_number', 'target', 'text', 'total_lines'],
    num_rows: 5000
})

In [8]:
val_df_5000 = val_df.take(5000)
test_df_5000 = test_df.take(5000)

In [9]:
from transformers import AutoTokenizer, DataCollatorWithPadding
import numpy as np
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint,max_length=512)


In [10]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True,padding=True)


tokenized_datasets = train_df_5000.map(tokenize_function, batched=True,remove_columns=['line_number','text','total_lines'])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")




Map: 100%|██████████| 5000/5000 [00:01<00:00, 4500.50 examples/s]


In [11]:
tokenized_datasets_validation = val_df_5000.map(tokenize_function, batched=True,remove_columns=['line_number','text','total_lines'])

Map: 100%|██████████| 5000/5000 [00:01<00:00, 4329.18 examples/s]


In [13]:
tokenized_datasets

Dataset({
    features: ['target', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5000
})

In [14]:
print(tokenized_datasets.features)
print(tokenized_datasets_validation.features)


{'target': Value(dtype='int32', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}
{'target': Value(dtype='int32', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [15]:
tf_train_dataset = tokenized_datasets.to_tf_dataset(
    columns=["input_ids", "attention_mask", 'token_type_ids'],
    label_cols=["target"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset = tokenized_datasets_validation.to_tf_dataset(
    columns=["input_ids", "attention_mask", 'token_type_ids'],
    label_cols=["target"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)  


Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [16]:
from transformers import TFAutoModelForSequenceClassification,TFAutoModel

model = TFAutoModel.from_pretrained(checkpoint, num_labels=5)



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassi

In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=5,
)

Epoch 1/5


AttributeError: in user code:

    File "e:\My Projects\NLP_disaster\env\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "e:\My Projects\NLP_disaster\env\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "e:\My Projects\NLP_disaster\env\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "e:\My Projects\NLP_disaster\env\lib\site-packages\transformers\modeling_tf_utils.py", line 1630, in train_step
        x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data)

    AttributeError: module 'keras.utils' has no attribute 'unpack_x_y_sample_weight'
