In [None]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
from datasets import list_datasets, load_dataset, ClassLabel
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer

In [None]:
# Directory that holds the raw CSV splits (train.csv, test.csv, dev.csv).
# Set the HEALTH_FACTS_DATA_DIR environment variable to point the notebook at
# another checkout; the original location is kept as the default.
DEFAULT_DATA_DIR = "/home/postd/Documents/gitRepos/Technical-Test-Sorcero/Data/RawDataCsvFormat/"
# File names are appended with plain string concatenation (path + 'train.csv'),
# so joining with "" guarantees a trailing separator even when the override
# omits it.
path = os.path.join(os.environ.get("HEALTH_FACTS_DATA_DIR", DEFAULT_DATA_DIR), "")

In [None]:
# Split name -> CSV file under `path` (the validation split lives in dev.csv).
split_files = {
    split: path + filename
    for split, filename in (('train', 'train.csv'),
                            ('test', 'test.csv'),
                            ('validation', 'dev.csv'))
}

# Columns dropped right after loading; presumably only the columns needed for
# classification remain afterwards (later cells read 'claim' and 'label').
unused_columns = [
    'claim_id', 'explanation', 'main_text', 'date_published',
    'fact_checkers', 'sources', 'subjects', 'labels',
]

health_facts = load_dataset('csv', data_files=split_files).remove_columns(unused_columns)

In [None]:
# Replace the schema entry of the 'label' column with a ClassLabel feature and
# cast every split to that schema. The position of each name in the list is
# its integer class id.
label_feature = ClassLabel(names=["false", "unproven", "mixture", "true"])

# Work on a copy so the feature mapping of the loaded 'train' split is not
# mutated in place.
new_features = health_facts['train'].features.copy()
new_features['label'] = label_feature

health_facts = health_facts.cast(new_features)

In [None]:
# Tokenizer of the pretrained checkpoint; this should stay the same checkpoint
# name as the one used to load the classification model.
tokenizer_checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
def encode_hf(dset):
    """Tokenize the ``claim`` column of ``dset`` with the notebook's tokenizer.

    Args:
        dset: mapping with a ``'claim'`` entry (a single example or a batch,
            depending on how the function is mapped over the dataset).

    Returns:
        The tokenizer output for the claims, produced with truncation enabled
        and ``padding='max_length'``.
    """
    claims = dset['claim']
    encoded = tokenizer(claims, padding='max_length', truncation=True)
    return encoded

In [None]:
# Tokenize every split; batched=True hands encode_hf batches of rows instead
# of one row at a time.
hftokenized = health_facts.map(
    encode_hf,
    batched=True,
)

In [None]:
# Pull the tokenized train and test splits out of the DatasetDict.
train_dataset_hf, test_dataset_hf = (
    hftokenized[split_name] for split_name in ("train", "test")
)
# The validation split is currently not used; uncomment to enable it:
#val_dataset_hf = hftokenized["validation"]

In [None]:
# Size the classification head from the dataset's ClassLabel feature instead
# of repeating the class count as a literal, so the head cannot drift out of
# sync with the label names declared when the 'label' column was cast.
num_labels = health_facts['train'].features['label'].num_classes
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=num_labels)

In [None]:
# Drop the raw 'claim' text column from each split and switch the split to the
# "tensorflow" output format.
tf_train_hf, tf_test_hf = (
    split.remove_columns(["claim"]).with_format("tensorflow")
    for split in (train_dataset_hf, test_dataset_hf)
)
# Validation counterpart (needs val_dataset_hf to be enabled first):
#tf_val_hf = val_dataset_hf.remove_columns(["claim"]).with_format("tensorflow")

In [None]:
def _to_dense(column):
    """Return ``column`` as a dense tensor.

    Columns that expose ``to_tensor()`` (ragged tensors) are densified; any
    other column is returned unchanged, so a column that is already dense does
    not raise ``AttributeError``. Which of the two a formatted dataset returns
    appears to depend on the installed ``datasets`` version -- TODO confirm.
    """
    return column.to_tensor() if hasattr(column, "to_tensor") else column


def _make_tf_dataset(tf_split, shuffle, batch_size=8):
    """Build a batched ``tf.data.Dataset`` of ``(model inputs, label)`` pairs.

    Args:
        tf_split: a dataset split already switched to the "tensorflow" format
            that still contains the tokenizer outputs and a ``'label'`` column.
        shuffle: when true, shuffle with a buffer as large as the split.
        batch_size: number of examples per batch.

    Returns:
        A ``(features, dataset)`` tuple: the dict of dense model-input tensors
        keyed by ``tokenizer.model_input_names`` and the batched dataset.
    """
    features = {name: _to_dense(tf_split[name]) for name in tokenizer.model_input_names}
    dataset = tf.data.Dataset.from_tensor_slices((features, tf_split["label"]))
    if shuffle:
        dataset = dataset.shuffle(len(tf_split))
    return features, dataset.batch(batch_size)


# Full training split.
train_features, train_tf_hf = _make_tf_dataset(tf_train_hf, shuffle=True)

# Fixed-seed 1000-example subset of the training split for quick experiments.
train_dataset_hf_small = train_dataset_hf.shuffle(seed=61).select(range(1000))
tf_train_hf_small = train_dataset_hf_small.remove_columns(["claim"]).with_format("tensorflow")
train_features_sm, train_tf_hf_sm = _make_tf_dataset(tf_train_hf_small, shuffle=True)

# Evaluation data is deliberately NOT shuffled: shuffling does not change the
# metrics, and a stable order keeps predictions aligned with the labels.
test_features, test_tf_hf = _make_tf_dataset(tf_test_hf, shuffle=False)

# Validation counterpart (needs tf_val_hf to be enabled first):
#val_features, val_tf_hf = _make_tf_dataset(tf_val_hf, shuffle=False)

In [None]:
# Training configuration: Adam optimizer, sparse categorical cross-entropy
# computed with from_logits=True, and sparse categorical accuracy as the
# reported metric.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
xent_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.metrics.SparseCategoricalAccuracy()

model.compile(optimizer=adam_optimizer, loss=xent_loss, metrics=accuracy_metric)

# Train for 3 epochs on the small training subset.
# NOTE(review): validation_data is the *test* split here, so the test set is
# seen during training-time monitoring -- consider using the validation split.
model.fit(train_tf_hf_sm, validation_data=test_tf_hf, epochs=3)