In [None]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf
import h5py
from tensorflow.python.keras.saving import hdf5_format
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
# from keras.saving.hdf5_format import save_attributes_to_hdf5_group

In [None]:
# Read both CSV files; the relative names are resolved against the current
# working directory. 'train.csv' is read first, then 'test.csv'.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Drop every row that has a missing value in any column (dropna's defaults,
# spelled out) and rebind train_df to the filtered frame.
train_df = train_df.dropna(axis=0, how='any')

In [None]:
# Extract the review text and the rating column as NumPy arrays.
# `.to_numpy()` is used instead of `.values` because it always returns an
# ndarray, whatever dtype pandas picked for the column.
x_train = train_df['Review'].to_numpy()
# Shift the ratings down by one so the smallest label is 0.
# NOTE(review): assumes 'overall' holds ratings 1..5, matching num_labels=5
# used when the classifier is created; confirm against the data.
y_train = train_df['overall'].to_numpy() - 1

In [None]:
# Fine-tune a pre-trained BERT sequence classifier on the review text.
#
# Everything this cell needs is derived from `train_df`, so the cell can be
# re-run on its own: it does not read the notebook-level `y_train`, which it
# rebinds to the training split further down.

# Tokenizer and pre-trained BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tf_bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Hyperparameters
max_length = 128  # Maximum sequence length (longer inputs are truncated)
batch_size = 32
epochs = 3

# Inputs are the raw review strings; labels are the rating minus one.
# NOTE(review): assumes 'overall' holds ratings 1..5 so labels land in 0..4,
# matching num_labels=5 above; confirm against the data.
X = train_df['Review'].tolist()
y = (train_df['overall'].to_numpy() - 1).tolist()

# Hold out 20% of the rows. From here on `y_train` is the list of labels for
# the training split only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Each split is tokenized in one call, so padding=True pads to the longest
# sequence of that split (capped at max_length by truncation).
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)

# Pair the encoded features (as a plain dict of lists) with the labels.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# from_logits=True: the loss is configured to receive raw scores, not probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
tf_bert_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# The shuffle buffer covers the whole training split so every epoch is a full
# shuffle rather than a shuffle within a 1000-element window.
# `batch_size` is deliberately not passed to fit(): the dataset is already
# batched, and Keras documents that it must not be given for dataset inputs.
history = tf_bert_model.fit(
    train_dataset.shuffle(len(X_train)).batch(batch_size),
    epochs=epochs,
    validation_data=test_dataset.batch(batch_size),
)

# NOTE: the same held-out split is used as validation data during training and
# as the "test" set here; `test_df` is not used in this cell.
test_loss, test_accuracy = tf_bert_model.evaluate(test_dataset.batch(batch_size))
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')
