In [None]:
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from sklearn.model_selection import train_test_split
import pandas as pd

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used below: the stop-word corpus and the Punkt
# sentence-tokenizer data that word_tokenize relies on.
# Recent NLTK releases (3.9+) load the tokenizer from 'punkt_tab' rather than
# the older pickled 'punkt' package, so both are requested to keep the
# notebook runnable regardless of the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Pattern matching every character that is neither an ASCII letter nor a space
regex = re.compile("[^a-zA-Z ]")
# English stop words, held in a set for fast membership checks while filtering
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Clean a piece of text into a space-separated string of kept words.

    Steps, in order:
      1. lower-case the input;
      2. replace every character matched by the module-level ``regex``
         (anything that is not an ASCII letter or a space) with a single
         space, so neighbouring words are never glued together;
      3. split the result into tokens with ``word_tokenize``;
      4. drop tokens that appear in the module-level ``stop_words`` set.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces (``''`` when none remain).
    """
    letters_only = regex.sub(' ', text.lower())
    kept_tokens = filter(
        lambda token: token not in stop_words,
        word_tokenize(letters_only),
    )
    return ' '.join(kept_tokens)


# GPU check: report how many GPUs TensorFlow can see
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Load the review data and drop the columns that are not used below
df = pd.read_parquet('../Project Main/data/Appliance_file_subset.parquet')
df = df.drop(columns=['images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'])

# Join title and review body with a space so the last word of the title does
# not fuse with the first word of the body, then clean the combined string.
# Missing titles/bodies are treated as empty text instead of raising.
df['combined_text'] = (
    df['title'].fillna('') + ' ' + df['text'].fillna('')
).map(preprocess_text)


In [None]:
# Load the pretrained tokenizer and a sequence-classification model with
# five output labels; both come from the same checkpoint name.
pretrained_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(pretrained_name)
model = TFRobertaForSequenceClassification.from_pretrained(
    pretrained_name,
    num_labels=5,
)

# Tokenize the text data
def tokenize_function(texts, max_length=128):
    """Encode texts with the module-level ``tokenizer`` as fixed-length tensors.

    Args:
        texts: The text(s) to encode; passed straight through to ``tokenizer``.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value this notebook previously hard-coded.

    Returns:
        Whatever ``tokenizer`` returns for these arguments (requested as
        TensorFlow tensors via ``return_tensors="tf"``).
    """
    return tokenizer(
        texts,
        padding='max_length',   # pad shorter sequences up to max_length
        truncation=True,        # cut longer sequences down to max_length
        max_length=max_length,
        return_tensors="tf"
    )

# Prepare the inputs and labels
X = df['combined_text'].tolist()
# Shift ratings down by one so the labels are zero-based class ids
# (assumes ratings start at 1 -- TODO confirm against the data).
# Built directly as an int32 NumPy array; no detour through a TF tensor.
y = (df['rating'].astype(int).values - 1).astype('int32')
X_tokenized = tokenize_function(X)

# Convert input_ids and attention_mask to NumPy arrays for train_test_split
input_ids = X_tokenized['input_ids'].numpy()
attention_mask = X_tokenized['attention_mask'].numpy()

# Split the dataset into training and validation sets (80/20, fixed seed)
X_train_ids, X_val_ids, X_train_mask, X_val_mask, y_train, y_val = train_test_split(
    input_ids, attention_mask, y, test_size=0.2, random_state=42
)

# Convert the data to TensorFlow Datasets.
# The training set is reshuffled on every pass so each epoch sees the
# examples in a different order; the seed keeps runs reproducible.
train_dataset = (
    tf.data.Dataset.from_tensor_slices(({'input_ids': X_train_ids, 'attention_mask': X_train_mask}, y_train))
    .shuffle(buffer_size=len(y_train), seed=42)
    .batch(16)
)
# Validation data is only evaluated, so its order is left untouched.
val_dataset = tf.data.Dataset.from_tensor_slices(({'input_ids': X_val_ids, 'attention_mask': X_val_mask}, y_val)).batch(16)

# Compile the model
# Fine-tuning a pretrained transformer needs a much smaller step size than
# Keras' default Adam learning rate (1e-3); a rate that high typically
# destroys the pretrained weights and training fails to converge.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    # The loss is configured with from_logits=True, i.e. it expects raw scores
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train the model (inside GPU context)
with tf.device('/GPU:0'):
    model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=3
    )

# Evaluate the model on the held-out validation set
loss, accuracy = model.evaluate(val_dataset)
print(f'Validation accuracy: {accuracy}')

In [None]:
# Persist the trained model to disk.
# NOTE(review): only the model is written here; the tokenizer is not saved
# alongside it -- confirm whether downstream loading needs it.
saved_model_path = '../../trained_model'
model.save(saved_model_path)