In [None]:
import os
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np


In [None]:

os.environ["KERAS_BACKEND"] = "tensorflow"

# Load your data
df = pd.read_parquet('..\Project Main\data\Appliance_file_subset.parquet')
df.drop(['images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'], axis=1, inplace=True)

# Combine the title and text into one feature
df['combined_text'] = df['title'] + ' ' + df['text']

# Prepare tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5)

# Tokenize the text data
def tokenize_function(texts):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="tf"
    )

# Prepare the inputs
X = df['combined_text'].tolist()
y = df['rating'].astype(int) - 1  # Ratings should be 0-based for the model (e.g., 0 for 1-star, 4 for 5-star)

# Tokenize the text inputs
X_tokenized = tokenize_function(X)

# Convert TensorFlow tensors to NumPy arrays for compatibility with scikit-learn
X_input_ids = X_tokenized['input_ids'].numpy()
X_attention_mask = X_tokenized['attention_mask'].numpy()

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_input_ids, y, test_size=0.2, random_state=42)

# Convert labels to NumPy arrays as well
y_train = y_train.to_numpy()
y_val = y_val.to_numpy()

# Compile the model using TensorFlow's Adam optimizer
model.compile(optimizer=Adam(learning_rate=5e-5), loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'])

# Train the model
model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=3,
    batch_size=16
)

# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation accuracy: {accuracy}')


In [1]:
import os
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from transformers import create_optimizer  # Use create_optimizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
import pandas as pd
import numpy as np

os.environ["KERAS_BACKEND"] = "tensorflow"

# Load your data
df = pd.read_parquet('..\Project Main\data\Appliance_file_subset.parquet')
df.drop(['images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'], axis=1, inplace=True)

# Combine the title and text into one feature
df['combined_text'] = df['title'] + ' ' + df['text']

# Prepare tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5)

# Tokenize the text data
def tokenize_function(texts):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="tf"
    )

# Prepare the inputs
X = df['combined_text'].tolist()
y = df['rating'].astype(int) - 1  # Ratings should be 0-based for the model (e.g., 0 for 1-star, 4 for 5-star)

# Tokenize the text inputs
X_tokenized = tokenize_function(X)

# Convert TensorFlow tensors to NumPy arrays for compatibility with scikit-learn
X_input_ids = X_tokenized['input_ids'].numpy()
X_attention_mask = X_tokenized['attention_mask'].numpy()

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_input_ids, y, test_size=0.2, random_state=42)

# Convert labels to NumPy arrays as well
y_train = y_train.to_numpy()
y_val = y_val.to_numpy()

# Set optimizer parameters
batch_size = 16
num_epochs = 3
steps_per_epoch = len(X_train) // batch_size

# Use the create_optimizer function from transformers
optimizer, lr_schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=steps_per_epoch * num_epochs)

# Compile the model using the transformers-compatible optimizer
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'])

# Train the model
model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=num_epochs,
    batch_size=batch_size
)

# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation accuracy: {accuracy}')


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation accuracy: 0.06449999660253525
