In [None]:
pip install transformers

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the CSV read later in this
# notebook lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import tensorflow as tf
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer


In [None]:
# Load the raw reviews table from the mounted Drive folder.
# NOTE(review): 'unicode_escape' also interprets backslash escape sequences
# found in the file - confirm this is the intended encoding for this CSV.
REVIEWS_CSV_PATH = r"/content/drive/MyDrive/amazon_reviews.csv"
df = pd.read_csv(REVIEWS_CSV_PATH, encoding='unicode_escape')


In [None]:
# Keep only rows where both the review text and its rating are present.
required_columns = ['Review', 'Rate']
df = df.dropna(subset=required_columns)


In [None]:
# Convert 'Rate' column to numeric.
# errors='coerce' turns every unparseable rating into NaN. Those rows must be
# dropped here: a NaN rating is neither >= 4 nor <= 2, so the labelling step
# would otherwise silently tag it as 'neutral'.
df['Rate'] = pd.to_numeric(df['Rate'], errors='coerce')
df = df.dropna(subset=['Rate'])


In [None]:
# Features are the review texts; the target is derived from the star rating.
X = df['Review']
y = df['Rate']


def _rating_to_sentiment(rating):
    """Translate a single rating into a sentiment name.

    Ratings of 4 and above are 'positive', ratings of 2 and below are
    'negative'. Every other value is 'neutral' - this includes NaN, for
    which both comparisons evaluate to False.
    """
    if rating >= 4:
        return 'positive'
    if rating <= 2:
        return 'negative'
    return 'neutral'


# One sentiment name per row, in the same order as X / y.
sentiment_labels = [_rating_to_sentiment(rating) for rating in y]


In [None]:
# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Cap the sequence length. With truncation=True alone, reviews are only cut at
# the model maximum (512 tokens for roberta-base), and padding=True pads every
# row up to the longest review, so a single long review inflates the whole
# dataset. That is the likely cause of the ResourceExhaustedError seen when
# training. 128 tokens is a starting point; raise it if memory allows.
MAX_LENGTH = 128

# Tokenize the text. astype(str) guards against non-string cells (e.g. a
# review parsed as a number by read_csv), which the tokenizer would reject.
encodings = tokenizer(
    X.astype(str).tolist(),
    truncation=True,
    padding=True,
    max_length=MAX_LENGTH,
)


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Integer class ID for each sentiment name (the model is trained on IDs).
label2id = {
    'positive': 2,
    'neutral': 1,
    'negative': 0,
}

# Encode every row's sentiment name as its class ID, preserving row order.
labels = [label2id[sentiment_name] for sentiment_name in sentiment_labels]


In [None]:
# Create TensorFlow dataset of (features dict, label) pairs.
# The shuffle buffer must cover the whole dataset for a uniform shuffle.
# len(encodings) is NOT the number of examples: the tokenizer output is
# dict-like, so its len() is the number of fields (input_ids, attention_mask),
# which would leave the data almost unshuffled. Use the label count instead.
num_examples = len(labels)
dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), labels)).shuffle(num_examples)


In [None]:
# Load pretrained RoBERTa with a sequence-classification head sized for the
# three sentiment classes (negative / neutral / positive).
model = TFRobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=3,
)


Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [None]:
# Adam with a small step size for fine-tuning.
LEARNING_RATE = 2e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

# Labels are integer class IDs and the loss is told to expect raw logits
# (from_logits=True) rather than probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


In [None]:
# Attach the optimizer, loss and accuracy metric to the model.
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy'],
)


In [None]:
# Fine-tune for a single pass over the data in mini-batches of 16.
train_batches = dataset.batch(16)
model.fit(train_batches, epochs=1)


ResourceExhaustedError: ignored

In [None]:
# Sample review text to classify; it is tokenized and passed to the model
# in the cells that follow.
review = "This product is very good."


In [None]:
# Tokenize the input review.
# Call the tokenizer directly: encode_plus() is deprecated in favour of
# __call__, which accepts the same arguments for a single text.
# return_tensors='tf' yields TensorFlow tensors with a leading batch axis.
input_encoding = tokenizer(
    review,
    truncation=True,
    padding=True,
    return_tensors='tf'
)

In [None]:
# Run the model on the single tokenized review.
input_ids = input_encoding['input_ids']
attention_mask = input_encoding['attention_mask']
model_inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
predictions = model.predict(model_inputs)

# The highest-scoring logit of the first (only) row is the predicted class ID.
predicted_label = tf.argmax(predictions.logits, axis=1).numpy()[0]

# Invert label2id so the class ID can be mapped back to its sentiment name.
id2label = {class_id: sentiment_name for sentiment_name, class_id in label2id.items()}
predicted_sentiment = id2label[int(predicted_label)]




In [None]:
# Report the sentiment predicted for the sample review.
sentiment_message = f"Predicted Sentiment: {predicted_sentiment}"
print(sentiment_message)

Predicted Sentiment: positive
