# Guide to use RoBERTa model using Transformers

In [1]:
# Install the transformers library
!pip install datasets transformers accelerate



## Import Required modules

In [2]:
# Import required packages
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from transformers import pipeline

2025-01-29 21:34:43.348614: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-29 21:34:43.541800: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738175683.618929    7008 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738175683.641536    7008 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-29 21:34:43.824315: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [3]:
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

## Approach-1 Using pipeline

In [4]:
sentiment_task = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0


In [5]:
sentiment = sentiment_task("I watched Oppenheimer movie, one the finest and incredible movie")
sentiment

[{'label': 'POSITIVE', 'score': 0.9998806715011597}]

In [6]:
sentiment_neg = sentiment_task("This movie is worst and terrible")
sentiment_neg

[{'label': 'NEGATIVE', 'score': 0.9998193383216858}]

## Tokenization and Model initialization

In [7]:
model_name = "siebert/sentiment-roberta-large-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

## Dataset preparation

- Text to Tokens
- Tokens to Unique IDS
- Map Tokens with unique IDs in a dictionary

In [8]:
class SimpleDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        return len(self.tokenized_texts["input_ids"])

    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.tokenized_texts.items()}

## Approach -2 Sample User Text- Using pre-trained models

In [9]:
tweets = [
    "Just had the best meal ever at my favorite restaurant! The food was amazing, and the service was top-notch. Feeling so satisfied and happy right now! 😃🍔🥗 #Foodie #HappyCustomer",
    "Feeling really frustrated after sitting in traffic for hours today. Missed an important meeting and wasted so much time. Not a great start to the day. 😤🚗 #TrafficWoes #Stressed"
]

## Tokenize the user input text

In [10]:
tokenized_texts = tokenizer(tweets,truncation=True,padding=True)
pred_dataset = SimpleDataset(tokenized_texts)

## Create Model Trainer

In [11]:
trainer = Trainer(model=model)

In [12]:
predictions = trainer.predict(pred_dataset) #run predictions

## Model Evaluation- Predictions

In [13]:
preds = predictions.predictions.argmax(-1) #get prediction labels
labels = pd.Series(preds).map(model.config.id2label)  #convert prediction into POSITIVE or NEGATIVE
scores = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1,keepdims=True)).max(1) # ACCURACY SCORE

In [14]:
scores

array([0.9989399 , 0.99949604], dtype=float32)

In [15]:
labels

0    POSITIVE
1    NEGATIVE
dtype: object

## RoBERTa Sentiment results on user input

In [16]:
df = pd.DataFrame(list(zip(tweets,preds,labels,scores)), columns=['text','pred','label','score'])
df.head()

Unnamed: 0,text,pred,label,score
0,Just had the best meal ever at my favorite res...,1,POSITIVE,0.99894
1,Feeling really frustrated after sitting in tra...,0,NEGATIVE,0.999496


## Approach-3 On Custom Dataset

Download the dataset and upload the dataset in the Google Colab to proceed further

Dataset: [Twitter Sentiment Analysis Dataset](https://www.kaggle.com/datasets/yasserh/twitter-tweets-sentiment-dataset)

In [None]:
# Alternatively, you can use the `kagglehub` package to download the dataset
import kagglehub

# Download latest version
path = kagglehub.dataset_download("yasserh/twitter-tweets-sentiment-dataset")

print("Path to dataset files:", path)

In [19]:
data = pd.read_csv("./data/Tweets.csv")
data.tail()

Unnamed: 0,textID,text,selected_text,sentiment
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive
27480,6f7127d9d7,All this flirting going on - The ATG smiles...,All this flirting going on - The ATG smiles. Y...,neutral


In [20]:
data_tweets = data['text'].dropna().astype('str').tolist()

In [21]:
tokenized_data = tokenizer(data_tweets,truncation=True,padding=True)
pred_data = SimpleDataset(tokenized_data)

In [22]:
evaluate = trainer.predict(pred_data) #will take some time

In [23]:
new_preds = evaluate.predictions.argmax(-1) #get prediction labels
new_labels = pd.Series(new_preds).map(model.config.id2label) #convert prediction into POSITIVE or NEGATIVE
new_scores = (np.exp(evaluate[0])/np.exp(evaluate[0]).sum(-1,keepdims=True)).max(1) # ACCURACY SCORE

In [24]:
final_df = pd.DataFrame(list(zip(data_tweets,new_preds,new_labels,new_scores)), columns=['tweets','target','sentiment','accuracy'])
final_df.head(10)

Unnamed: 0,tweets,target,sentiment,accuracy
0,"I`d have responded, if I were going",0,NEGATIVE,0.994793
1,Sooo SAD I will miss you here in San Diego!!!,0,NEGATIVE,0.995364
2,my boss is bullying me...,0,NEGATIVE,0.999434
3,what interview! leave me alone,0,NEGATIVE,0.997404
4,"Sons of ****, why couldn`t they put them on t...",0,NEGATIVE,0.998303
5,http://www.dothebouncy.com/smf - some shameles...,1,POSITIVE,0.994551
6,2am feedings for the baby are fun when he is a...,1,POSITIVE,0.997768
7,Soooo high,1,POSITIVE,0.997417
8,Both of you,0,NEGATIVE,0.829039
9,Journey!? Wow... u just became cooler. hehe....,1,POSITIVE,0.99725
