#Import Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install -q opendatasets
!pip install -q kaggle



In [3]:
import opendatasets as od
import pandas as pd
import numpy as np
import torch
import transformers
from transformers import RobertaModel, RobertaTokenizer

import re
import bz2

In [4]:
from tqdm import tqdm #To get progress while training.
from torch.utils.data import Dataset, DataLoader
import json

In [5]:
# Set the processing to GPU if possible.
from torch import cuda
device =  'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [6]:
from sklearn.model_selection import train_test_split


#Data Cleaning and Preprocessing

In [7]:
json_file_path = "/content/drive/MyDrive/Datasets/Software/Software.json"
with open(json_file_path) as f:
    data = [json.loads(line) for line in f]

# Create a DataFrame
df = pd.DataFrame(data)

# Select only the desired columns
df = df[['overall', 'reviewText', 'summary']]


In [8]:
df.head()

Unnamed: 0,overall,reviewText,summary
0,4.0,The materials arrived early and were in excell...,Material Great
1,4.0,I am really enjoying this book with the worksh...,Health
2,1.0,"IF YOU ARE TAKING THIS CLASS DON""T WASTE YOUR ...",ARE YOU KIDING ME?
3,3.0,This book was missing pages!!! Important pages...,missing pages!!
4,5.0,I have used LearnSmart and can officially say ...,Best study product out there!


In [9]:
df.isnull().sum()

overall        0
reviewText    66
summary       56
dtype: int64

In [10]:
df = df.dropna()

In [11]:
df.isnull().sum()

overall       0
reviewText    0
summary       0
dtype: int64

In [12]:
df['overall'].value_counts()

5.0    212374
1.0    102528
4.0     73586
3.0     39390
2.0     31442
Name: overall, dtype: int64

In [13]:
# Define the desired counts for each label
counts = {'1.0': 5000, '2.0': 5000, '3.0': 10000, '4.0': 5000, '5.0': 5000}

new_df = pd.DataFrame(columns=df.columns)

# Sample data for each label and concatenate into the new DataFrame
for label, count in counts.items():
    label_df = df[df['overall'] == float(label)]
    sampled_df = label_df.sample(n=count, replace=True, random_state=42)
    new_df = pd.concat([new_df, sampled_df], ignore_index=True)

# Display the value counts of the new DataFrame
print(new_df['overall'].value_counts())

3.0    10000
1.0     5000
2.0     5000
4.0     5000
5.0     5000
Name: overall, dtype: int64


In [14]:
# Assume that 0 -> Negative, 1 -> 'Neutral' and 2 -> 'Positive'
label_mapping = {1.0: 0, 2.0: 0, 3.0: 1, 4.0: 2, 5.0: 2}

new_df['overall'] = new_df['overall'].replace(label_mapping)
print(new_df['overall'].value_counts())

0.0    10000
1.0    10000
2.0    10000
Name: overall, dtype: int64


In [15]:
def normalize_texts(df, text_column):
    # Exclude all non-alphanumeric characters except comma and dot
    NON_ALPHANUM = re.compile(r'[^a-z0-9,.\s]')

    # Exclude all characters that are not lowercase letters, digits, or whitespace
    NON_ASCII = re.compile(r'[\x20-\x7E]+')

    normalized_texts = []
    for text in df[text_column]:
        ascii_chars = NON_ASCII.findall(text)
        lower_text = ascii_chars[0].lower()
        alphanumeric_text = NON_ALPHANUM.sub(r'', lower_text)
        normalized_texts.append(alphanumeric_text)

    return normalized_texts

new_df['reviewText'] = normalize_texts(new_df, 'reviewText')
new_df['summary'] = normalize_texts(new_df, 'summary')


In [16]:
new_df.drop(labels='summary',axis=1, inplace=True)
new_df.head()

Unnamed: 0,overall,reviewText
0,0.0,program does not work with apple computer as a...
1,0.0,this product has disappointed me several times...
2,0.0,never ordered
3,0.0,awful. others have given the long list of issu...
4,0.0,unable to register software with serial number...


In [17]:
X,y = new_df['reviewText'].values,new_df['overall'].values

In [18]:
y

array([0., 0., 0., ..., 2., 2., 2.])

In [19]:
x_train, x_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Split the temporary set into testing and validation sets
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

print("Training set shape:", x_train.shape, y_train.shape)
print("Testing set shape:", x_test.shape, y_test.shape)
print("Validation set shape:", x_val.shape, y_val.shape)

Training set shape: (21000,) (21000,)
Testing set shape: (4500,) (4500,)
Validation set shape: (4500,) (4500,)


In [20]:
type(x_train)

numpy.ndarray

#Model

In [21]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
# EPOCHS = 1
LEARNING_RATE = 1e-05

# Use the pretrained Roberta Tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [22]:
class SentimentData():
  '''
  Custom class for handling sentiment data. To be able to pass to the DataLoader.
  __getitem__ function do the tokenization for each text sample.

  '''
  def __init__(self, x, y, tokenizer, max_len):
      self.tokenizer = tokenizer
      self.text = x
      self.targets = y
      self.max_len = max_len

  def __len__(self):
      return len(self.text)

  def __getitem__(self, index):
      text = str(self.text[index])
      text = " ".join(text.split())

      inputs = self.tokenizer.encode_plus(
          text,
          None,
          add_special_tokens=True,
          max_length=self.max_len,
          pad_to_max_length=True,
          return_token_type_ids=True,
          truncation=True
      )
      ids = inputs['input_ids']
      mask = inputs['attention_mask']
      token_type_ids = inputs["token_type_ids"]


      return {
          'ids': torch.tensor(ids, dtype=torch.long),
          'mask': torch.tensor(mask, dtype=torch.long),
          'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
          'targets': torch.tensor(self.targets[index], dtype=torch.float)
      }

In [23]:
training_set = SentimentData(x_train, y_train, tokenizer, MAX_LEN)
testing_set = SentimentData(x_train, y_train, tokenizer, MAX_LEN)
validation_set = SentimentData(x_val, y_val, tokenizer, MAX_LEN)

In [24]:
# training_set.text = training_set.text.reset_index(drop=True)
# training_set.targets = training_set.targets.reset_index(drop=True)


In [25]:
# print(type(training_set))
# print(vars(training_set))

In [26]:
# Assuming sentiment_data is your SentimentData object
print(dir(training_set))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'max_len', 'targets', 'text', 'tokenizer']


In [27]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

test_params = {'batch_size': 1,
                'shuffle': False,
                'num_workers': 0
                }

validation_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
validation_loader = DataLoader(validation_set, **validation_params)

In [28]:
print(type(training_loader))
print(vars(training_loader))

<class 'torch.utils.data.dataloader.DataLoader'>
{'dataset': <__main__.SentimentData object at 0x7b18ab6573d0>, 'num_workers': 2, 'prefetch_factor': 2, 'pin_memory': False, 'pin_memory_device': '', 'timeout': 0, 'worker_init_fn': None, '_DataLoader__multiprocessing_context': None, '_dataset_kind': 0, 'batch_size': 32, 'drop_last': False, 'sampler': <torch.utils.data.sampler.RandomSampler object at 0x7b1780035b10>, 'batch_sampler': <torch.utils.data.sampler.BatchSampler object at 0x7b17800368f0>, 'generator': None, 'collate_fn': <function default_collate at 0x7b17ebe20280>, 'persistent_workers': False, '_DataLoader__initialized': True, '_IterableDataset_len_called': None, '_iterator': None}


In [29]:
class RobertaClass(torch.nn.Module):
  '''
  Custom PyTorch module for sentiment analysis using a fine-tuned RoBERTa model.
  - l1: Pre-trained RoBERTa model loaded from "roberta-base" using Hugging Face Transformers.
  - pre_classifier: Linear layer for additional transformation before classification.
  - dropout: Dropout layer for regularization.
  - classifier: Linear layer for final sentiment classification.
  '''
  def __init__(self):
      super(RobertaClass, self).__init__()
      self.l1 = RobertaModel.from_pretrained("roberta-base")
      self.pre_classifier = torch.nn.Linear(768, 768)
      self.dropout = torch.nn.Dropout(0.3)
      self.classifier = torch.nn.Linear(768, 3)

  def forward(self, input_ids, attention_mask, token_type_ids):
      output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
      hidden_state = output_1[0]
      pooler = hidden_state[:, 0]
      pooler = self.pre_classifier(pooler)
      pooler = torch.nn.ReLU()(pooler)
      pooler = self.dropout(pooler)
      output = self.classifier(pooler)
      return output

In [30]:
model = RobertaClass()
model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [31]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [32]:
def calculate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [33]:
# Validation function
def validate(model, validation_loader, loss_function, device):
    model.eval()
    val_loss = 0
    n_correct = 0
    nb_val_steps = 0
    nb_val_examples = 0

    with torch.no_grad():
        for _, data in tqdm(enumerate(validation_loader, 0)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)

            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            val_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calculate_accuracy(big_idx, targets)

            nb_val_steps += 1
            nb_val_examples += targets.size(0)

    val_accuracy = (n_correct * 100) / nb_val_examples
    val_loss /= nb_val_steps

    print(f'Validation Loss: {val_loss}')
    print(f'Validation Accuracy: {val_accuracy}%')
    print()

    return

In [34]:
# Training loop
def train(model, training_loader, validation_loader, loss_function, optimizer, device, epochs=5):
    for epoch in range(epochs):
        tr_loss = 0
        n_correct = 0
        nb_tr_steps = 0
        nb_tr_examples = 0
        model.train()

        for _, data in tqdm(enumerate(training_loader, 0)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)

            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calculate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print()
        print(f'Training Epoch {epoch + 1}')
        print(f'Training Loss: {tr_loss / nb_tr_steps}')
        print(f'Training Accuracy: {(n_correct * 100) / nb_tr_examples}%')

        # Validation
        val_accuracy = validate(model, validation_loader, loss_function, device)
    return

In [None]:
train(model, training_loader, validation_loader, loss_function, optimizer, device, epochs=5)

657it [13:36,  1.24s/it]


Training Epoch 1
Training Loss: 0.7625234361562192
Training Accuracy: 64.49047619047619%



282it [00:58,  4.84it/s]

Validation Loss: 0.7585294572174126
Validation Accuracy: 65.2%




410it [08:33,  1.25s/it]

In [None]:
# # Training loop
# def train(epoch):
#     tr_loss = 0
#     n_correct = 0
#     nb_tr_steps = 0
#     nb_tr_examples = 0
#     model.train()
#     for _,data in tqdm(enumerate(training_loader, 0)):
#         ids = data['ids'].to(device, dtype = torch.long)
#         mask = data['mask'].to(device, dtype = torch.long)
#         token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
#         targets = data['targets'].to(device, dtype = torch.long)

#         outputs = model(ids, mask, token_type_ids)
#         loss = loss_function(outputs, targets)
#         tr_loss += loss.item()
#         big_val, big_idx = torch.max(outputs.data, dim=1)
#         n_correct += calcuate_accuracy(big_idx, targets)

#         nb_tr_steps += 1
#         nb_tr_examples+=targets.size(0)

#         if _%500==0:
#             loss_step = tr_loss/nb_tr_steps
#             accu_step = (n_correct*100)/nb_tr_examples
#             print(f"Training Loss per 500 steps: {loss_step}")
#             print(f"Training Accuracy per 500 steps: {accu_step}")
#             print()

#         optimizer.zero_grad()
#         loss.backward()
#         # # When using GPU
#         optimizer.step()

#     print()
#     print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
#     epoch_loss = tr_loss/nb_tr_steps
#     epoch_accu = (n_correct*100)/nb_tr_examples
#     print(f"Training Loss for Epoch {epoch}: {epoch_loss}")
#     print(f"Training Accuracy for Epoch {epoch}: {epoch_accu}")
#     print()

#     return

In [None]:
# def valid(model, testing_loader):
#     model.eval()
#     n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
#     with torch.no_grad():
#         for _, data in tqdm(enumerate(testing_loader, 0)):
#             ids = data['ids'].to(device, dtype = torch.long)
#             mask = data['mask'].to(device, dtype = torch.long)
#             token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
#             targets = data['targets'].to(device, dtype = torch.long)
#             outputs = model(ids, mask, token_type_ids).squeeze()
#             loss = loss_function(outputs, targets)
#             tr_loss += loss.item()
#             big_val, big_idx = torch.max(outputs.data, dim=1)
#             n_correct += calcuate_accuracy(big_idx, targets)

#             nb_tr_steps += 1
#             nb_tr_examples+=targets.size(0)

#             if _%5000==0:
#                 loss_step = tr_loss/nb_tr_steps
#                 accu_step = (n_correct*100)/nb_tr_examples
#                 print(f"Validation Loss per 100 steps: {loss_step}")
#                 print(f"Validation Accuracy per 100 steps: {accu_step}")
#     epoch_loss = tr_loss/nb_tr_steps
#     epoch_accu = (n_correct*100)/nb_tr_examples
#     print(f"Validation Loss Epoch: {epoch_loss}")
#     print(f"Validation Accuracy Epoch: {epoch_accu}")

#     return epoch_accu


In [None]:
# acc = valid(model, testing_loader)
# print("Accuracy on test data = %0.2f%%" % acc)

1it [00:00,  1.22it/s]

Validation Loss per 100 steps: 0.44826605916023254
Validation Accuracy per 100 steps: 100.0


3938it [14:27,  4.54it/s]

Validation Loss Epoch: 0.49943950123339514
Validation Accuracy Epoch: 79.98095238095237
Accuracy on test data = 79.98%





In [None]:
import os

In [None]:
model_file = 'pytorch_roberta_sentiment_3.bin'
vocab_file = './'
models_folder = '/content/drive/MyDrive/Models'

output_model_file = os.path.join(models_folder, model_file)
output_vocab_file = os.path.join(models_folder, vocab_file)

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

KeyboardInterrupt: 

# Inferencing

In [None]:
model = RobertaClass()
model.load_state_dict()

In [None]:
from pathlib import Path

# Accepts a text as a list and calculate the label. Save the results after that.
def inference(model, tokenizer, text_list, max_len=MAX_LEN):
  model.to(device)
  model.eval()

  normalize_text = normalize_texts(text_list)
  inference_set = SentimentData(pd.DataFrame({'Text': normalize_text}), tokenizer, MAX_LEN)

  # DataLoader for inference
  inference_params = {'batch_size': 1,
                      'shuffle': False,
                      'num_workers': 0}

  inference_loader = DataLoader(inference_set, **inference_params)

  # Perform inference on the list of texts
  results = []
  for batch in inference_loader:
      input_ids = batch['ids']
      attention_mask = batch['mask']
      token_type_ids = batch['token_type_ids']

      with torch.no_grad():
          output = model(input_ids, attention_mask, token_type_ids)

      predicted_class = torch.argmax(output, dim=1).item()
      results.append({'Text': batch['text'][0], 'Predicted Class': predicted_class})

  result_df = pd.DataFrame(results)

  # Save the DataFrame to a CSV file in the specified folder
  result_df.to_csv(Path('/content/drive/MyDrive/ROBERTa') / 'inference_results.csv', index=False)



In [None]:

inference(model, tokenizer, text_list)