<a href="https://colab.research.google.com/github/devan1510/machine-learning-projects/blob/main/sentiment_analysis_using_bert_on_standford_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Sentiment analysis of IMDB movie reviews using BERT

In [None]:
import os
import shutil
import tarfile
from bs4 import BeautifulSoup
# tensorflow imports
import tensorflow as tf
from transformers import BertTokenizer,TFBertForSequenceClassification
# language preprocessing
import re
from wordcloud import WordCloud,STOPWORDS
# scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# imports for the project
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# plotly imports
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objects as go

In [None]:
# Download the IMDB review archive into the current working folder and
# unpack it (extract=True).
# The archive is fetched over HTTPS rather than plain HTTP so the tarball
# that gets extracted cannot be tampered with in transit.
current_folder= os.getcwd()
dataset= tf.keras.utils.get_file(fname ="aclImdb.tar.gz",
        origin ="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        cache_dir=  current_folder,extract = True)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [None]:
# Folder that holds the path returned by get_file
dataset_path = os.path.dirname(dataset)
# List that folder to check the archive and the extracted dataset are present
os.listdir(dataset_path)

['aclImdb.tar.gz', 'aclImdb']

In [None]:
# Root directory of the extracted dataset (the 'aclImdb' folder)
dataset_dir = os.path.join(dataset_path, 'aclImdb')

# List the dataset directory to check its contents
os.listdir(dataset_dir)

['train', 'test', 'imdb.vocab', 'README', 'imdbEr.txt']

In [None]:
# Path of the training split; list it to see which files and sub-folders it contains
train_dir = os.path.join(dataset_dir,'train')
os.listdir(train_dir)

['urls_pos.txt',
 'urls_neg.txt',
 'neg',
 'unsup',
 'urls_unsup.txt',
 'unsupBow.feat',
 'labeledBow.feat',
 'pos']

In [None]:
# Preview every entry of the training folder:
#   - regular file -> print its first line
#   - directory    -> print its full path
for file in os.listdir(train_dir):
    file_path = os.path.join(train_dir, file)
    if not os.path.isfile(file_path):
        # Not a regular file (e.g. a class sub-folder): just show where it is
        print(f"{file}: {file_path}")
        continue
    with open(file_path, 'r', encoding='utf-8') as f:
        first_value = f.readline().strip()
    print(f"{file}: {first_value}")

urls_pos.txt: http://www.imdb.com/title/tt0453418/usercomments
urls_neg.txt: http://www.imdb.com/title/tt0064354/usercomments
neg: /content/datasets/aclImdb/train/neg
unsup: /content/datasets/aclImdb/train/unsup
urls_unsup.txt: http://www.imdb.com/title/tt0018515/usercomments
unsupBow.feat: 0 0:8 1:6 3:5 4:2 5:1 7:1 8:5 9:2 10:1 11:2 13:3 16:1 17:1 18:1 19:1 22:3 24:1 26:3 28:1 30:1 31:1 35:2 36:1 39:2 40:1 41:2 46:2 47:1 48:1 52:1 63:1 67:1 68:1 74:1 81:1 83:1 87:1 104:1 105:1 112:1 117:1 131:1 151:1 155:1 170:1 198:1 225:1 226:1 288:2 291:1 320:1 331:1 342:1 364:1 374:1 384:2 385:1 407:1 437:1 441:1 465:1 468:1 470:1 519:1 595:1 615:1 650:1 692:1 851:1 937:1 940:1 1100:1 1264:1 1297:1 1317:1 1514:1 1728:1 1793:1 1948:1 2088:1 2257:1 2358:1 2584:2 2645:1 2735:1 3050:1 4297:1 5385:1 5858:1 7382:1 7767:1 7773:1 9306:1 10413:1 11881:1 15907:1 18613:1 18877:1 25479:1
labeledBow.feat: 9 0:9 1:1 2:4 3:4 4:6 5:4 6:2 7:2 8:4 10:4 12:2 26:1 27:1 28:1 29:2 32:1 41:1 45:1 47:1 50:1 54:2 57:1 59:

In [None]:
#Load the Movies reviews
# and convert them into the pandas’ data frame with their respective sentiment
def load_dataset(directory):
    """Read the labelled reviews under ``directory`` into a DataFrame.

    ``directory`` is expected to contain a ``neg`` and/or a ``pos``
    sub-folder holding one UTF-8 text file per review. Every other entry
    (URL lists, feature files, other folders, ...) is ignored.

    Args:
        directory: Path of a dataset split, e.g. the train or test folder.

    Returns:
        pandas.DataFrame with two columns:
            sentence  -- full text of the review
            sentiment -- 0 for reviews from ``neg``, 1 for reviews from ``pos``
        Rows are ordered negative first, then positive, with file names
        sorted inside each class so the result does not depend on the
        arbitrary order returned by ``os.listdir``.
    """
    # One table drives both classes instead of two copy-pasted branches.
    label_by_folder = {"neg": 0, "pos": 1}
    data = {"sentence": [], "sentiment": []}

    for folder, label in label_by_folder.items():
        class_dir = os.path.join(directory, folder)
        if not os.path.isdir(class_dir):
            # The split simply has no reviews for this class.
            continue
        for text_file in sorted(os.listdir(class_dir)):
            text_path = os.path.join(class_dir, text_file)
            if not os.path.isfile(text_path):
                continue
            with open(text_path, "r", encoding="utf-8") as f:
                data["sentence"].append(f.read())
            data["sentiment"].append(label)

    return pd.DataFrame.from_dict(data)

In [None]:
# Build the training DataFrame from the labelled reviews under train_dir
df = load_dataset(train_dir)
df.head()

urls_pos.txt
urls_neg.txt
neg
unsup
urls_unsup.txt
unsupBow.feat
labeledBow.feat
pos


Unnamed: 0,sentence,sentiment
0,From it's uninspiring title to the flat acting...,0
1,I should have figured that any movie with the ...,0
2,This is a pretty bad movie. The plot is sentim...,0
3,"In the 60's, having as the background the rehe...",0
4,"This movie's heart was in the right place, no ...",0


In [None]:
# Label encoding: 0 = negative, 1 = positive
test_dir = os.path.join(dataset_dir,'test')

# Load the dataset from the test_dir
test_df = load_dataset(test_dir)
test_df.head()

urls_pos.txt
urls_neg.txt
neg
labeledBow.feat
pos


Unnamed: 0,sentence,sentiment
0,***SPOILERS!*** I sometimes wonder what makes ...,0
1,I am a huge fan of the original Assault On Pre...,0
2,"OK, before I get into this, let's go ahead and...",0
3,"I bought the DVD out of a big bin for $4.99, t...",0
4,I'm sure that any legitimate submariner would ...,0


In [None]:
# text cleaning
# prepare a function for text cleaning
def text_cleaning(text):
  """Strip markup and unwanted characters from a raw review.

  Steps, in order:
    1. Parse the text as HTML and keep only the text content.
    2. Drop anything enclosed in square brackets.
    3. Remove every character that is not a letter, a digit, whitespace,
       a comma or an apostrophe.
    4. Collapse runs of whitespace into single spaces and trim the ends.

  Args:
      text: Raw review string (may contain HTML tags such as ``<br />``).

  Returns:
      The cleaned review string.
  """
  # separator=" " keeps the words on both sides of a tag apart; without it
  # "end.<br /><br />Start" collapses to "end.Start" and, once the
  # punctuation is removed, to the single bogus token "endStart".
  plain_text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
  plain_text = re.sub(r'\[[^]]*\]', '', plain_text)
  pattern = r"[^a-zA-Z0-9\s,']"
  plain_text = re.sub(pattern, '', plain_text)
  # Tidy up the extra spaces introduced by the separator and the removals.
  return re.sub(r'\s+', ' ', plain_text).strip()


In [None]:
# Run text_cleaning over the review column of both splits
# train dataset
df["sentence"] = df["sentence"].apply(text_cleaning)
# test dataset
test_df["sentence"] = test_df["sentence"].apply(text_cleaning)


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.



In [None]:
# training data: review texts and their 0/1 sentiment labels
reviews= df['sentence']
target= df['sentiment']

# testing data: review texts and labels of the test split
test_reviews= test_df.sentence
test_targets= test_df.sentiment

In [None]:
# Split the IMDB test reviews 50/50 into a validation set and a final test
# set, stratified so both halves keep the same class balance.
# random_state pins the shuffle so the split is identical on every run.
X_val,X_test,y_val,y_test= train_test_split(test_reviews,
                                            test_targets,
                                            test_size=0.5,
                                            stratify = test_targets,
                                            random_state=42)

# **Tokenisation and encoding**

In [None]:
# Load the pretrained BERT tokenizer for the "bert-base-uncased" checkpoint;
# it is used below to tokenize and encode the reviews
bert_tokenizer= BertTokenizer.from_pretrained("bert-base-uncased",do_lower_case= True)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



In [None]:
# set the maximum length (in tokens) of a tokenized review
max_len= 60


def encode_reviews(texts, tokenizer=bert_tokenizer, max_length=max_len):
    """Tokenize and encode a pandas Series of reviews for the BERT model.

    Args:
        texts: pandas Series of review strings.
        tokenizer: Tokenizer providing ``batch_encode_plus``.
        max_length: Reviews longer than this many tokens are truncated.

    Returns:
        The tokenizer's batch encoding (TensorFlow tensors) with the
        'input_ids', 'token_type_ids' and 'attention_mask' entries that the
        training and prediction cells read.
    """
    return tokenizer.batch_encode_plus(texts.tolist(),
                                       padding= True,
                                       truncation= True,
                                       max_length= max_length,
                                       return_tensors= "tf")


# NOTE: X_val and X_test are rebound here from Series of raw text to their
# encodings, so re-run the train/validation split cell before re-running
# this cell.
X_train= encode_reviews(reviews)
X_val= encode_reviews(X_val)
X_test= encode_reviews(X_test)

In [None]:
# Initialize the model: pretrained 'bert-base-uncased' weights with a
# 2-label sequence-classification head
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Show the model's layers and parameter counts
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
# The classification head outputs raw logits, hence from_logits=True
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# train the model
# The inputs are passed as dicts keyed by argument name. A positional list is
# interpreted by the Hugging Face TF models in the order
# (input_ids, attention_mask, token_type_ids), so the previous
# [input_ids, token_type_ids, attention_mask] list silently swapped the
# attention mask and the segment ids.
model_history= model.fit(
    {'input_ids': X_train['input_ids'],
     'token_type_ids': X_train['token_type_ids'],
     'attention_mask': X_train['attention_mask']},
    target,
    validation_data=(
      {'input_ids': X_val['input_ids'],
       'token_type_ids': X_val['token_type_ids'],
       'attention_mask': X_val['attention_mask']},
      y_val),
    batch_size=32,
    epochs=1)

  1/782 [..............................] - ETA: 19:51:08 - loss: 0.7863 - accuracy: 0.4375

KeyboardInterrupt: 

In [None]:
# make predictions on the testing set
# X_test holds the encoded test reviews (it is rebound to the tokenizer
# output in the encoding cell). The inputs are passed by name so the
# attention mask and the segment ids cannot be mixed up.
predictions= model.predict({'input_ids': X_test['input_ids'],
                            'token_type_ids': X_test['token_type_ids'],
                            'attention_mask': X_test['attention_mask']})

# the raw class scores are exposed on the `logits` attribute of the output
logits= predictions.logits

# index of the highest logit in each row = predicted label (0 or 1)
prediction_labels= tf.argmax(logits, axis=1).numpy()

#



In [None]:
def review_sentiment(review,tokenizer= bert_tokenizer,model= model):
  """Predict the sentiment of one review or of several reviews.

  Args:
      review: A single review string, or an iterable of review strings.
      tokenizer: Tokenizer providing ``batch_encode_plus`` (defaults to the
          notebook's ``bert_tokenizer``).
      model: Classifier whose ``predict`` output exposes ``logits``
          (defaults to the notebook's ``model``).

  Returns:
      list[str]: One label per review, 'positive' or 'Negative'.
  """
  # A bare string must be wrapped, not passed to list(): list("abc") yields
  # the characters ['a', 'b', 'c'], which produced one label per character.
  reviews = [review] if isinstance(review, str) else list(review)

  # tokenize the reviews (at most 128 tokens each)
  encoding = tokenizer.batch_encode_plus(reviews,
                                         padding= True,
                                         truncation= True,
                                         max_length= 128,
                                         return_tensors= "tf")

  # Pass the inputs by name: a positional list is read by the model as
  # (input_ids, attention_mask, token_type_ids).
  pred= model.predict({'input_ids': encoding['input_ids'],
                       'token_type_ids': encoding['token_type_ids'],
                       'attention_mask': encoding['attention_mask']})

  # map class indices to readable labels
  label = {1: 'positive',0: 'Negative'}

  # index of the highest logit in each row = predicted class
  lab= tf.argmax(pred.logits, axis= 1)
  labels= [label[i] for i in lab.numpy().tolist()]

  return labels


In [None]:
# Try review_sentiment on a hand-written movie review
Review ='''Bahubali is a blockbuster Indian movie that was released in 2015.
It is the first part of a two-part epic saga that tells the story of a legendary hero who fights for his kingdom and his love.
The movie has received rave reviews from critics and audiences alike for its stunning visuals,
spectacular action scenes, and captivating storyline.'''
review_sentiment(Review)



['Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'positive',
 'positive',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'positive',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'positive',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'positive',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'positive',
 'Negative',
 'positive',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative',

In [None]:
import tensorflow as tf

def Get_sentiment(Review, Tokenizer=bert_tokenizer, Model=model):
    """Predict the sentiment label of one review or of a list of reviews.

    Args:
        Review: A single review string, or a list of review strings.
        Tokenizer: Tokenizer providing ``batch_encode_plus`` (defaults to
            the notebook's ``bert_tokenizer``).
        Model: Classifier whose ``predict`` output exposes ``logits``
            (defaults to the notebook's ``model``).

    Returns:
        list[str]: One label per review, 'positive' or 'Negative'.
    """
    # Convert Review to a list if it's not already a list
    if not isinstance(Review, list):
        Review = [Review]

    # Tokenize with the tokenizer that was passed in (the argument used to be
    # ignored in favour of the global one), padding and truncating as needed
    encoding = Tokenizer.batch_encode_plus(
        Review,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='tf'
    )

    # Pass the inputs by name: a positional list is read by the model as
    # (input_ids, attention_mask, token_type_ids), so the former
    # [input_ids, token_type_ids, attention_mask] list swapped the last two.
    prediction = Model.predict({
        'input_ids': encoding['input_ids'],
        'token_type_ids': encoding['token_type_ids'],
        'attention_mask': encoding['attention_mask'],
    })
    label = {1: 'positive',0: 'Negative'}

    # Index of the highest logit in each row = predicted class
    class_ids = tf.argmax(prediction.logits, axis=1)

    # Convert the class indices to a list of sentiment labels
    return [label[i] for i in class_ids.numpy().tolist()]


In [None]:
# Try Get_sentiment on the same hand-written movie review
Review ='''Bahubali is a blockbuster Indian movie that was released in 2015.
It is the first part of a two-part epic saga that tells the story of a legendary hero who fights for his kingdom and his love.
The movie has received rave reviews from critics and audiences alike for its stunning visuals,
spectacular action scenes, and captivating storyline.'''
Get_sentiment(Review)



['positive']