<a href="https://colab.research.google.com/github/diyaaa19/sentiment-analysis/blob/main/Sentiment_Analysis_using_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the IMDB 50k movie-review dataset through kagglehub and keep the
# path it returns.
dataset_handle = 'lakshmi25npathi/imdb-dataset-of-50k-movie-reviews'
imdb_dataset_of_50k_movie_reviews_path = kagglehub.dataset_download(dataset_handle)

print('Data source import complete.')


Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.
Data source import complete.


## Load Dataset

In [3]:
# Import libraries
import pandas as pd
import numpy as np

In [4]:
# import data
import os

# Build the CSV path from the directory kagglehub reported for the download
# instead of a hard-coded /kaggle/input mount, so the cell also works where
# that mount point does not exist.
csv_path = os.path.join(imdb_dataset_of_50k_movie_reviews_path, 'IMDB Dataset.csv')
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
# Pull the two columns apart: labels as a plain Python list, review texts as
# the original pandas Series.
sentiments = data['sentiment'].tolist()
reviews = data['review']

## Clean Text Data

In [7]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Shared text-cleaning helpers, created once at module level.
CLEANR = re.compile('<.*?>')  # non-greedy: anything between '<' and the next '>'
ps = PorterStemmer()  # NOTE(review): not referenced anywhere else in the visible notebook
lemmatizer = WordNetLemmatizer()

def clean(review):
    """Normalize one raw review string.

    HTML-like tags are replaced by a space, every character that is neither
    an ASCII letter nor whitespace is dropped, the text is lower-cased and
    split on whitespace, and each token is passed through the module-level
    ``lemmatizer``.

    Args:
        review: Raw review text.

    Returns:
        The cleaned tokens joined with single spaces.
    """
    # Substitute a space rather than '' so words separated only by a tag
    # ("production. <br /><br />The") do not fuse into one token.
    review = CLEANR.sub(' ', review)
    # Keep every whitespace character (not just ' ') so that newlines and
    # tabs still act as word separators for the split() below.
    review = re.sub(r'[^a-zA-Z\s]', '', review)
    tokens = review.lower().split()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

In [None]:
import nltk

# Fetch the WordNet corpus before any review is passed through clean().
nltk.download('wordnet')

# Clean every review, then preview the first ten results.
reviews = reviews.apply(clean)
reviews.head(10)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# One-hot encode the sentiment labels and keep only the 'positive' indicator
# column as the binary target.
sentiment_dummies = pd.get_dummies(sentiments)
y = sentiment_dummies['positive']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples; the fixed random_state keeps the split repeatable.
review_texts = list(reviews)
x_train, x_test, y_train, y_test = train_test_split(
    review_texts,
    y,
    test_size=0.2,
    random_state=0,
)

## Set Up Transformers

In [None]:
!pip install -q transformers

In [None]:
import transformers
# Display the installed transformers version.
transformers.__version__

In [None]:
import tensorflow as tf
# Display the installed TensorFlow version.
tf.__version__

In [None]:
# Tokenization
from transformers import AutoTokenizer

# Load the tokenizer published under the DistilBERT checkpoint name.
checkpoint_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

In [None]:
# Tokenize the training and held-out texts with identical options
# (truncation and padding both enabled).
encode_kwargs = dict(truncation=True, padding=True)

train_encodings = tokenizer(x_train, **encode_kwargs)
val_encodings = tokenizer(x_test, **encode_kwargs)

In [None]:
# convert to tf.data.Dataset
# Each dataset pairs the tokenizer output (as a plain dict) with the labels
# cast to int32.
train_features = dict(train_encodings)
train_labels = tf.cast(y_train, tf.int32)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

val_features = dict(val_encodings)
val_labels = tf.cast(y_test, tf.int32)
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, val_labels))

## Load pre-trained model

In [None]:
# Load Model
from transformers import TFDistilBertForSequenceClassification

# Pre-trained DistilBERT with a two-label classification head.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
    force_download=True,    # re-fetch the checkpoint instead of using a cached copy
    use_safetensors=False,
)

## Train model

In [None]:
# Adam optimizer plus sparse categorical cross-entropy on raw logits,
# tracking accuracy.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# start training model
train_batches = train_dataset.shuffle(100).batch(16)
val_batches = val_dataset.shuffle(100).batch(16)
model.fit(train_batches, epochs=2, validation_data=val_batches)

In [None]:
# train for 2 more epochs (continues from the weights already in `model`)
extra_train_batches = train_dataset.shuffle(100).batch(16)
extra_val_batches = val_dataset.shuffle(100).batch(16)
model.fit(extra_train_batches, epochs=2, validation_data=extra_val_batches)

## Save model

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory so the
# saved folder can be reloaded on its own, without depending on the in-memory
# `tokenizer` from this session.
save_dir = "./sentiment_custom_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir);  # trailing ';' hides the returned file list

## Use saved model for prediction

In [None]:
# Reload the classifier from the directory written by model.save_pretrained.
loaded_model = TFDistilBertForSequenceClassification.from_pretrained("./sentiment_custom_model")

In [None]:
# Take one review from the held-out split (position 4) and display it.
test_sentence = x_test[4]
test_sentence

In [None]:
# Encode the single review as TensorFlow tensors, then run it through the
# reloaded model and keep the first element of the prediction output.
encode_options = dict(truncation=True, padding=True, return_tensors="tf")
predict_input = tokenizer.encode(test_sentence, **encode_options)

prediction_output = loaded_model.predict(predict_input)
tf_output = prediction_output[0]

In [None]:
# Apply softmax along axis 1 and take the first row as a NumPy array.
class_probabilities = tf.nn.softmax(tf_output, axis=1)
tf_prediction = class_probabilities.numpy()[0]
tf_prediction

Positive Review