In [1]:
!pip install -U transformers

In [2]:
!pip install -U tensorflow_datasets

In [3]:
import tensorflow as tf

In [4]:
# Count the GPUs TensorFlow can see and stop early, with an explanatory message,
# when there is none: the rest of the notebook fine-tunes a transformer model.
gpus_available = len(tf.config.list_physical_devices('GPU'))
print('GPU Available: ', gpus_available)
assert gpus_available > 0, 'No GPU detected by TensorFlow; enable a GPU runtime before continuing.'

In [5]:
from transformers import DistilBertTokenizerFast # DistilBERT
from transformers import TFDistilBertForSequenceClassification
import pandas as pd
import numpy as np

### NLP Based Tools


In [6]:
# NLTK text-preprocessing helpers plus a download of the stop-word corpus.
# NOTE(review): `re`, `stopwords` and `PorterStemmer` are not referenced by any
# of the visible cells that follow — confirm whether this cell is still needed.
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## Amazon Review Dataset

In [7]:
import tensorflow_datasets as tfds

# Load the Mobile Electronics subset of the Amazon US reviews corpus.
# shuffle_files=False keeps the row order stable between runs; a later cell keeps
# rows by position, so a shuffled read would select a different subset each time.
ds = tfds.load('amazon_us_reviews/Mobile_Electronics_v1_00', split='train', shuffle_files=False)

assert isinstance(ds, tf.data.Dataset)  # Check that the dataset is wrapped in a tf.data.Dataset
print(ds)

In [8]:
# Convert the tf.data.Dataset into a pandas DataFrame and preview the first rows.
df = tfds.as_dataframe(ds)
df.head()

In [9]:
# Binary sentiment label: a star rating of 3 or more becomes 1 (positive),
# anything lower becomes 0 (negative).
is_positive = df["data/star_rating"] >= 3
df["Sentiment"] = is_positive.astype("int64")

In [10]:
# Decode the review bodies as UTF-8 into a text column
# (presumably they arrive from TFDS as raw bytes — confirm).
review_body = df['data/review_body']
df['short_review'] = review_body.str.decode('utf-8')

In [11]:
# Keep only the text and label columns. The explicit copy makes the result an
# independent frame, so the later in-place row drop does not operate on a slice
# of the original DataFrame (which raises SettingWithCopyWarning).
df = df[['short_review', 'Sentiment']].copy()

In [12]:
# Preview the first rows of the frame.
df.head()

In [13]:
# Drop the last N rows to shrink the dataset.
# A positional slice builds a new frame instead of mutating `df` in place, which
# avoids SettingWithCopyWarning on a frame derived from a column selection.
# max(..., 0) yields an empty frame when N exceeds the row count, as the drop did.
# NOTE: re-running this cell trims another N rows; restart from the load cell first.
N = 54975
df = df.iloc[:max(len(df) - N, 0)]

In [14]:
# Report how many rows the frame currently holds.
index = df.index
no_of_rows = index.size
print(no_of_rows)

In [15]:
# Preview the last rows of the frame.
df.tail()

In [16]:
# Pull the review texts and their 0/1 sentiment labels out as plain Python lists.
reviews = df['short_review'].to_list()
labels = df['Sentiment'].to_numpy().tolist()

In [17]:
# Peek at the first two review texts.
print(reviews[:2])

In [18]:
# Peek at the first two sentiment labels.
print(labels[:2])

In [19]:
# Split into training (80%) and validation (20%) sets.
# A fixed random_state makes the shuffle, and therefore the split, reproducible
# across kernel restarts.
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    reviews, labels, test_size=0.2, random_state=42
)

### Tokenizer
We use `DistilBertTokenizerFast`, loaded from the pretrained `distilbert-base-uncased` checkpoint.

In [20]:
# Load the fast DistilBERT tokenizer for the uncased base checkpoint.
pretrained_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_name)

In [21]:
# Tokenize one training review to inspect what the encoder input looks like.
first_review = train_texts[0]
tokenizer([first_review], max_length=128, padding=True, truncation=True)

### Prepare Encodings

In [22]:
# Tokenize both splits with the same 128-token cap used in the single-example
# preview. Without an explicit max_length, truncation falls back to the tokenizer's
# model maximum and padding=True pads every review to the longest one in the split,
# which inflates memory use and training time.
MAX_LENGTH = 128
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=MAX_LENGTH)

In [23]:
# Wrap the encodings and their labels into tf.data pipelines, one element per review.
train_features = dict(train_encodings)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

val_features = dict(val_encodings)
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, val_labels))

### Model
A pretrained `DistilBERT` model is fine-tuned for this task. From the `transformers` library we use `TFDistilBertForSequenceClassification`.

In [24]:
# Load pretrained DistilBERT with a sequence-classification head for two labels.
checkpoint = 'distilbert-base-uncased'
model = TFDistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [25]:
# Fine-tune the classifier.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)

# The labels are integer class ids and the classification head returns logits, so
# sparse categorical cross-entropy on logits is the matching Keras loss. It replaces
# `model.compute_loss`, which depends on the installed Transformers version.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# The datasets are batched here; `batch_size` must not also be passed to fit()
# when the input is a tf.data.Dataset. The validation set is not shuffled because
# element order has no effect on the evaluation metrics.
BATCH_SIZE = 16
model.fit(
    train_dataset.shuffle(100).batch(BATCH_SIZE),
    epochs=2,
    validation_data=val_dataset.batch(BATCH_SIZE),
)

In [26]:
# Persist the fine-tuned model to a local directory.
save_dir = './sentiment'
model.save_pretrained(save_dir)

In [27]:
# Reload the classifier from the local './sentiment' directory.
model_dir = './sentiment'
loaded_model = TFDistilBertForSequenceClassification.from_pretrained(model_dir)

In [29]:
# Classify one hand-written sentence with the reloaded model.
test_sentence = "This is a really good product. I really love it."

predict_input = tokenizer.encode(test_sentence, truncation=True, padding=True, return_tensors="tf")
tf_output = loaded_model.predict(predict_input)[0]

# Turn the logits into class probabilities and pick the most likely class.
tf_prediction = tf.nn.softmax(tf_output, axis=1)
# Index 0/1 follows the Sentiment encoding (0 = negative, 1 = positive).
# Kept under its own name so the training `labels` list is not overwritten,
# which would break re-running the earlier split cell.
class_names = ['Negative', 'Positive']
label = tf.argmax(tf_prediction, axis=1)
label = label.numpy()
print(class_names[label[0]])

In [None]:
,