In [39]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
import nltk
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

In [40]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [41]:
df = pd.read_csv("./Reviews.csv")
df['Text'] = df['Text'].str.replace('[^\w\s]', '').str.lower()
# df['Text'] = df['Text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stop_words]))

In [42]:
def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

In [43]:
import numpy as np
import matplotlib.pyplot as plt
import nltk

In [44]:
# df['Score'].value_counts().sort_index().plot(kind='bar', title='Fine Food Reviews', figsize=(10,5)).set_xlabel('Reviews')

In [45]:
df["Label"] = df["Score"].apply(lambda score: "positive" if score >=3 else "negative")
df["Label"] = df["Label"].map({"positive":1, "negative":0})

In [46]:
df = df[["Text","Label"]]

In [47]:

df_label_0 = df[df['Label'] == 0].head(10000)
df_label_1 = df[df['Label'] == 1].head(10000)
df = pd.concat([df_label_0, df_label_1])
df['Text'] = df['Text'].apply(remove_stopwords)

In [48]:
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
import numpy as np
import nltk
import re
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 1 to 11783
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    20000 non-null  object
 1   Label   20000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 468.8+ KB


In [50]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [51]:
df

Unnamed: 0,Text,Label
1,product arrived labeled jumbo salted peanuts ....,0
3,looking secret ingredient robitussin believe f...,0
12,cats happily eating felidae platinum two years...,0
16,love eating good watching tv looking movies ! ...,0
26,"candy red , flavor . plan chewy . would never buy",0
...,...,...
11777,fuzzy prefers salmon primavera yellowfin tuna ...,1
11778,one products spoiled brats really appreciate ....,1
11780,good product . price amazon excellent really l...,1
11782,bite bullet order expensive fancy feast 3 cats...,1


In [52]:
count_label_0 = len(df[df['Label'] == 0])
count_label_1 = len(df[df['Label'] == 1])
print(count_label_0, count_label_1)

9071 9530


In [53]:
reviews = df["Text"].values.tolist()
labels = df["Label"].tolist()

In [54]:
from sklearn.model_selection import train_test_split

In [55]:
reviews_train, reviews_test, labels_train, labels_test = train_test_split(reviews, labels, test_size=0.2, random_state=42)

In [56]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
# tokenizer([reviews_train[2]], truncation=True, padding=True, max_length=128)

In [58]:
train_encodings = tokenizer(reviews_train, truncation=True, padding=True)
test_encodings = tokenizer(reviews_test, truncation=True, padding=True)

In [59]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    labels_train
)).shuffle(10000).batch(4)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    labels_test
)).batch(4)

In [60]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [61]:
model.fit(train_dataset, epochs=3, validation_data=test_dataset)

In [62]:
input_text = "This food is so delicious. I love it"
input_encoding = tokenizer(input_text, truncation=True, padding=True, return_tensors="tf")
predictions = model.predict(input_encoding)
predicted_label = tf.argmax(predictions.logits, axis=1).numpy()[0]
if predicted_label == 0:
    print("Negative")
else:
    print("Positive")