In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
import nltk
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df = pd.read_csv("./Reviews.csv")
df = df.sample(n=20000, random_state=42)
df['Text'] = df['Text'].str.replace('[^\w\s]', '').str.lower()
# df['Text'] = df['Text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stop_words]))

In [4]:
def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

In [5]:
df['Text'] = df['Text'].apply(remove_stopwords)

In [6]:
import numpy as np
import matplotlib.pyplot as plt
import nltk

In [7]:
# df['Score'].value_counts().sort_index().plot(kind='bar', title='Fine Food Reviews', figsize=(10,5)).set_xlabel('Reviews')

In [8]:
df["Label"] = df["Score"].apply(lambda score: "positive" if score >=3 else "negative")
df["Label"] = df["Label"].map({"positive":1, "negative":0})

In [9]:
df[["Text","Label"]]

Unnamed: 0,Text,Label
165256,tried couple brands gluten-free sandwich cooki...,1
231465,"cat loves treats . ever ca n't find house , po...",1
427827,little less expected . tends muddy taste - exp...,1
433954,"first frosted mini-wheats , original size , fr...",0
70260,want congratulate graphic artist putting entir...,1
...,...,...
253447,bought book dr ornish well book dr esselstyn ....,1
437955,'s lot premium fruit nut stuff . mostly peanut...,1
505289,digestive problems tried anything modern medic...,1
548324,zesty brings zest popcorn . plain popcorn dull...,1


In [10]:
df["Label"] = df["Score"].apply(lambda score: "positive" if score >=3 else "negative")
df["Label"] = df["Label"].map({"positive":1, "negative":0})

In [11]:
df = df[["Text","Label"]]

In [12]:
df

Unnamed: 0,Text,Label
165256,tried couple brands gluten-free sandwich cooki...,1
231465,"cat loves treats . ever ca n't find house , po...",1
427827,little less expected . tends muddy taste - exp...,1
433954,"first frosted mini-wheats , original size , fr...",0
70260,want congratulate graphic artist putting entir...,1
...,...,...
253447,bought book dr ornish well book dr esselstyn ....,1
437955,'s lot premium fruit nut stuff . mostly peanut...,1
505289,digestive problems tried anything modern medic...,1
548324,zesty brings zest popcorn . plain popcorn dull...,1


In [13]:
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
import numpy as np
import nltk
import re
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 165256 to 393524
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    20000 non-null  object
 1   Label   20000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 468.8+ KB


In [15]:
df.isnull().sum()

Text     0
Label    0
dtype: int64

In [16]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [17]:
df

Unnamed: 0,Text,Label
165256,tried couple brands gluten-free sandwich cooki...,1
231465,"cat loves treats . ever ca n't find house , po...",1
427827,little less expected . tends muddy taste - exp...,1
433954,"first frosted mini-wheats , original size , fr...",0
70260,want congratulate graphic artist putting entir...,1
...,...,...
253447,bought book dr ornish well book dr esselstyn ....,1
437955,'s lot premium fruit nut stuff . mostly peanut...,1
505289,digestive problems tried anything modern medic...,1
548324,zesty brings zest popcorn . plain popcorn dull...,1


In [18]:
reviews = df["Text"].values.tolist()
labels = df["Label"].tolist()

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
reviews_train, reviews_test, labels_train, labels_test = train_test_split(reviews, labels, test_size=0.2, random_state=42)

In [21]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# tokenizer([reviews_train[2]], truncation=True, padding=True, max_length=128)

In [23]:
train_encodings = tokenizer(reviews_train, truncation=True, padding=True)
test_encodings = tokenizer(reviews_test, truncation=True, padding=True)

In [24]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    labels_train
)).shuffle(10000).batch(4)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    labels_test
)).batch(4)

In [25]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [26]:
model.fit(train_dataset, epochs=3, validation_data=test_dataset)

Epoch 1/3


: 

: 