<a href="https://colab.research.google.com/github/danilobml/pytorch-networks/blob/main/basic_text_classifier_neural_network_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Sentiment analysis and classification NN.

# Text pre-processing (using nltk)
# import
import nltk

# Download its libraries (here all of them)
nltk.download('all')

In [None]:
# Load the file from gihthub repo, using numpy and pandas and put it into a dataframe:
import numpy as np
import pandas as pd

dataset = pd.read_csv('https://raw.githubusercontent.com/sharmaroshan/Restaurant-Reviews-Analysis/refs/heads/master/Restaurant_Reviews.tsv', delimiter='\t', quoting=3)
dataset.head()

In [40]:
# Preprocessing task imports:

# To remove stopwords (recurring words that add no meaning) using nltk:
from nltk.corpus import stopwords
# To use stemming - derive the root form of words (ex: running and runner -> run)
from nltk.stem.porter import PorterStemmer
# Python regex to remove commas, points and other symbols:
import re

ps = PorterStemmer()

In [None]:
# To check the dataset:
dataset.info()

In [39]:
# Loop through all words, removing stopwords and using stemming,
# to create a corpus of clean text:

corpus = []

for i in range(0, 1000):
  # symbol replacing of each review
  customer_review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
  # lower case and separation of words by space:
  customer_review = customer_review.lower()
  customer_review = customer_review.split()
  # Clan review - apply stemming and remove stopwords:
  clean_review = [ps.stem(word) for word in customer_review if not word in set(stopwords.words('english'))]
  clean_review = ' '.join(clean_review)
  corpus.append(clean_review)

In [15]:
# Convert the sentences to numeric format using TFIDF vectorizer from scikitlearn:

from sklearn.feature_extraction.text import TfidfVectorizer
# Specify how many words (max_features) you want and the number of times they
# have to be repeated (min_df) to count, max_df removes words that are too frequent
# 0.6 means remove if it appears in 60% or more of all texts:
vectorizer = TfidfVectorizer(max_features=1500, min_df=3, max_df=0.6)
# Convert corpus to a numeric array:
X = vectorizer.fit_transform(corpus).toarray()

In [None]:
# check sample:
X[0]

# For relevant words, you will get a value non-zero (TF-IDF):

# TF (Term Frequency) -> repetition of the words/total words in document

# IDF (Inverse Document Frequency) -> tests how relevant the word is:

# df(t) = N(t)
# where
# df(t) = Document frequency of a term t
# N(t) = Number of documents containing the term t
# N = Total number of documents
# IDF(t) = log(N/df(t))

In [27]:
# Create a dependent variable y, which will carry the labels. For that, get
# all the rows in the second column, convert to a numpy array:

y = dataset.iloc[:, 1].values

In [31]:
# Split dataset in training (80%) and test (20%) sets, using sklearn:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [42]:
# Build the pytorch Neural Network:
# imports:

import torch
import torch.nn as nn
from torch.nn import functional as F

In [33]:
# Convert training and test data to (float) Tensors:
Xtrain_ = torch.from_numpy(X_train).float()
Xtest_ = torch.from_numpy(X_test).float()

ytrain_ = torch.from_numpy(y_train)
ytest_ = torch.from_numpy(y_test)

In [36]:
# to check tensors shape:
Xtrain_.shape, y_train.shape

(torch.Size([800, 467]), (800,))

In [38]:
# The number to the right of the total is the total of vectrized word.
# It will be the input_size of the NN

input_size = 467

# Output size is two (sentiment positive or negative)
output_size = 2

# Hidden size can be different numbers to be tried:
hidden_size = 500

In [43]:
# Define Neural Network class with two hidden layers:
class NeuralNetwork(nn.Module):
  def __init__(self):
    super(NeuralNetwork, self).__init__()
    # define layers, using Linear:
    self.fc1 = nn.Linear(input_size, hidden_size)
    self.fc2 = nn.Linear(hidden_size, hidden_size)
    self.fc3 = nn.Linear(hidden_size, output_size)

  # define the forward method -> calls the layers, passing them through ReLU
  # (rectified linear unit function -> the most commonly used activation function
  # in deep learning models. The function returns 0 if it receives any negative
  # input, but for any positive value x it returns that value back):
  def forward(self, X):
    X = torch.relu((self.fc1(X)))
    X = torch.relu((self.fc2(X)))
    X = self.fc3(X)
    # run through log_softmax -> function that computes the output and gradient:
    return F.log_softmax(X, dim=1)

# Instantiate model:
model = NeuralNetwork()

In [46]:
# Define optimizer and loss function:
import torch.optim as optim

# For optimizer, we use Adam, with model-parameters and learn rate:
optimizer = optim.Adam(model.parameters(), lr=0.01)

# For loss function NLLLoss -> negative log likelihood loss.
# It is useful to train a classification problems with an
# unbalanced training set:
loss_fn = nn.NLLLoss()

In [48]:
# Define epochs and train Neural Network:
epochs = 100

for epoch in range(epochs):
  # first call optimizer with zero_grad -> Resets the gradients of optimized tensors:
  optimizer.zero_grad()
  # define predictions:
  Ypred = model(Xtrain_)
  # calculate loss -> deviation from expected values (labels):
  loss = loss_fn(Ypred, ytrain_)
  # Backpropagation -> calculates and stores the gradients for each model parameter
  loss.backward()
  # Optimization (parameter update)
  optimizer.step()

  if epoch % 10 == 0:
    print(f"Epoch {epoch} - loss: {loss.item()}")

Epoch 0 - loss: 0.6933143734931946
Epoch 10 - loss: 0.047478772699832916
Epoch 20 - loss: 0.03230816125869751
Epoch 30 - loss: 0.029457466676831245
Epoch 40 - loss: 0.028391627594828606
Epoch 50 - loss: 0.028234241530299187
Epoch 60 - loss: 0.028121326118707657
Epoch 70 - loss: 0.028086772188544273
Epoch 80 - loss: 0.02810979075729847
Epoch 90 - loss: 0.02864963747560978


In [47]:
# Save the model for future use:
# TODO - later

In [50]:
# Use the model:

# write a text with an evaluation inside a list:
sample = ['The fish was really good!']
# Transforme it into a numeric array:
sample = vectorizer.transform(sample).toarray()
# Convert to Tensor:
sample = torch.from_numpy(sample).float()

# Apply model to predict -> if the first element in the tensor is higher than
# the first, it's positive:
sentiment = model(sample)
sentiment

tensor([[-22.1356,   0.0000]], grad_fn=<LogSoftmaxBackward0>)

In [52]:
# Or, to simplifiy reading:
if sentiment[0][1] > sentiment[0][0]:
  print('positive')
else:
  print('negative')

positive


In [53]:
# A second run, now with text not related to restaurant reviews:

sample2 = ['Trump is a bad politician.']

sample2 = vectorizer.transform(sample2).toarray()
sample2 = torch.from_numpy(sample2).float()

sentiment2 = model(sample2)

if sentiment2[0][1] > sentiment2[0][0]:
  print('positive')
else:
  print('negative')

negative
