<a href="https://colab.research.google.com/github/cwgough/ml-practice/blob/main/DNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import gzip
import json

In [2]:
# load and filter data

def load_data(file_name, head = None):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)

            # break if reaches the 100th line
            if (head is not None) and (count > head):
                break
    return data

In [3]:
PATH = '/content/drive/MyDrive/goodreads_reviews_spoiler_raw.json.gz'

data = load_data(PATH)
df = pd.DataFrame(data)
df.drop(['user_id', 'book_id', 'review_id', 'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments'], axis=1, inplace=True)

for i in range(6):
  print(f"Number of {i} star reviews: {len(df[df['rating']==i])}")

Number of 0 star reviews: 47052
Number of 1 star reviews: 44752
Number of 2 star reviews: 112226
Number of 3 star reviews: 290430
Number of 4 star reviews: 480199
Number of 5 star reviews: 403374


In [4]:
# remove reviews with values 0 (invalid) or 3 (neutral)
df.drop(df[(df['rating']==3) | (df['rating']==0)].index, inplace=True)
df.head(10)

Unnamed: 0,rating,review_text
0,5,This is a special book. It started slow for ab...
4,4,"I really enjoyed this book, and there is a lot..."
5,4,A beautiful story. It is rare to encounter a b...
6,5,5 stars for giving me a better framework for h...
8,4,Another hard to put down nonfiction book from ...
9,5,I love Stephenson - and this was another hit -...
10,5,A beautiful story. Neil Gaiman is truly a uniq...
11,5,I couldn't put this book down. It was well wri...
12,4,"What a fun series. I loved Wool, and Dust and ..."
14,4,"A classic dystopian novel published in 1985, b..."


In [5]:
# split into negative and positive reviews
neg_reviews = df[(df['rating']==1) | (df['rating']==2)]
pos_reviews = df[(df['rating']==4) | (df['rating']==5)]

# shuffle reviews
neg_reviews = neg_reviews.sample(frac=1).reset_index(drop=True)
pos_reviews = pos_reviews.sample(frac=1).reset_index(drop=True)

# decrease size of dataset (to match technical interview conditions)
neg_reviews = neg_reviews.head(1500)
pos_reviews = pos_reviews.head(1500)

# recombine and shuffle
new_df = pd.concat([neg_reviews, pos_reviews])
new_df = new_df.sample(frac=1).reset_index(drop=True)

In [6]:
def binarize_rating(rating):
  return 1 if rating > 3 else 0

In [7]:
new_df['rating'] = new_df['rating'].apply(binarize_rating)
new_df.head(10)

Unnamed: 0,rating,review_text
0,1,"Just an okay read for me. Didn't hate, but did..."
1,0,** spoiler alert ** \n Talk about a Debbie Dow...
2,0,** spoiler alert ** \n Hi. I'm going to warn y...
3,1,Loved it! Loved loved loved it. The writing is...
4,1,This very well could become a classic in the v...
5,1,Pucked is the most perfect mix of sexy and fun...
6,0,The movie is way better than the book!
7,1,"""if you want a picture of the future, imagine ..."
8,0,I honestly thought I was going to enjoy this o...
9,0,The book was easy to read and quick to get int...


In [8]:
"""BEGIN TECHNICAL INTERVIEW"""
# Task one: preprocessing

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [53]:
stop_words = set(stopwords.words('english'))
wnl = WordNetLemmatizer()

def preprocess_text(text):
  text = word_tokenize(text)
  text = [word.lower() for word in text if word not in stop_words]  # lowercase and remove stopwords
  text = [word for word in text if len(word) > 2]  # remove short words
  text = [re.sub('[^a-zA-Z+]', '', word) for word in text]  # remove non-letter chars except whitespaces
  text = [wnl.lemmatize(word) for word in text]
  # truncate text length?
  text = ' '.join(text)
  return text

In [54]:
new_df['review_text'] = new_df['review_text'].apply(preprocess_text)
new_df.head()

Unnamed: 0,rating,review_text
0,1,okay read hate fall love character particularl...
1,0,spoiler alert talk debbie downer kind book sur...
2,0,spoiler alert going warn right going pleasant ...
3,1,loved loved loved loved writing fabulous loved
4,1,well could become classic vein douglas adam hi...


In [55]:
# Separate data
import sklearn
from sklearn.model_selection import train_test_split

x_train, x_valid_test, y_train, y_valid_test = train_test_split(new_df['review_text'], new_df['rating'], test_size=0.3, random_state=42)
x_valid, x_test, y_valid, y_test = train_test_split(x_valid_test, y_valid_test, test_size=0.611, shuffle=False)
print(x_train.shape, y_train.shape, x_valid.shape, y_valid.shape, x_test.shape, y_test.shape)

(2100,) (2100,) (350,) (350,) (550,) (550,)


In [56]:
# Vectorize text
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(x_train).toarray()  # create tf-idf embeddings based on training set
x_valid_tfidf = vectorizer.transform(x_valid).toarray()  # all other data is transformed using the learned embedding
x_test_tfidf = vectorizer.transform(x_test).toarray()
vocab_size = len(vectorizer.vocabulary_)
print(vocab_size, x_train_tfidf.shape)

17399 (2100, 17399)


In [57]:
# testing vectorizer worked

first_vector = x_train_tfidf[0]
sample_df = pd.DataFrame(first_vector.T, index=vectorizer.get_feature_names_out(), columns=["tfidf"])
sample_df.sort_values(by=["tfidf"],ascending=False, inplace=True)
sample_df.head(10)

Unnamed: 0,tfidf
actual,0.68194
rating,0.603843
star,0.412713
pouring,0.0
poured,0.0
pour,0.0
pound,0.0
pottermore,0.0
potterish,0.0
potterheads,0.0


In [58]:
#create Tensor Dataset
import torch
from torch.utils.data import DataLoader, TensorDataset

# convert from Series to np_array so TensorDataset can read it
y_train, y_valid, y_test = np.array(y_train), np.array(y_valid), np.array(y_test)

train_data=TensorDataset(torch.FloatTensor(x_train_tfidf), torch.LongTensor(y_train))
valid_data = TensorDataset(torch.FloatTensor(x_valid_tfidf), torch.LongTensor(y_valid))
test_data=TensorDataset(torch.FloatTensor(x_test_tfidf), torch.LongTensor(y_test))

#dataloader
batch_size=50
train_loader=DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=True)
test_loader=DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [59]:
# sample from dataloader
train_features, train_labels = next(iter(train_loader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

review = train_features[0]
label = train_labels[0]

print(f"Label: {label}\nReview: {review}")

Feature batch shape: torch.Size([50, 17399])
Labels batch shape: torch.Size([50])
Label: 1
Review: tensor([0., 0., 0.,  ..., 0., 0., 0.])


In [60]:
import torch.nn as nn

class SentimentalLSTM(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """
    def __init__(self, input_size, output_size, hidden_dim, n_layers, drop_prob=0.1):
        """
        Initialize the model by setting up the layers
        """
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        #LSTM layers
        self.lstm = nn.LSTM(input_size, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)

        #dropout layer
        self.dropout = nn.Dropout(0.3)

        #Linear and activation layers
        self.fc1 = nn.Linear(hidden_dim, 64)
        self.fc2 = nn.Linear(64, 16)
        self.fc3 = nn.Linear(16,output_size)
        # self.tanh = nn.Tanh()
        # self.sigmoid = nn.Sigmoid()
        # self.softmax = nn.Softmax(dim=1)

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size()

        #LSTM output
        lstm_out, hidden = self.lstm(x.unsqueeze(1), hidden)  # x = (N, L, H_in); note L==1

        #stack up the lstm output
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        #dropout and fully connected layers
        out = self.dropout(lstm_out)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.dropout(out)
        out = self.fc3(out)
        # out = self.tanh(out)
        # out = self.sigmoid(out)
        # out = self.softmax(out)

        return out, hidden

    def init_hidden(self, batch_size, train_on_gpu):
        """Initialize Hidden STATE"""
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())

        return hidden

In [67]:
# Instantiate the model and define hyperparameters
input_size = vocab_size
output_size = 2  # for multi-class w/ two classes
# output_size = 1
hidden_dim = 256
n_layers = 2
lr = 0.001
epochs = 3  # best performance using validation set
clip = 5 # gradient clipping- what does this do

net = SentimentalLSTM(input_size, output_size, hidden_dim, n_layers)
print(net)

SentimentalLSTM(
  (lstm): LSTM(17399, 256, num_layers=2, batch_first=True, dropout=0.1)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=256, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=2, bias=True)
)


In [68]:
# loss and optimization functions
criterion = nn.CrossEntropyLoss(reduction='mean')
# criterion = nn.SoftMarginLoss(reduction='mean')
# criterion = nn.BCELoss(reduction='mean')
optimizer = torch.optim.Adam(net.parameters(), lr=lr)  # research optimizers

# check if CUDA is available
train_on_gpu = torch.cuda.is_available()

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

for e in range(epochs):
    net.train()  # set model to training mode
    total_training_loss = 0.0

    # initialize hidden state
    h = net.init_hidden(batch_size, train_on_gpu)

    # batch loop
    for inputs, labels in train_loader:
        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history- WHY
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        optimizer.zero_grad()

        # forward pass through model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels)
        total_training_loss += loss.item()  # loss.item() is the mean across batch
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)

        optimizer.step()

    # Get validation loss- once per epoch
    val_h = net.init_hidden(batch_size, train_on_gpu)
    total_val_loss = 0.0
    net.eval()  # set model to evaluation mode
    for inputs, labels in valid_loader:
        val_h = tuple([each.data for each in val_h])

        if (train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        output, val_h = net(inputs, val_h)
        val_loss = criterion(output.squeeze(), labels)
        total_val_loss += val_loss.item()

    avg_training_loss = total_training_loss/len(train_loader)
    avg_val_loss = total_val_loss/len(valid_loader)
    print("Epoch: {}/{} | ".format(e+1, epochs),
          "Training loss: {:.6f} | ".format(avg_training_loss),
          "Val loss: {:.6f}".format(avg_val_loss))

Epoch: 1/3 |  Training loss: 0.694099 |  Val loss: 0.690748
Epoch: 2/3 |  Training loss: 0.565669 |  Val loss: 0.665804
Epoch: 3/3 |  Training loss: 0.228533 |  Val loss: 0.505431


In [69]:
total_test_loss = 0.0
num_correct = 0

h = net.init_hidden(batch_size, train_on_gpu)

net.eval()

for inputs, labels in test_loader:

    h = tuple([each.data for each in h])

    if (train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()

    output, h = net(inputs, h)

    # calculate loss
    test_loss = criterion(output.squeeze(), labels)
    total_test_loss += test_loss.item()

    # convert output probabilities to predicted class (0.0 or 1.0)
    # y_pred = torch.round(output.squeeze())  # 0/1 labels
    # y_pred = torch.sign(output.squeeze())  # -1/1 labels
    y_pred = torch.argmax(output, dim=1).squeeze()  # categorical labels

    # compare predictions to true label
    correct_tensor = y_pred.eq(labels)
    correct = correct_tensor.numpy() if not train_on_gpu else correct_tensor.cpu().numpy()
    num_correct += np.sum(correct)


# -- stats! -- ##
avg_test_loss = total_test_loss/len(test_loader)
print("Average test loss: {:.3f}".format(avg_test_loss))

# accuracy over all test data
test_acc = num_correct/len(test_data)
print("Test accuracy: {:.3f}".format(test_acc))

Average test loss: 0.516
Test accuracy: 0.796
