In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.distributed import DistributedSampler
from torchvision import transforms
import torch.utils.data
import torch.nn.functional as F
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import string
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split


KeyboardInterrupt: 

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mr.J\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mr.J\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mr.J\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

In [None]:
RANDOM_SEED  = 123
torch.manual_seed(RANDOM_SEED)

max_sequence    = 200

In [None]:
# df = pd.read_csv('dataset/IMDB Dataset.csv')
df = pd.read_csv('dataset/AMAZON_FASHION_5.csv')
df = df.dropna(subset=['reviewText'])
# df = df.sample(frac=1, random_state=1)

def remove_punctuation(text):
    if isinstance(text, float):
        print(text)
    return text.translate(str.maketrans('', '', string.punctuation)).lower()
def remove_stop_words(tokens):
    return [token for token in tokens if token not in stop_words]
def lemmatize_words(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style/Size:,style/Color:,reviewerName,reviewText,summary,unixReviewTime,style/Size Name:,style/Style:,vote,image/0,image/1,image/2
0,5,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,Big Boys,Blue/Orange,Tonya B.,Great product and price!,Five Stars,1441324800,,,,,,
1,5,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,Big Boys,Black (37467610) / Red/White,Tonya B.,Great product and price!,Five Stars,1441324800,,,,,,
2,5,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,Big Boys,Blue/Gray Logo,Tonya B.,Great product and price!,Five Stars,1441324800,,,,,,
3,5,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,Big Boys,Blue (37867638-99) / Yellow,Tonya B.,Great product and price!,Five Stars,1441324800,,,,,,
4,5,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,Big Boys,Blue/Pink,Tonya B.,Great product and price!,Five Stars,1441324800,,,,,,


In [None]:
review = df['reviewText'].apply(remove_punctuation).apply(word_tokenize).apply(remove_stop_words).apply(lemmatize_words)

In [None]:
def map_sentiment(sentiment):
    return sentiment-1

# def map_sentiment(sentiment):
#     if sentiment == 'positive':
#         return 1
#     return 0

# Apply the mapping function to the 'sentiment' column
sentiment = df['overall'].apply(map_sentiment)
# sentiment = df['sentiment'].replace({"positive":[1,0], "negative":[0,1]})

In [None]:
X = review.to_list()
y = sentiment[review.index].to_numpy()
print(len(X))
print(len(y))

3160
3160


In [None]:
all_words = [token for phrase in X  for token in phrase ]
vocab = Counter(all_words)
vocab = sorted(vocab, key=vocab.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}
encoded_review = [[vocab_to_int[token] for token in phrase] for phrase in X]



features = np.zeros((len(encoded_review), max_sequence), dtype=np.int32)
for i, e in enumerate(encoded_review):
    e_len = len(e)
    if e_len <= max_sequence:
        zeros   = list(np.zeros(max_sequence-e_len))
        new     = zeros + e
    else:
        new     = e[:max_sequence]
    features[i,:] = np.array(new)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(features, y, test_size=.2, random_state=42)

In [None]:
class imdbdataset(Dataset):
    def __init__(self, x, y, transform=None, exclude_type=None) -> None:
        super(imdbdataset, self).__init__()
        if exclude_type=="train":
            mask = np.isin(y, list(set([1,3])), invert=True)
            self.x = x[mask]
            y = y[mask]
            y = np.where(y==0, 0, y)
            y = np.where(y==2, 2, y)
            y = np.where(y==4, 3, y)
            self.y = y
        else:
            self.x = x
            self.y = y
        self.transform = transform

    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, index):
        sample = self.x[index], self.y[index]
        if self.transform:
            sample = self.transform(sample)
        return sample
    
    # def get_labels(self):   return self.y

class ToTensor:
    def __call__(self, sample):
        x, y = sample
        x = np.array(x) if not isinstance(x, np.ndarray) else x
        y = np.array(y) if not isinstance(y, np.ndarray) else y
        return torch.from_numpy(x), torch.from_numpy(y).long()

compose = transforms.Compose([
    ToTensor(),
])

train_data_set = imdbdataset(x_train, y_train, transform=compose, exclude_type='train')
test_data_set  = imdbdataset(x_test,  y_test,  transform=compose, exclude_type='train')


In [None]:
train_loader = DataLoader(train_data_set, batch_size=16, shuffle=True)
test_loader  = DataLoader(test_data_set,  batch_size=16, shuffle=False)

In [None]:
example = iter(train_loader)
feature, label = next(example)
print(features, label)

[[  0   0   0 ...   6 108 106]
 [  0   0   0 ...   6 108 106]
 [  0   0   0 ...   6 108 106]
 ...
 [  0   0   0 ...   4   3  14]
 [  0   0   0 ...  16  87  57]
 [  0   0   0 ...  35  41 155]] tensor([3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 0, 3])


In [None]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, rnn_type='simple'):
        super(RNN, self).__init__()
        self.embedding  = nn.Embedding(input_dim, embedding_dim)

        self.rnn_type = rnn_type

        if rnn_type == 'gru':
            self.rnn        = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        elif rnn_type == 'lstm':
            self.rnn        = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        else:
            self.rnn        = nn.RNN(embedding_dim, hidden_dim, batch_first=True)

        self.fc         = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, tt):
        embedded = self.embedding(tt)

        out, h = self.rnn(embedded)

        if self.rnn_type == 'lstm':
            h = h[0]

        output = self.fc(h.squeeze_(0))
        return output

learning_rate   = .001
num_epochs      = 3
embedding_dim   = 100
hidden_dim      = 128
num_classes     = 3


model = RNN(input_dim=len(vocab_to_int)+1, embedding_dim=embedding_dim, 
            hidden_dim=hidden_dim, output_dim=num_classes, rnn_type='simple').to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
def test(class_test=False):    
    test_loss   = 0.0
    total       = 0
    correct     = 0
    n_class_correct = [0 for i in range(num_classes)]
    n_class_samples = [0 for i in range(num_classes)]
    with torch.no_grad():
        for text, label in test_loader:
            text, label = text.to(device), label.to(device)
            output = model(text)
            loss = criterion(output, label)
            test_loss += loss.item() * text.size(0)

            _, predicted = torch.max(output, 1)
            total += label.size(0)
            correct += (predicted == label).sum().item()
            for i in range(len(label)):
                label_ = label[i]
                pred  = predicted[i]
                if (label_ == pred):
                    n_class_correct[label_] += 1
                n_class_samples[label_] += 1
    
    test_loss /= len(test_loader.dataset)
    accuracy = correct / total
    print(f" Test Loss: {test_loss}, Test Accuracy: {accuracy}")
    if class_test:
        for i in range(num_classes):
            acc = 100 * n_class_correct[i] / n_class_samples[i]
            print(f'Accuracy of {i}:{acc}')

In [None]:
for epoch in range(num_epochs):
    train_loss = 0.0
    for text, label in train_loader:
        text, label = text.to(device), label.to(device)

        output = model(text)

        l = criterion(output, label)

        optimizer.zero_grad()

        l.backward()

        optimizer.step()    

        train_loss += l.item()

    train_loss /= len(train_loader.dataset)
    print(f"epoch {epoch+1}/{num_epochs} loss: {train_loss}", end='')
    test()
test(class_test=True)

RuntimeError: unique_by_key: failed to synchronize: cudaErrorInvalidValue: invalid argument