In [2]:
import spacy
import pandas as pd
import torch
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader, Dataset
from torch import nn
from torchtext.vocab import FastText

In [5]:
# import torchtext.datasets as ds
# ds.AmazonReviewFull(root='.data', split=('train', 'test'))

100%|██████████| 644M/644M [00:15<00:00, 41.7MB/s]


(<torchtext.data.datasets_utils._RawTextIterableDataset at 0x216d682b0d0>,
 <torchtext.data.datasets_utils._RawTextIterableDataset at 0x216d7a182e0>)

### 1. Preprocessing

In [14]:
# df = pd.read_csv("train.csv", header=None)
# df.shape

(3000000, 3)

In [3]:
# Read the review CSV. header=None makes pandas treat the first row as data,
# so the columns are labelled 0, 1, 2 (they are renamed in the next cell).
# NOTE: this single file is later split into train/val/test partitions.
df = pd.read_csv('test.csv', header=None)
df.shape

(650000, 3)

In [4]:
# Give the three positional columns readable names.
df.rename(columns={0: "star", 1: "rating1", 2: "rating2"}, inplace=True)

# Merge both text columns into one "review" string, then discard the originals.
df["review"] = df["rating1"] + " " + df["rating2"]
df.drop(["rating1", "rating2"], axis=1, inplace=True)

# Shift every label down by one (presumably 1-5 stars -> class ids 0-4, which
# would match the 5-way output layer of the classifier -- confirm against the data).
df["star"] = df["star"].map(lambda value: int(value) - 1)

In [5]:
# Drop every row that has a missing value in any column. A NaN in either text
# column propagates into the concatenated "review", so those rows go too.
df = df.dropna()

In [6]:
# spaCy pipeline used by preprocessing() for lemmas and the punctuation /
# stop-word flags.
nlp = spacy.load("en_core_web_sm")
# FastText "simple" vectors.
# NOTE(review): LoadData loads its own FastText("simple") instance, so this
# module-level object looks unused in the cells visible here -- confirm.
fasttext = FastText("simple")

In [7]:
def preprocessing(sentence):
    """Run ``sentence`` through the spaCy pipeline and return the kept lemmas.

    Punctuation tokens and stop words are skipped; every other token
    contributes its lemma, in document order.
    """
    lemmas = []
    for tok in nlp(sentence):
        if tok.is_punct or tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

def token_encoder(token, vec):
    """Return the vocabulary index of ``token``.

    Args:
        token: A token string, or the literal ``"<pad>"`` marker.
        vec: Object exposing a ``stoi`` mapping from token to integer index.

    Returns:
        ``1`` for ``"<pad>"``, ``vec.stoi[token]`` for a known token, and
        ``0`` for a token that is missing from ``vec.stoi``.
    """
    if token == "<pad>":
        return 1
    try:
        return vec.stoi[token]
    except KeyError:
        # Only an out-of-vocabulary token maps to 0. Any other failure
        # (e.g. a wrong ``vec`` object) is a real bug and must surface.
        return 0

def encoder(tokens, vec):
    """Translate each token in ``tokens`` to its index via ``token_encoder``.

    Returns a new list with one index per token, in the same order.
    """
    indexes = []
    for tok in tokens:
        indexes.append(token_encoder(tok, vec))
    return indexes

def padding(list_of_indexes, max_seq_len, padding_index=1):
    """Force ``list_of_indexes`` to exactly ``max_seq_len`` entries.

    Longer inputs are cut at ``max_seq_len``; shorter ones are right-filled
    with ``padding_index``. A new list is returned; the input is untouched.
    """
    clipped = list_of_indexes[:max_seq_len]
    missing = max_seq_len - len(clipped)
    # A non-positive ``missing`` multiplies out to an empty filler list.
    return clipped + [padding_index] * missing

In [8]:
# test_df = pd.read_csv('test.csv', header=None)

# First cut: 80% of the rows go to train+validation, the remainder is the test set.
train_val_df = df.sample(frac=0.8, random_state=0)
test_df = df.drop(train_val_df.index)

# Second cut: 80% of train+validation is used for training, the rest for validation.
train_df = train_val_df.sample(frac=0.8, random_state=0)
val_df = train_val_df.drop(train_df.index)

tuple(part.shape for part in (train_df, val_df, test_df))

((415992, 2), (103998, 2), (129998, 2))

### 2. Dataloader

In [9]:
class LoadData(Dataset):
    """Dataset of fixed-length token-index sequences with their labels.

    Args:
        df: DataFrame exposing a ``star`` column (labels) and a ``review``
            column (raw text).
        max_seq_len: Every encoded review is truncated or padded to this
            many indexes.

    Each item is ``(sequence, label)`` where ``sequence`` is a list of
    ``max_seq_len`` vocabulary indexes and ``label`` is an ``int``.
    """

    def __init__(self, df, max_seq_len=32):
        self.max_seq_len = max_seq_len
        self.vec = FastText("simple")
        # Index 1 is the padding index and index 0 the unknown-token index
        # (see token_encoder / padding), so give those rows fixed vectors.
        # NOTE(review): this also replaces the vectors of whatever real
        # vocabulary entries sit at indexes 0 and 1 -- confirm acceptable.
        self.vec.vectors[1] = -torch.ones(self.vec.vectors[1].shape[0])
        self.vec.vectors[0] = torch.zeros(self.vec.vectors[0].shape[0])
        self.vectorizer = lambda x: self.vec.vectors[x]
        # The frame usually arrives shuffled/filtered, so its index is not
        # 0..n-1. Reset it so labels line up positionally with sequences.
        self.labels = df.star.reset_index(drop=True)
        self.sequences = [
            padding(encoder(preprocessing(review), self.vec), max_seq_len)
            for review in df.review.tolist()
        ]

    def __len__(self):
        """Number of encoded reviews."""
        return len(self.sequences)

    def __getitem__(self, i):
        """Return the ``i``-th ``(sequence, label)`` pair by position."""
        assert len(self.sequences[i]) == self.max_seq_len
        # .iloc is positional; plain [] on a Series would look up by index label.
        return self.sequences[i], int(self.labels.iloc[i])

In [10]:
# Encode each partition once, in train -> val -> test order, with the same sequence length.
train_set, val_set, test_set = (
    LoadData(frame, max_seq_len=32) for frame in (train_df, val_df, test_df)
)

In [None]:
def collate(batch, vectorizer):
    """Turn a list of ``(indexes, label)`` samples into batch tensors.

    Every index of every sample is passed through ``vectorizer``; the
    resulting vectors are stacked per sample and the samples are stacked
    into one tensor. Labels are gathered into a ``LongTensor``.

    Returns:
        ``(inputs, target)``.
    """
    embedded = []
    for sample in batch:
        token_vectors = [vectorizer(index) for index in sample[0]]
        embedded.append(torch.stack(token_vectors))
    inputs = torch.stack(embedded)

    labels = [sample[1] for sample in batch]
    target = torch.LongTensor(labels)
    return inputs, target

# def collate2(batch, vectorizer=test_set.vectorizer):
#     inputs = torch.stack([torch.stack([vectorizer(token) for token in sentence[0]]) for sentence in batch])
#     target = torch.LongTensor([item[1] for item in batch])
#     return inputs, target

In [2]:
batch_size = 16
# Kept for any later cell that may read it; it is NOT a collate argument.
batch = batch_size * 32

# DataLoader calls collate_fn(list_of_samples) once per batch, so it must be
# given a callable. Each lambda binds the vectorizer of its own dataset.
trainloader = DataLoader(
    train_set,
    batch_size=batch_size,
    collate_fn=lambda samples: collate(samples, train_set.vectorizer),
)
validloader = DataLoader(
    val_set,
    batch_size=batch_size,
    collate_fn=lambda samples: collate(samples, val_set.vectorizer),
)
# Wrap the encoded test_set (not the raw test_df DataFrame), so the loader
# yields (indexes, label) samples like the other two loaders.
testloader = DataLoader(
    test_set,
    batch_size=batch_size,
    collate_fn=lambda samples: collate(samples, test_set.vectorizer),
)

### 3. Model

In [None]:
emb_dim = 300


class Classifier(nn.Module):
    """Three-layer feed-forward classifier over a flattened embedded sequence.

    Args:
        max_seq_len: Number of tokens per input sequence.
        emb_dim: Size of each token vector.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.

    ``forward`` returns log-probabilities over 5 classes (``LogSoftmax``),
    one row per batch element.
    """

    def __init__(self, max_seq_len, emb_dim, hidden1=16, hidden2=16):
        super().__init__()
        self.fc1 = nn.Linear(max_seq_len*emb_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 5)
        self.out = nn.LogSoftmax(dim=1)

    def forward(self, inputs):
        """Map a batch of embedded sequences to class log-probabilities.

        ``inputs`` may be ``(batch, max_seq_len, emb_dim)``,
        ``(batch, 1, max_seq_len*emb_dim)`` or already
        ``(batch, max_seq_len*emb_dim)``.
        """
        # fc1 consumes max_seq_len*emb_dim features, so collapse everything
        # after the batch dimension; squeeze(1) alone leaves (B, L, D) as is.
        flat = inputs.flatten(start_dim=1).float()
        x = F.relu(self.fc1(flat))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return self.out(x)

In [None]:
MAX_SEQ_LEN = 32
# Reuse emb_dim so the model width follows the embedding size defined with the class.
model = Classifier(MAX_SEQ_LEN, emb_dim, 16, 16)
# Classifier.forward already ends in LogSoftmax, i.e. it returns
# log-probabilities. NLLLoss is the criterion that consumes those directly;
# CrossEntropyLoss would apply log-softmax a second time.
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

### 4. Training Loop

In [None]:
epochs = 5


def validation(model, validloader, criterion):
    """Return the mean per-batch loss of ``model`` over ``validloader``.

    Switches the model to eval mode and disables gradient tracking. The
    caller is responsible for switching back to train mode afterwards.
    """
    model.eval()
    loss_total = 0.0
    with torch.no_grad():
        for inputs, labels in validloader:
            output = model(inputs)
            loss_total += criterion(output, labels).item()
    return loss_total / len(validloader)


# Early-stopping state: stop once the validation loss has risen for
# ``patience`` consecutive epochs (each epoch is compared with the previous one).
the_last_loss = float("inf")  # the first epoch can never count as "worse"
patience = 2
trigger_times = 0

for epoch in range(epochs):

    model.train()
    for i, (inputs, labels) in enumerate(trainloader):

        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            # Separate name so the loss tensor is not shadowed by a float.
            batch_loss, current = loss.item(), i * len(inputs)
            print(f"loss: {batch_loss:>7f}  [{current:>5d}/{len(trainloader.dataset):>5d}]")

    # early stopping
    the_current_loss = validation(model, validloader, criterion)
    print('The current loss:', the_current_loss)

    if the_current_loss > the_last_loss:
        trigger_times += 1
        print('Trigger times: ', trigger_times)
    else:
        print('Trigger times: 0')
        trigger_times = 0

    stop_early = trigger_times >= patience
    if stop_early:
        print('Early stopping!\nStart the test process.')

    the_last_loss = the_current_loss
    total, correct = 0, 0

    # Test accuracy is reported after every epoch, including the one that
    # triggers early stopping.
    model.eval()
    with torch.no_grad():
        for inputs, labels in testloader:
            output = model(inputs)
            _, predicted = torch.max(output, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        print(f"Accuracy: {correct/total}")

    # Actually leave the epoch loop; previously the message was printed but
    # training carried on for all remaining epochs.
    if stop_early:
        break
        