In [1]:
import random
import numpy as np
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

from model import CharacterCNN
from model import weights_init
from model import DatasetReader

# Set random seem for reproducibility
manualSeed = 7
#manualSeed = random.randint(1, 10000) # use if you want new results
print("Random Seed: ", manualSeed)
random.seed(manualSeed)
th.manual_seed(manualSeed)

Random Seed:  7


<torch._C.Generator at 0x2abadd2974b0>

In [2]:
#Character vocab
char_inventory = 'abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:\'"/\\|_@#$%ˆ&*˜`+-=<>()[]{}\n'

vocab = {c:i for i,c in enumerate(char_inventory)}

# One hot embedding size
nchars = len(char_inventory)

#Input length
input_length = 1014

# Decide which device we want to run on
ngpu = 1
device = th.device("cuda:0" if (th.cuda.is_available() and ngpu > 0) else "cpu")

#Batch size
batch_size = 128

#Number of threads for the data loader
workers = 2

#Number of epochs
nepochs = 20

In [3]:
cnn = CharacterCNN(nchars).to(device)
# Apply the weights_init function to randomly initialize all weights
cnn.apply(weights_init)
cnn.train()
# Print the model
print(cnn)

CharacterCNN(
  (net): Sequential(
    (0): Conv1d(70, 256, kernel_size=(7,), stride=(1,))
    (1): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (2): ReLU()
    (3): Conv1d(256, 256, kernel_size=(7,), stride=(1,))
    (4): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (5): ReLU()
    (6): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (7): ReLU()
    (8): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (9): ReLU()
    (10): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (11): ReLU()
    (12): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (13): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (14): ReLU()
    (15): Flatten()
    (16): Linear(in_features=8704, out_features=1024, bias=True)
    (17): ReLU()
    (18): Dropout(p=0.5)
    (19): Linear(in_features=1024, out_features=1024, bias=True)
    (20): ReLU()
    (21): Dropout(p=0.5)
    (22): Linear(in_features=1024, ou

In [4]:
train_data = DatasetReader('./data/yelp_review_full_csv/train.csv', vocab, input_length, nchars)
dataloader = th.utils.data.DataLoader(train_data, batch_size=batch_size,
                                         shuffle=True, num_workers=workers)



In [5]:
# The objective
criterion = nn.CrossEntropyLoss()

#Optimizer
optimizer = optim.Adam(cnn.parameters())

In [None]:
losses = []
lr = 0.01

#Main training loop
for epoch in range(0, nepochs):
        
    for i, batch in enumerate(dataloader):
        inputs = batch[0].type(th.FloatTensor).to(device)
        targets = batch[1].type(th.LongTensor).to(device)
        
        optimizer.zero_grad()
        preds = cnn(inputs)
        loss = criterion(preds, targets)
        loss.backward()
        optimizer.step()
        
        # Output training stats
        if i % 5 == 0:
            print('[%d/%d][%d/%d]\tLoss: %.4f'
                  % (epoch, nepochs, i, len(dataloader), loss.item()))
            losses.append(loss)
        if i % 100 == 0:
            th.save({
                'model': cnn.state_dict(),
                'epoch': epoch,
            }, './output/cnn.pth')
th.save({
    'model': cnn.state_dict(),
    'epoch': epoch,
}, './output/cnn.pth')

# Eval

In [4]:
test_data = DatasetReader('./data/yelp_review_full_csv/test.csv', vocab, input_length, nchars)
dataloader = th.utils.data.DataLoader(test_data, batch_size=batch_size, num_workers=workers)


In [5]:
cnn.load_state_dict(th.load('/home/dbw003/qsub_jobs/cnn-text-classification/cnn.pth')['model'])
cnn.eval()

CharacterCNN(
  (net): Sequential(
    (0): Conv1d(70, 256, kernel_size=(7,), stride=(1,))
    (1): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (2): ReLU()
    (3): Conv1d(256, 256, kernel_size=(7,), stride=(1,))
    (4): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (5): ReLU()
    (6): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (7): ReLU()
    (8): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (9): ReLU()
    (10): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (11): ReLU()
    (12): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (13): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (14): ReLU()
    (15): Flatten()
    (16): Linear(in_features=8704, out_features=1024, bias=True)
    (17): ReLU()
    (18): Dropout(p=0.5)
    (19): Linear(in_features=1024, out_features=1024, bias=True)
    (20): ReLU()
    (21): Dropout(p=0.5)
    (22): Linear(in_features=1024, ou

In [6]:
preds = []
gt = []
for batch in tqdm(dataloader):
    inputs = batch[0].type(th.FloatTensor).to(device)
    targets = batch[1].type(th.LongTensor).to(device)
        
    out = cnn(inputs)
    preds.extend(list(np.argmax(out.cpu().data.numpy(), axis=1)))
    gt.extend(list(targets.squeeze().cpu().data.numpy()))

100%|██████████| 391/391 [02:45<00:00,  2.36it/s]


In [7]:
err_rate = sum([preds[i] != gt[i] for i in range(len(gt))] ) / len(gt)
print("Error rate: %.4f"%err_rate)

Error rate: 0.3967
