In [4]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import Tensor, IntTensor
import numpy as np
import torch
import matplotlib.pyplot as plt
import os

In [5]:
alphabet = [' ','a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v' ,'w', 'x', 'y', 'z', 
 '0', '1', '2','3', '4', '5', '6', '7','8','9','-', ';', '.', '!', '?', ':', '’', '\\', '|', '_', '@', '#', '$', '%', 'ˆ', '&', '*', 
 '˜', '‘', '+', '-', '=', '<', '>','(', ')', '[',']', '{', '}']

In [6]:
indexing = { letter : i+1 for i, letter in enumerate(alphabet)}
indexing['UNK'] = len(alphabet)
indexing['No_letter'] = 0

In [7]:
def dataGenerator(train_split=0.8,binary=False, max_length=1014): 
    
    dataset = []

    cwd = os.getcwd()
    for filename in ['pos', 'neg']:
        file_dir = join(cwd, 'aclImdb', 'train', filename)
        files = [f for f in listdir(file_dir) if isfile(join(file_dir, f))]
        for f in files: 
            name = f.split('.')[0]
            name = name.split('_')
            id = name[0]
            rating = name[1]
            if(binary): 
                if(filename=='pos'): 
                    rating=1
                else: 
                    rating=0
            
            path = join(file_dir, f)
            review = torch.zeros(max_length).long()
            with open(path) as myfile:
                data=myfile.read()
                for i in range(min(max_length,len(data))):
                    letter = data[i].lower()
                    if letter in alphabet:
                        review[i] = indexing[letter]
                    else:
                        review[i] = indexing['UNK']

                dataset.append({'review': review, 'rating': torch.IntTensor([int(rating) -  1])})
    
    #random split 0.8 / 0.2
    dataset_train, dataset_val =  train_test_split(dataset, test_size=1-train_split)
    
    return dataset_train, dataset_val

def data_iter(dataset, batch_size=32):   
    dataset_size = len(dataset)
    start = -1 * batch_size
    order = list(range(dataset_size))
    random.shuffle(order)

    while True:
        start += batch_size
        if start > dataset_size - batch_size:
            # Start another epoch.
            start = 0
            random.shuffle(order)   
        batch_indices = order[start:start + batch_size]
        yield [dataset[index] for index in batch_indices]

# This is the iterator we use when we're evaluating our model. 
# It gives a list of batches that you can then iterate through.
def eval_iter(source, batch_size):
    batches = []
    dataset_size = len(source)
    start = -1 * batch_size
    order = list(range(dataset_size))
    random.shuffle(order)

    while start < dataset_size - batch_size:
        start += batch_size
        batch_indices = order[start:start + batch_size]
        batch = [source[index] for index in batch_indices]
        if len(batch) == batch_size:
            batches.append(batch)
        else:
            continue
        
    return batches

# The following function gives batches of vectors and labels, 
# these are the inputs to your model and loss function
def get_batch(batch):
    vectors = []
    labels = []
    for dict in batch:
        vectors.append(dict["review"])
        labels.append(dict["rating"])
    return vectors, labels


In [8]:
class Char_CNN_Small(nn.Module):
    
    def __init__(self,
                 fully_layers,
                 l0,
                 alphabet_size,
                 nb_classes,
                 batch_size,
                 ):
        super(Char_CNN_Small,self).__init__()
        
        self.conv_layers = [
                    [256, 7, 3],
                    [256, 7, 3],
                    [256, 3, None],
                    [256, 3, None],
                    [256, 3, None],
                    [256, 3, 3]
                    ]
        self.fully_layers = fully_layers
        self.batch_size = batch_size
        self.nb_classes = nb_classes
        self.alphabet_size = alphabet_size
        self.l0 = l0
        
        self.convs = []
        self.linear = []
        self.max_pool = nn.MaxPool1d(3)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout()
        
        in_feat = alphabet_size
        for out_feat, kernel_size, max_pool in conv_layers:
            conv = nn.Conv1d(in_feat, out_feat, kernel_size)
            print(conv.weight.size())
            self.convs.append(conv)
            in_feat = out_feat
        
        l6 = int((l0 - 96)/27)
        in_feat = l6*out_feat
        
        for out_feat in fully_layers:
            self.linear.append(nn.Linear(in_feat, out_feat))
            in_feat = out_feat
        
        self.classifier = nn.Linear(in_feat, nb_classes)
        
        if self.nb_classes == 2:
            self.class_non_lin = nn.Sigmoid()
        else:
            self.class_non_lin = nn.Softmax()
        

    def forward(self, x):
        out = x
        for conv in self.convs[:2]:
            out = conv(out)
            out = self.relu(out)
            out = self.max_pool(out)
        
        for conv in self.convs[2:5]:
            out = conv(out)
            out = self.relu(out)
            
            
        out = self.convs[5](out)
        out = self.relu(out)
        out = self.max_pool(out)
        
        out = out.view(batch_size, -1)
        
        
        for lin in self.linear:
            out = lin(out)
            out = self.relu(out)
            out = self.dropout(out)
        
#        print(out.data.numpy().shape)
#        print(out.data.numpy()[:, :10])
        out = self.classifier(out)
        out = self.class_non_lin(out)

        return out
    
    def init_weights(self):
        for conv in self.convs:
            nn.init.normal(conv.weight, mean=0, std=0.02)

In [9]:
def evaluate(model, data_iter):
    model.eval()
    correct = 0
    total = 0
    for i in range(len(data_iter)):
        vectors, labels = get_batch(data_iter[i])
        
        vectors = torch.stack(vectors)
        vectors_ = torch.unsqueeze(vectors, 1)
        one_hot = torch.FloatTensor(batch_size, alphabet_size, l0).zero_()
        one_hot.scatter_(1, vectors_, 1)
        vectors = Variable(one_hot)

        labels = Variable(torch.stack(labels).squeeze())
        
        output = model(vectors)
#        print(vectors.data.numpy().sum(axis=2))
#        print(output)
        _, predicted = torch.max(output, 1)
        
        total += len(labels)
        correct += np.equal(predicted.data.numpy(), labels.data.numpy()).sum()
      
    return correct / float(total)

In [33]:
def training_loop(batch_size, num_epochs, model, loss_, optim, training_iter, validation_iter, train_eval_iter):
    step = 0
    epoch = 0
    #total_batches = int(len(training_set) / batch_size)
    total_batches = int(200 / batch_size)
    
    print("total_bat", total_batches)
    while epoch <= num_epochs:
        model.train()
        vectors, labels = get_batch(next(training_iter)) 

        vectors = torch.stack(vectors)
        vectors_ = torch.unsqueeze(vectors, 1)
        one_hot = torch.FloatTensor(batch_size, alphabet_size, l0).zero_()
        one_hot.scatter_(1, vectors_, 1)
        
        vectors = Variable(one_hot) # batch_size, seq_len
        
        labels = Variable(torch.stack(labels).squeeze())
        
        model.zero_grad()
        
        output = model(vectors)
        lossy = loss_(output, labels.long())
        lossy.backward()
#        torch.nn.utils.clip_grad_norm(model.parameters(), 5.0)
        optim.step()
        
        if step % total_batches == 0:
            if epoch % 5 == 0:
                print("begin print")
                print("Epoch %i; Step %i; Loss %f; Train acc: %f; Dev acc %f" 
                      %(epoch, step, lossy.data[0],\
                        evaluate(model, train_eval_iter),\
                        0))#evaluate(model, validation_iter)))
            epoch += 1
        step += 1

In [11]:
## generate dataset

training_set, validation_set = dataGenerator()


In [34]:
# Hyper Parameters 
conv_layers = [
                    [256, 7, 3],
                    [256, 7, 3],
                    [256, 3, None],
                    [256, 3, None],
                    [256, 3, None],
                    [256, 3, 3]
                    ]

fully_layers = [1024, 1024]
l0 = 1014
alphabet_size = 68
nb_classes = 10
batch_size = 8

learning_rate = 0.008
num_epochs = 20


# Build, initialize model
model = Char_CNN_Small(fully_layers, l0, alphabet_size, nb_classes, batch_size)
model.init_weights()

# Loss and Optimizer

loss = nn.CrossEntropyLoss()  
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Train the model
training_iter = data_iter(training_set[:8], batch_size)
train_eval_iter = eval_iter(training_set[:8], batch_size)
validation_iter = eval_iter(validation_set[:8], batch_size)

training_loop(batch_size, num_epochs, model, loss, optimizer, training_iter, validation_iter, train_eval_iter)

torch.Size([256, 68, 7])
torch.Size([256, 256, 7])
torch.Size([256, 256, 3])
torch.Size([256, 256, 3])
torch.Size([256, 256, 3])
torch.Size([256, 256, 3])
total_bat 25
8112
begin print
Epoch 0; Step 0; Loss 2.302569; Train acc: 0.125000; Dev acc 0.000000
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112
8112


KeyboardInterrupt: 

In [74]:
(training_set[0]['review'].numpy() == 0).sum()

0