In [2]:
import numpy as np
import scipy.stats
import scipy.integrate
from matplotlib import pyplot as plt
import torch
from torch.autograd import Variable
from utils import load_imdb
from utils import bag_of_words
from utilsCPU import load_imdbCPU
from utilsCPU import bag_of_wordsCPU


# import jtplot
from jupyterthemes import jtplot

# you can select an alternative theme's plot style by name
# oceans16 | grade3 | chesterish | onedork | monokai | solarizedl
jtplot.style('onedork')

# set "context" (paper, notebook, talk, or poster)
# & font scale (scalar applied to labels, legend, etc.)
jtplot.style('grade3', context='paper', fscale=1.4)

# turn on X- and Y-axis tick marks (default=False)
# and turn off the axis grid lines (default=True)
jtplot.style(ticks=True, grid=False)

# set the default figure size
# x (length), y (height)
jtplot.figsize(x=6., y=5.)

# or just adjust the aspect ratio
# new_length = length * aspect
jtplot.figsize(aspect=1.2)

### Problem 1b

In [None]:
def chi_density(x, D):
    return 2*x*scipy.stats.chi2.pdf(x**2, D)

x = np.linspace(0.1, 14, 1000)
D_list = [1, 5, 10, 20, 40, 60, 80, 100]
for i in D_list:
    plt.plot(x, chi_density(x, i))
    plt.xlim([0, 14])
    plt.ylim([0, 0.8])
    plt.title('Chi Distributions with Different Degrees of Freedom')
    plt.xlabel('x')
    plt.ylabel('Density')
    plt.legend('top right', labels = ['D = ' + str(i) for i in D_list])
plt.show()

### Problem 1c

In [None]:
def chi_CDF(D, lower_end_point, upper_end_point):
    return scipy.integrate.quad(chi_density, lower_end_point, upper_end_point, args = (D))

x = np.linspace(0.1, 14, 1000)
D = 100
cdf_vals = np.zeros(len(x))
for i in range(len(x)):
    cdf_vals[i] = chi_CDF(D, 0.1, x[i])[0]

plt.plot(x, cdf_vals)
plt.xlim([0, 14])
plt.ylim([0, 1.01])
plt.title('Chi CDF when D = 100')
plt.xlabel('x')
plt.ylabel('P(X < x)')
plt.show()

# Problems 4, 5, 6

### Problem 4

In [38]:
batch_size = 2000
train_iter, val_iter, test_iter, text_field = load_imdbCPU(imdb_path='imdb.zip', imdb_dir='imdb', batch_size=batch_size, gpu=False, reuse=True,\
                                                        repeat=False, shuffle=True)
V = len(text_field.vocab)
num_labels = 2

In [127]:
def NB_train(train_iter, class_labels_dir_prior_params, vocab_length, num_labels, beta0_prior_params=None, beta1_prior_params=None, dir0_unif_prior_param=None, dir1_unif_prior_param=None, maxD=None,\
             betaPrior=True):
    class_labels_dir_posterior_params = torch.addcmul(torch.zeros(num_labels).double(), value=1, tensor1=torch.ones(num_labels).double(), \
                                                      tensor2=class_labels_dir_prior_params)
    if betaPrior == True:
        beta00_posterior_vec = torch.ones(vocab_length).double()*beta0_prior_params[0]
        beta10_posterior_vec = torch.ones(vocab_length).double()*beta0_prior_params[1]
        beta01_posterior_vec = torch.ones(vocab_length).double()*beta1_prior_params[0]
        beta11_posterior_vec = torch.ones(vocab_length).double()*beta1_prior_params[1]
        for batch in train_iter:
            x = bag_of_wordsCPU(batch, text_field).double()
            y = torch.from_numpy(torch.unsqueeze((batch.label - 1).double(), 1).data.numpy()).double()

            x = x.div(x)
            x[x != x] = 0
            
            class_labels_dir_posterior_params.add_(torch.DoubleTensor([sum(1 - y).numpy(), sum(y).numpy()]))
            #beta00_posterior_vec.add_(torch.matmul(torch.transpose(1 - x, 0, 1), 1 - y))
            #beta10_posterior_vec.add_(torch.matmul(torch.transpose(x, 0, 1), 1 - y))
            #beta01_posterior_vec.add_(torch.matmul(torch.transpose(1 - x, 0, 1), y))
            #beta11_posterior_vec.add_(torch.matmul(torch.transpose(x, 0, 1), y))
            for i in range(len(y)):
                if y[i].numpy() == 0:
                    beta00_posterior_vec.add_(1 - x[i])
                    beta10_posterior_vec.add_(x[i])

                if y[i].numpy() == 1:
                    beta01_posterior_vec.add_(1 - x[i])
                    beta11_posterior_vec.add_(x[i])

        class_label_dir_posterior_expectation = class_labels_dir_posterior_params/sum(class_labels_dir_posterior_params)
        beta0_posterior_expectation = beta10_posterior_vec/(beta00_posterior_vec + beta10_posterior_vec)
        beta1_posterior_expectation = beta11_posterior_vec/(beta01_posterior_vec + beta11_posterior_vec)

        return class_label_dir_posterior_expectation, beta0_posterior_expectation, beta1_posterior_expectation
    
    
    if betaPrior == False:
        
        count = 0
        dir0_posterior_matrix = np.zeros((maxD + 1, vocab_length)) + dir0_unif_prior_param
        dir1_posterior_matrix = np.zeros((maxD + 1, vocab_length)) + dir1_unif_prior_param
        
        for batch in train_iter:
            
            print(count, end = ' ')
            count += 1
                
            x = bag_of_wordsCPU(batch, text_field).numpy()
            y = torch.from_numpy(torch.unsqueeze((batch.label - 1).double(), 1).data.numpy()).double()
        
            class_labels_dir_posterior_params.add_(torch.DoubleTensor([sum(1 - y).numpy(), sum(y).numpy()]))
            
            y = (batch.label - 1).data.numpy()
            x0 = x[np.where(y == 0), :][0]
            x1 = x[np.where(y == 1), :][0]
            
            for i in range(x.shape[1]):

                vals0 =  np.unique(x0[:, i], return_counts = True)[0]
                counts0 = np.unique(x0[:, i], return_counts = True)[1]
                vals1 =  np.unique(x1[:, i], return_counts = True)[0]
                counts1 = np.unique(x1[:, i], return_counts = True)[1]

                vals0_maxD = vals0[np.where(vals0 < maxD)].tolist()
                counts0_maxD = counts0[np.where(vals0 < maxD)].tolist()
                vals1_maxD = vals1[np.where(vals1 < maxD)].tolist()
                counts1_maxD = counts1[np.where(vals1 < maxD)].tolist()

                vals0_maxD.append(maxD)
                counts0_maxD.append(np.sum(counts0[np.where(vals0 >= maxD)]))
                vals1_maxD.append(maxD)
                counts1_maxD.append(np.sum(counts1[np.where(vals1 >= maxD)]))
                
                dir0_posterior_matrix[:, i][np.array(vals0_maxD).astype(int)] += np.array(counts0_maxD)
                dir1_posterior_matrix[:, i][np.array(vals1_maxD).astype(int)] += np.array(counts1_maxD)
        
        class_label_dir_posterior_expectation = class_labels_dir_posterior_params/sum(class_labels_dir_posterior_params)
        dir0_posterior_expectation = torch.from_numpy(np.divide(dir0_posterior_matrix, np.sum(dir0_posterior_matrix, axis = 0))).double()
        dir1_posterior_expectation = torch.from_numpy(np.divide(dir1_posterior_matrix, np.sum(dir1_posterior_matrix, axis = 0))).double()
        
        return class_label_dir_posterior_expectation, dir0_posterior_expectation, dir1_posterior_expectation
                                     
        
        #for i in range(len(y)):
        #    if y[i].numpy() == 0:
        #        for j in range(len(x[i])):
        #            if x[i, j].numpy() < maxD:
        #                dir0_posterior_matrix[x[i, j], j].add_(1)
        #            else:
        #                dir0_posterior_matrix[maxD, j].add_(1)
                        
        #    if y[i].numpy() == 1:
        #        for j in range(len(x[i])):
        #            if x[i, j].numpy() < maxD:
        #                dir0_posterior_matrix[x[i, j], j].add_(1)
        #            else:
        #                dir0_posterior_matrix[maxD, j].add_(1)
        
        
        
        
        #dir0_posterior_vec = torch.addcmul(torch.zeros(vocab_length).double(), value=1, tensor1=torch.ones(vocab_length).double(), \
        #                                   tensor2=dir0_prior_params)
        #dir1_posterior_vec = torch.addcmul(torch.zeros(vocab_length).double(), value=1, tensor1=torch.ones(vocab_length).double(), \
        #                                   tensor2=dir1_prior_params)
        
        #for batch in train_iter:
        #    x = bag_of_words(batch, text_field).double()
        #    y = torch.from_numpy(torch.unsqueeze((batch.label - 1).double(), 1).data.numpy()).double()
            
        #    class_labels_dir_posterior_params.add_(torch.DoubleTensor([sum(1 - y).numpy(), sum(y).numpy()]))
            #dir0_posterior_vec.add_(torch.matmul(torch.transpose(x, 0, 1), 1 - y))
            #dir1_posterior_vec.add_(torch.matmul(torch.transpose(x, 0, 1), y))
        #    for i in range(len(y)):
        #        if y[i].numpy() == 0:
        #            dir0_posterior_vec.add_(x[i])

        #        if y[i].numpy() == 1:
        #            dir1_posterior_vec.add_(x[i])
        
        #class_label_dir_posterior_expectation = class_labels_dir_posterior_params/sum(class_labels_dir_posterior_params)
        #dir0_posterior_expectation = dir0_posterior_vec/sum(dir0_posterior_vec)
        #dir1_posterior_expectation = dir1_posterior_vec/sum(dir1_posterior_vec)

In [128]:
def compute_accuracy(dataset, vocab_length, class_label_dir_posterior_expectation, beta0_posterior_expectation=None, beta1_posterior_expectation=None, dir0_posterior_expectation=None, \
                     dir1_posterior_expectation=None, maxD=None, betaPrior = True):
    test_num_correct = torch.zeros(1).double()
    size_test_data = torch.zeros(1).double()
    if betaPrior == True:
        for batch in dataset:
            x = bag_of_wordsCPU(batch, text_field).double()
            y = torch.from_numpy((batch.label - 1).double().data.numpy()).double() # batch.label is 1/2, while we want 0/1

            xi1_ind_class0 = torch.matmul(x, torch.log(beta0_posterior_expectation))
            xi0_ind_class0 = torch.matmul(1.0 - x, torch.log(1.0 - beta0_posterior_expectation))
            log_likelihood_class0 = torch.unsqueeze(torch.log(class_label_dir_posterior_expectation)[0] + xi1_ind_class0 + xi1_ind_class0, 1)

            xi1_ind_class1 = torch.matmul(x, torch.log(beta1_posterior_expectation))
            xi0_ind_class1 = torch.matmul(1.0 - x, torch.log(1.0 - beta1_posterior_expectation))
            log_likelhood_class1 = torch.unsqueeze(torch.log(class_label_dir_posterior_expectation)[1] + xi1_ind_class1 + xi1_ind_class1, 1)

            test_num_correct.add_(sum(torch.from_numpy(np.argmax(torch.cat([log_likelihood_class0, \
                                                                            log_likelhood_class1], 1).numpy(), axis = 1)).double() == y))
            size_test_data.add_(len(y))

        return test_num_correct/size_test_data
    
    if betaPrior == False:
        
        log_dir0_posterior_expectation = np.log(dir0_posterior_expectation.numpy())
        log_dir1_posterior_expectation = np.log(dir1_posterior_expectation.numpy())
        
        for batch in dataset:
            x = bag_of_wordsCPU(batch, text_field).numpy()
            y = torch.from_numpy((batch.label - 1).double().data.numpy()).double() # batch.label is 1/2, while we want 0/1
            
            
            for i in range(x.shape[0]):
                log_likelhood_class0 = np.log(class_label_dir_posterior_expectation.numpy()[0])
                log_likelhood_class1 = np.log(class_label_dir_posterior_expectation.numpy()[1])
                
                np.place(x[i], x[i] > 10, 10)
                
                log_likelhood_class0 = np.sum(log_dir0_posterior_expectation[x[i].astype(int), np.arange(vocab_length)]) + np.log(class_label_dir_posterior_expectation.numpy()[0])
                log_likelhood_class1 = np.sum(log_dir1_posterior_expectation[x[i].astype(int), np.arange(vocab_length)]) + np.log(class_label_dir_posterior_expectation.numpy()[1])
                
                y_pred = float(np.array(np.argmax(np.array([log_likelhood_class0, log_likelhood_class1]))))
                
                test_num_correct.add_(y_pred == y[i])
                size_test_data.add_(1.0)
                                   
            return test_num_correct/size_test_data
                
                                             
            #log_likelihood_class0 = torch.unsqueeze(torch.log(class_label_dir_posterior_expectation)[0] + torch.matmul(x, torch.log(dir0_posterior_expectation)), 1)
            #log_likelhood_class1 = torch.unsqueeze(torch.log(class_label_dir_posterior_expectation)[1] + torch.matmul(x, torch.log(dir1_posterior_expectation)), 1)

            #test_num_correct.add_(sum(torch.from_numpy(np.argmax(torch.cat([log_likelihood_class0, \
            #                                                                log_likelhood_class1], 1).numpy(), axis = 1)).double() == y))
            #size_test_data.add_(len(y))

        #return test_num_correct/size_test_data

In [129]:
prior_param_magnitudes = [0.01, 0.05, 0.1, 0.4, 0.7, 1.0]
maxD = 10
val_accuracies_beta = torch.zeros(len(prior_param_magnitudes)).double()
val_accuracies_dir = torch.zeros(len(prior_param_magnitudes)).double()
max_val_accuracy_beta = torch.zeros(1).double()
max_val_accuracy_dir = torch.zeros(1).double

for i in range(len(prior_param_magnitudes)):
    
    beta0_prior_params = torch.DoubleTensor([prior_param_magnitudes[i], prior_param_magnitudes[i]])
    beta1_prior_params = torch.DoubleTensor([prior_param_magnitudes[i], prior_param_magnitudes[i]])
    class_labels_dir_prior_params = torch.DoubleTensor([1, 1])
    class_label_dir_posterior_expectation, beta0_posterior_expectation, beta1_posterior_expectation = NB_train(train_iter, class_labels_dir_prior_params, V, num_labels, \
                                                                                                              beta0_prior_params, beta1_prior_params)

    dir0_prior_params = torch.ones(V).double()*prior_param_magnitudes[i]
    dir1_prior_params = torch.ones(V).double()*prior_param_magnitudes[i]
    class_labels_dir_prior_params = torch.DoubleTensor([1, 1])
    class_label_dir_posterior_expectation, dir0_posterior_expectation, dir1_posterior_expectation = NB_train(train_iter, class_labels_dir_prior_params, V, num_labels, \
                                                                                                    dir0_unif_prior_param=prior_param_magnitudes[i], dir1_unif_prior_param=prior_param_magnitudes[i], \
                                                                                                    maxD=maxD, betaPrior=False)
    
    val_accuracies_beta[i] = compute_accuracy(val_iter, V, class_label_dir_posterior_expectation, beta0_posterior_expectation, beta1_posterior_expectation)[0]
    
    val_accuracies_dir[i] = compute_accuracy(val_iter, V, class_label_dir_posterior_expectation, dir0_posterior_expectation=dir0_posterior_expectation, \
                                             dir1_posterior_expectation=dir1_posterior_expectation, maxD=maxD, betaPrior=False)[0]
    
    if i == 0:
        max_val_accuracy_beta = val_accuracies_beta[0]
        max_val_accuracy_dir = val_accuracies_dir[0]
        best_beta_prior_param_magnitudes = prior_param_magnitudes[0]
        best_dir_prior_param_magnitudes = prior_param_magnitudes[0]
    
    if i > 0:
        if max_val_accuracy_beta < val_accuracies_beta[i]:
            max_val_accuracy_beta = val_accuracies_beta[i]
            best_beta_prior_param_magnitudes = prior_param_magnitudes[i]
            test_accuracy_best_beta_prior = compute_accuracy(test_iter, V, class_label_dir_posterior_expectation, beta0_posterior_expectation, beta1_posterior_expectation)[0]
        
        if max_val_accuracy_dir < val_accuracies_dir[i]:
            max_val_accuracy_dir = val_accuracies_dir[i]
            best_dir_prior_param_magnitudes = prior_param_magnitudes[i]
            test_accuracy_best_dir_prior = compute_accuracy(test_iter, V, class_label_dir_posterior_expectation, dir0_posterior_expectation=dir0_posterior_expectation, \
                                                            dir1_posterior_expectation=dir1_posterior_expectation, maxD=maxD, betaPrior=False)[0]
    print()        
    print('Val accuracy w/ beta class-conditional prior using uniform hyperparams of magnitude ' + str(prior_param_magnitudes[i]) + ': ' + str(val_accuracies_beta[i]))
    print('Val accuracy w/ Dirichlet class-conditional prior using uniform hyperparams of magnitude ' + str(prior_param_magnitudes[i]) + ': ' + str(val_accuracies_dir[i]))
    print()

print('Test accuracy w/ beta class-conditional prior using best uniform hyperparams (magnitude=' + \
      str(best_beta_prior_param_magnitudes) + '): ' + str(test_accuracy_best_beta_prior))
print('Test accuracy w/ Dirichlet class-conditional prior using best uniform hyperparams (magnitude=' + \
      str(best_dir_prior_param_magnitudes) + '): ' + str(test_accuracy_best_dir_prior))

  from ipykernel import kernelapp as app


0 



1 2 3 4 5 6 7 8 9 10 11 
Val accuracy w/ beta class-conditional prior using uniform hyperparams of magnitude 0.01: 0.855
Val accuracy w/ Dirichlet class-conditional prior using uniform hyperparams of magnitude 0.01: 0.843

0 1 2 3 4 5 6 7 8 9 10 11 
Val accuracy w/ beta class-conditional prior using uniform hyperparams of magnitude 0.05: 0.86
Val accuracy w/ Dirichlet class-conditional prior using uniform hyperparams of magnitude 0.05: 0.854

0 1 2 3 4 5 6 7 8 9 10 11 
Val accuracy w/ beta class-conditional prior using uniform hyperparams of magnitude 0.1: 0.863
Val accuracy w/ Dirichlet class-conditional prior using uniform hyperparams of magnitude 0.1: 0.852

0 1 2 3 4 5 6 7 8 9 10 11 
Val accuracy w/ beta class-conditional prior using uniform hyperparams of magnitude 0.4: 0.865
Val accuracy w/ Dirichlet class-conditional prior using uniform hyperparams of magnitude 0.4: 0.873

0 1 2 3 4 5 6 7 8 9 10 11 
Val accuracy w/ beta class-conditional prior using uniform hyperparams of magnit

### Problem 5b

In [5]:
batch_size = 100
train_iter, val_iter, test_iter, text_field = load_imdb(imdb_path='imdb.zip', imdb_dir='imdb', batch_size=batch_size, gpu=True, reuse=True,\
                                                        repeat=False, shuffle=True)
V = len(text_field.vocab) # vocab size
num_labels = 2
vocab_list = text_field.vocab.itos

In [6]:
def build_model(input_dim, output_dim):
    model = torch.nn.Sequential()
    # computes w_c^T x + b_c 
    model.add_module("linear", torch.nn.Linear(input_dim, output_dim).cuda())
    # Compute our log softmax term.
    model.add_module("softmax", torch.nn.LogSoftmax().cuda())
    return model


def l1_logistic_loss(model, lambda_, fx, y):
    log_loss = torch.nn.NLLLoss(size_average = True)
    log_loss = log_loss.forward(fx, y)
    
    lasso_part = torch.nn.L1Loss(size_average = False)
    params = next(model.parameters())
    target = Variable(torch.zeros(params.size()[0], params.size()[1]).cuda(), requires_grad=False)
    lasso_part = lasso_part.forward(params, target)
    
    return log_loss + lasso_part * lambda_
    
    
def train(model, lambda_, x, y, optimizer):
    # Resets the gradients to 0
    optimizer.zero_grad()
    # Computes the function above. (log softmax w_c^T x + b_c)
    fx = model.forward(x)
    # Computes a loss. Gives a scalar. 
    loss = l1_logistic_loss(model, lambda_, fx, y)
    # Magically computes the gradients. 
    loss.backward()
    # updates the weights
    optimizer.step()
    return loss.data[0]

In [7]:
lam_vals = [0, 0.001, 0.01, 0.1, 1]
num_epochs = 15

for lam in lam_vals:
    model = build_model(V, num_labels)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    size_val_data = 0.0
    val_num_correct = 0.0

    for epoch in range(num_epochs):
        size_training_data = 0.0
        train_num_correct = 0.0
        loss = 0.0

        for batch in train_iter:
            x = Variable(bag_of_words(batch, text_field))
            y = batch.label - 1 # batch.label is 1/2, while we want 0/1

            batch_loss = train(model, lam, x, y, optimizer)
            loss += batch_loss

            batch_num_correct = np.sum(np.argmax(torch.exp(model.forward(x)).data.cpu().numpy(), axis = 1) == y.data.cpu().numpy())
            train_num_correct += batch_num_correct
            size_training_data += len(y)
        
        print('Epoch ' + str(epoch + 1))
        print('Lambda = ' + str(lam))
        print('Epoch train accuracy: ' + str(train_num_correct/size_training_data))
        print('Epoch train loss: ' + str(loss))
        print()
        
    
    for batch in val_iter:
        x = Variable(bag_of_words(batch, text_field))
        y = batch.label - 1 # batch.label is 1/2, while we want 0/1
        
        batch_num_correct = np.sum(np.argmax(torch.exp(model.forward(x)).data.cpu().numpy(), axis = 1) == y.data.cpu().numpy())
        val_num_correct += batch_num_correct
        size_val_data += len(y)
        
    print('Lambda = ' + str(lam))
    print('Val accuracy: ' + str(val_num_correct/size_val_data))
    print()

    
    test_num_correct = 0.0
    size_test_data = 0.0
    for batch in test_iter:
        x = Variable(bag_of_words(batch, text_field))
        y = batch.label - 1 # batch.label is 1/2, while we want 0/1

        batch_num_correct = np.sum(np.argmax(torch.exp(model.forward(x)).data.cpu().numpy(), axis = 1) == y.data.cpu().numpy())
        test_num_correct += batch_num_correct
        size_test_data += len(y)

    print('Lambda: ' + str(lam))
    print('Test accuracy: ' + str(test_num_correct/size_test_data))
    print()

    model_params = next(model.parameters())
    print('Words with highest valued coefficients for predicting class 0: ' + str(np.flip(np.array(vocab_list)[np.argsort(model_params[0].data.cpu().numpy())[-5:]], axis = 0)))
    print('Words with lowest valued coefficients for predicting class 0: ' + str(np.array(vocab_list)[np.argsort(model_params[0].data.cpu().numpy())[0:5]]))
    print('Words with highest valued coefficients for predicting class 1: ' + str(np.flip(np.array(vocab_list)[[np.argsort(model_params[1].data.cpu().numpy())[-5:]]], axis = 0)))
    print('Words with lowest valued coefficients for predicting class 1: ' + str(np.array(vocab_list)[[np.argsort(model_params[1].data.cpu().numpy())[0:5]]]))
    print()
    
    for j in range(num_labels):
        abs_params = np.absolute(model_params.data.cpu().numpy()[j])
        print('Number of 0 valued parameters for class ' + str(j) + ' (lambda=' + str(lam) + '): ' + str(len(abs_params[abs_params < 10.0**(-4)])/V))
    print()

Epoch 1
Lambda = 0
Epoch train accuracy: 0.664833333333
Epoch train loss: 208.94907519221306



KeyboardInterrupt: 

### Problem 6

In [None]:
def build_model_neural(input_dim1, output_dim1, output_dim2):
    model = torch.nn.Sequential()
    # computes w_c^T x + b_c 
    model.add_module("linear1", torch.nn.Linear(input_dim1, output_dim1).cuda())
    model.add_module('tanh', torch.nn.Tanh().cuda())
    model.add_module("linear2", torch.nn.Linear(output_dim1, output_dim2).cuda())
    # Compute our log softmax term.
    model.add_module("softmax", torch.nn.LogSoftmax().cuda())
    return model

In [None]:
model = build_model_neural(V, int(V/1000.0), num_labels)
loss_func = torch.nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
num_epochs = 15

for epoch in range(num_epochs):
    size_training_data = 0.0
    num_correct = 0.0
    loss = 0.0
    
    for batch in train_iter:
        x = Variable(bag_of_words(batch, text_field))
        y = batch.label - 1 # batch.label is 1/2, while we want 0/1
        
        optimizer.zero_grad()
        fx = model.forward(x)
        L = loss_func(fx, y)
        L.backward()
        optimizer.step()
        
        loss += L.data[0]
        batch_num_correct = np.sum(np.argmax(torch.exp(fx).data.cpu().numpy(), axis = 1) == y.data.cpu().numpy())
        num_correct += batch_num_correct
        size_training_data += len(y)
        
    print('Epoch ' + str(epoch + 1))
    print('Epoch train accuracy: ' + str(num_correct/size_training_data))
    print('Epoch loss: ' + str(loss))
    print()
    
    
test_num_correct = 0.0
size_test_data = 0.0
for batch in test_iter:
    x = Variable(bag_of_words(batch, text_field))
    y = batch.label - 1 # batch.label is 1/2, while we want 0/1

    batch_num_correct = np.sum(np.argmax(torch.exp(model.forward(x)).data.cpu().numpy(), axis = 1) == y.data.cpu().numpy())
    test_num_correct += batch_num_correct
    size_test_data += len(y)

print('Test accuracy: ' + str(test_num_correct/size_test_data))
print()