In [1]:
import data
import gensim
import neural_net
import torch
import numpy as np

In [2]:
# Unpack the six values returned by data.get_data(): two *_df objects plus
# the per-split sentence and class lists used by the cells below.
(
    train_df,
    test_df,
    train_sentence_list,
    train_class_list,
    test_sentence_list,
    test_class_list,
) = data.get_data()

In [3]:
# Load pretrained word vectors stored in the binary word2vec format.
# NOTE(review): the file name suggests 300-dimensional Google News vectors;
# bring_nn_input relies on that dimensionality -- confirm.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    '../model/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [4]:
def bring_nn_input(data_list, model=None):
    """Turn sentences into averaged word-embedding vectors.

    Each sentence is represented by the mean of the embeddings of those of
    its words that are present in the embedding model.  A sentence with no
    known words (including an empty sentence) is represented by an all-zero
    vector instead of raising or silently reusing the previous sentence's
    vector.

    Args:
        data_list: Iterable of sentences; each sentence is iterated word by
            word.  NOTE(review): presumably lists of token strings -- a raw
            string would be iterated character by character; confirm against
            ``data.get_data``.
        model: Optional embedding lookup supporting ``word in model`` and
            ``model[word]``.  Defaults to the notebook-level
            ``word2vec_model``.

    Returns:
        A ``torch.Tensor`` built from one averaged vector per sentence, in
        input order.
    """
    if model is None:
        model = word2vec_model

    # Use the model's own dimensionality when it exposes one; 300 is the
    # value the original code hard-coded.
    dim = getattr(model, 'vector_size', 300)

    vec_list = []

    for sentence in data_list:
        count = 0
        sum_vec = np.zeros(dim)

        for word in sentence:
            if word in model:
                sum_vec += model[word]
                count += 1

        # With no known words sum_vec is still all zeros, which is the
        # fallback representation; otherwise take the mean.
        vec = sum_vec / count if count else sum_vec

        vec_list.append(vec.tolist())

    return torch.Tensor(vec_list)

In [5]:
# Embed both splits with the same sentence-averaging routine (train first,
# then test).
train_vec, test_vec = (
    bring_nn_input(sentences)
    for sentences in (train_sentence_list, test_sentence_list)
)

In [6]:
# Binary cross-entropy on the model's raw outputs.
# NOTE(review): BCELoss expects probabilities in [0, 1]; NeuralNet's final
# activation is not visible from this notebook.  The recorded run (train loss
# ~38, validation loss exactly 50.0, no improvement across epochs) looks like
# saturated outputs -- confirm the network's output layer and label encoding.
loss_fn = torch.nn.BCELoss()

# Project-defined network, optimised with Adam at a fixed learning rate.
neuralNet = neural_net.NeuralNet()
optimizer = torch.optim.Adam(params=neuralNet.parameters(), lr=0.001)

In [10]:
def train_one_epoch(epoch_index, num_batches=10000, batch_size=100, report_every=1000):
    """Run one block of mini-batch updates on the notebook-level model.

    Batches are drawn by sampling row indices of ``train_vec`` at random
    with ``np.random.choice`` (which samples with replacement by default),
    so an "epoch" here is ``num_batches`` random batches rather than one
    full pass over the training set.

    Uses the notebook-level ``train_vec``, ``train_class_list``,
    ``neuralNet``, ``loss_fn`` and ``optimizer``.

    Args:
        epoch_index: Index of the current epoch.  Currently unused; kept so
            existing callers and future logging keep working.
        num_batches: Number of optimisation steps to perform.
        batch_size: Number of samples drawn for each step.
        report_every: Print the mean loss after every this many steps.

    Returns:
        The mean per-batch loss over the most recent completed reporting
        window, or ``0.`` if no window was completed.
    """
    running_loss = 0.
    last_loss = 0.

    # Loop-invariant: convert the labels to a tensor once instead of on
    # every iteration.
    all_labels = torch.Tensor(train_class_list)
    num_samples = train_vec.shape[0]

    for i in range(num_batches):
        # Draw a random mini-batch of (input, label) pairs.
        batch_idx = np.random.choice(num_samples, batch_size)
        inputs = train_vec[batch_idx]
        labels = all_labels[batch_idx]

        # Gradients accumulate by default, so clear them for every batch.
        optimizer.zero_grad()

        # Forward pass.
        outputs = neuralNet(inputs)

        # Compute the loss and its gradients.
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights.
        optimizer.step()

        # Gather data and report.
        running_loss += loss.item()
        if (i + 1) % report_every == 0:
            last_loss = running_loss / report_every  # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            running_loss = 0.

    return last_loss

In [11]:
# Run-level state for the training loop below.
epoch_number = 0

EPOCHS = 5

best_vloss = 1_000_000.

# The validation set is evaluated as a single batch and never changes, so
# build its tensors once rather than once per epoch.
vinputs = test_vec
vlabels = torch.Tensor(test_class_list)

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Put the model in training mode and run one block of updates.
    neuralNet.train(True)
    avg_loss = train_one_epoch(epoch_number)

    # Switch to evaluation mode.  This only changes layer behaviour; it does
    # NOT disable gradient tracking, hence the explicit no_grad() below.
    neuralNet.train(False)

    running_vloss = 0.0
    with torch.no_grad():
        voutputs = neuralNet(vinputs)
        vloss = loss_fn(voutputs, vlabels)
    # Accumulate a plain float so no tensor (or graph) is kept alive.
    running_vloss += vloss.item()

    avg_vloss = running_vloss  # single validation batch, so no averaging
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # TODO: TensorBoard logging and best-model checkpointing are disabled.
    # Re-enabling them requires defining `timestamp` and `writer` first:
    # if avg_vloss < best_vloss:
    #     best_vloss = avg_vloss
    #     model_path = 'model_{}_{}'.format(timestamp, epoch_number)
    #     torch.save(neuralNet.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:
  batch 1000 loss: 37.895
  batch 2000 loss: 37.588
  batch 3000 loss: 37.739
  batch 4000 loss: 37.865
  batch 5000 loss: 37.819
  batch 6000 loss: 37.944
  batch 7000 loss: 38.02
  batch 8000 loss: 37.816
  batch 9000 loss: 37.811
  batch 10000 loss: 37.753
LOSS train 37.753 valid 50.0
EPOCH 2:
  batch 1000 loss: 37.937
  batch 2000 loss: 38.377
  batch 3000 loss: 37.862
  batch 4000 loss: 37.958
  batch 5000 loss: 37.659
  batch 6000 loss: 37.558
  batch 7000 loss: 37.912
  batch 8000 loss: 37.946
  batch 9000 loss: 37.963
  batch 10000 loss: 37.986
LOSS train 37.986 valid 50.0
EPOCH 3:
  batch 1000 loss: 37.93
  batch 2000 loss: 37.792
  batch 3000 loss: 38.3
  batch 4000 loss: 37.843
  batch 5000 loss: 37.844
  batch 6000 loss: 37.926
  batch 7000 loss: 37.849
  batch 8000 loss: 37.633
  batch 9000 loss: 37.957
  batch 10000 loss: 37.896
LOSS train 37.896 valid 50.0
EPOCH 4:
  batch 1000 loss: 37.861
  batch 2000 loss: 37.731
  batch 3000 loss: 37.603
  batch 4000 loss: 3