In [1]:
import data
import gensim
import simple_net
import torch
import numpy as np
import pandas as pd
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the training and test tables from the project-local `data` module; the
# cells below read their `sentence` and `label` columns.
train_df, test_df = data.get_data()

In [3]:
# Training split: sentence strings and their class labels as arrays.
train_sentences, train_labels = train_df.sentence.values, train_df.label.values

# Test split, unpacked the same way.
test_sentences, test_labels = test_df.sentence.values, test_df.label.values

In [4]:
# Load the pretrained word2vec embeddings from the binary file on disk. The
# lookup below relies on `word in word2vec_model` / `word2vec_model[word]`.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    '../model/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [5]:
def bring_nn_input(sentences, model=None, dim=300):
    """Turn each sentence into the average of its known word vectors.

    Each sentence is split on whitespace; every token found in ``model`` has
    its vector accumulated, and the sum is divided by the number of tokens
    that were found.

    Args:
        sentences: iterable of sentence strings.
        model: mapping-like embedding lookup supporting ``word in model`` and
            ``model[word]``. Defaults to the notebook-level ``word2vec_model``.
        dim: length of each word vector (300 by default, matching the
            original hard-coded size).

    Returns:
        A float64 array of shape ``(len(sentences), dim)``. A sentence with no
        in-vocabulary token yields an all-zero row, so rows always stay aligned
        with the labels. Empty input yields shape ``(0, dim)``.
    """
    if model is None:
        # Fall back to the embeddings loaded earlier in the notebook.
        model = word2vec_model

    # Materialise so generators work and the row count is known up front.
    sentences = list(sentences)
    vectors = np.zeros((len(sentences), dim))

    for row, sentence in zip(vectors, sentences):
        found = 0
        for word in sentence.split():
            if word in model:
                row += model[word]  # `row` is a view, so this fills `vectors` in place
                found += 1

        # Previously a sentence with no known words either raised
        # UnboundLocalError (first sentence) or silently reused the previous
        # sentence's vector; now its row simply stays at zero.
        if found:
            row /= found

    return vectors

In [6]:
# Embed both splits as averaged word vectors, one row per sentence.
train_vec = bring_nn_input(train_sentences)
test_vec = bring_nn_input(test_sentences)

In [7]:
# Layer sizes handed to Simple_Net in the next cell.
# NOTE(review): `input` shadows the Python builtin of the same name for the
# rest of the notebook; renaming requires updating the Simple_Net call too.
input = 300  # matches the 300-element sentence vectors built by bring_nn_input
hidden=100
output = 2  # presumably one score per class — confirm against the label set

In [8]:
# Build the classifier from the project-local `simple_net` module using the
# input / hidden / output sizes defined above.
sn_model = simple_net.Simple_Net(input, hidden, output)

In [9]:
# Hold out part of the labelled training vectors for monitoring during
# training; the fixed random_state keeps the shuffle reproducible.
# NOTE(review): features_test / labels_test are carved from train_vec, so they
# act as a validation split — the real test set is test_vec / test_labels.
features_train, features_test, labels_train, labels_test = train_test_split(train_vec, train_labels, shuffle=True, random_state=34)

In [10]:
# Training setup: Adam over all of sn_model's parameters (learning rate 0.01)
# and cross-entropy loss between the model's class scores and the integer labels.
optimizer = torch.optim.Adam(sn_model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [11]:
# PyTorch TensorBoard support
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter

# Each run logs to its own directory, suffixed with the wall-clock start time.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(
    'runs/usairline{}'.format(timestamp)
)

In [17]:
def sn_train(epochs):
    """Train ``sn_model`` full-batch and evaluate on the held-out split every epoch.

    Uses the notebook-level ``sn_model``, ``optimizer``, ``criterion`` and
    ``writer``, plus the arrays ``features_train`` / ``labels_train`` and
    ``features_test`` / ``labels_test``. Each epoch performs one optimizer
    step on the whole training set, then scores the held-out split without
    gradients. Loss and accuracy for both splits are printed and written to
    TensorBoard under step ``epoch + 1``.

    Args:
        epochs: number of full-batch optimisation steps to run.
    """
    # torch.autograd.Variable is a deprecated no-op wrapper; plain tensors suffice.
    x_train = torch.from_numpy(features_train).float()
    y_train = torch.from_numpy(labels_train).long()

    x_test = torch.from_numpy(features_test).float()
    y_test = torch.from_numpy(labels_test).long()

    for epoch in range(epochs):
        sn_model.train()

        optimizer.zero_grad()
        y_pred_train = sn_model(x_train)
        loss_train = criterion(y_pred_train, y_train)

        # Convert metrics to Python numbers so prints and TensorBoard receive
        # plain scalars rather than 0-dim tensors. These are the pre-step
        # (start-of-epoch) training metrics.
        train_loss = loss_train.item()
        train_hits = y_pred_train.argmax(dim=1).eq(y_train).sum().item()
        train_acc = 100.0 * train_hits / len(x_train)
        print ("train loss: ", train_loss)
        print ("train acc:(%) ", train_acc)

        loss_train.backward()
        optimizer.step()

        # Score the held-out split with the freshly updated weights.
        sn_model.eval()
        with torch.no_grad():
            y_pred_test = sn_model(x_test)
            test_loss = criterion(y_pred_test, y_test).item()
            test_hits = y_pred_test.argmax(dim=1).eq(y_test).sum().item()
            test_acc = 100.0 * test_hits / len(x_test)

            print ("test loss: ", test_loss)
            print ("test acc (%): ", test_acc)

        step = epoch + 1  # TensorBoard steps are logged 1-based
        writer.add_scalars('SimpleNet Training vs. Testing Loss',
                    { 'Train' : train_loss, 'Test' : test_loss },
                    step)

        writer.add_scalars('SimpleNet Training vs. Testing Accuracy',
                    { 'Train' : train_acc, 'Test' : test_acc },
                    step)

In [14]:
def sn_test(epochs):
    """Print loss and accuracy of ``sn_model`` on the held-out split.

    Reads the notebook-level ``sn_model``, ``criterion``, ``features_test``
    and ``labels_test``. The same evaluation is repeated ``epochs`` times with
    no weight updates in between, so a single pass (``sn_test(1)``) is
    normally all that is needed.

    Args:
        epochs: number of times to run and print the evaluation.
    """
    sn_model.eval()
    # torch.autograd.Variable is a deprecated no-op wrapper; plain tensors suffice.
    x_test = torch.from_numpy(features_test).float()
    y_test = torch.from_numpy(labels_test).long()
    for epoch in range(epochs):
        with torch.no_grad():
            y_pred = sn_model(x_test)
            loss = criterion(y_pred, y_test)
            print ("epoch #",epoch)
            print ("loss: ", loss.item())
            # Report accuracy as a Python float instead of a 0-dim tensor.
            hits = y_pred.argmax(dim=1).eq(y_test).sum().item()
            print ("acc (%): ", 100.0 * hits / len(x_test))

In [15]:
def sn_real_test(epochs):
    """Print loss and accuracy of ``sn_model`` on the separate test set.

    Reads the notebook-level ``sn_model``, ``criterion``, ``test_vec`` and
    ``test_labels``. The same evaluation is repeated ``epochs`` times with no
    weight updates in between, so a single pass (``sn_real_test(1)``) is
    normally all that is needed.

    Args:
        epochs: number of times to run and print the evaluation.
    """
    sn_model.eval()
    # torch.autograd.Variable is a deprecated no-op wrapper; plain tensors suffice.
    x_test = torch.from_numpy(test_vec).float()
    y_test = torch.from_numpy(test_labels).long()
    for epoch in range(epochs):
        with torch.no_grad():
            y_pred = sn_model(x_test)
            loss = criterion(y_pred, y_test)
            print ("epoch #",epoch)
            print ("loss: ", loss.item())
            # Report accuracy as a Python float instead of a 0-dim tensor.
            hits = y_pred.argmax(dim=1).eq(y_test).sum().item()
            print ("acc (%): ", 100.0 * hits / len(x_test))

In [18]:
# Run 100 full-batch training epochs; per-epoch metrics are printed below and
# logged to TensorBoard.
sn_train(100)

0  train loss:  0.6982799768447876
1  train loss:  0.6631367802619934
2  train loss:  0.6772649884223938
3  train loss:  0.6467200517654419
4  train loss:  0.652622640132904
5  train loss:  0.6458669304847717
6  train loss:  0.6286985874176025
7  train loss:  0.603245735168457
8  train loss:  0.5661724209785461
9  train loss:  0.5220828652381897
10  train loss:  0.4905472695827484
11  train loss:  0.49301955103874207
12  train loss:  0.5840985178947449
13  train loss:  0.4555674195289612
14  train loss:  0.47621071338653564
15  train loss:  0.416436105966568
16  train loss:  0.4487517178058624
17  train loss:  0.45910707116127014
18  train loss:  0.42673230171203613
19  train loss:  0.4062543213367462
20  train loss:  0.40585413575172424
21  train loss:  0.38097551465034485
22  train loss:  0.370319664478302
23  train loss:  0.37835225462913513
24  train loss:  0.3511509597301483
25  train loss:  0.35929521918296814
26  train loss:  0.33906492590904236
27  train loss:  0.33496153354644

In [None]:
# Single evaluation pass on the held-out split carved from the training data.
sn_test(1)

epoch # 0
loss:  0.5792011618614197
acc (%):  tensor(82.5706)


In [None]:
# Single evaluation pass on the separate test set (test_vec / test_labels).
sn_real_test(1)

epoch # 0
loss:  0.7074867486953735
acc (%):  tensor(80.)
