In [1]:
import torch
import torch.nn as nn

from torchtext.vocab import GloVe
from torchtext.data.utils import get_tokenizer

from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

This notebook is a first attempt at a MULTIVARIATE LSTM model that accepts multiple variables as input and returns a precise prediction for every variable.

Next steps: 
1. Make the model return only one variable that represents the type of log we are expecting.

In [None]:
# Log fields used as model variables; len(FEATURE) is passed to the model constructor.
FEATURE = ["userAgent"]

In [2]:
# change in the loading of the data => loading at realtime
import pathlib
class UnificatedDataset(Dataset):
    """Map-style dataset yielding the token embeddings of one text line per item.

    The file ``path / filename`` is read once at construction time. Each item is
    produced lazily: the line is tokenized with ``tokenizer``, the tokens are
    looked up with ``embeddings_vector.get_vecs_by_token`` and the result is
    cached so every line is embedded at most once.

    Args:
        path: Directory containing the data file.
        filename: Name of the data file inside ``path``.
        tokenizer: Callable turning one line of text into a list of tokens.
        embeddings_vector: Object exposing
            ``get_vecs_by_token(tokens, lower_case_backup=True)``.

    Raises:
        FileNotFoundError: If ``path / filename`` does not exist.
    """

    def __init__(self, path, filename, tokenizer, embeddings_vector):
        self.path = pathlib.Path(path) / filename

        # Fail loudly: a silently half-initialised dataset only breaks later,
        # with a far less helpful AttributeError.
        if not self.path.exists():
            raise FileNotFoundError(
                f"Cannot load dataset, path does not exist: {self.path}"
            )

        # Open the resolved file (not the directory) and release the handle
        # as soon as the lines are in memory.
        with open(self.path) as infile:
            # Whitespace-only lines would tokenize to an empty list, which
            # cannot be turned into an embedding tensor, so they are skipped.
            self.lines = [line for line in infile if line.strip()]

        self.tokenizer = tokenizer
        self.embeddings_vector = embeddings_vector

        # One slot per line, filled on first access by __getitem__.
        self.cache = [None] * len(self.lines)

    def __len__(self):
        """Return the number of non-blank lines loaded from the file."""
        return len(self.lines)

    def __getitem__(self, idx):
        """Return the embeddings of line ``idx``, computing them on first access."""
        embeddings = self.cache[idx]
        if embeddings is None:
            tokens = self.tokenizer(self.lines[idx])
            # Retrieve embeddings for tokens
            embeddings = self.embeddings_vector.get_vecs_by_token(
                tokens, lower_case_backup=True
            )
            self.cache[idx] = embeddings
        return embeddings

In [None]:
# Pretrained GloVe '42B' word vectors, 300 dimensions
global_vectors = GloVe(dim=300, name='42B')
# Tokenizer applied to every raw line of text
tokenizer = get_tokenizer('basic_english')

# The train/test split is done at file level rather than on the loaded data
input = "../../data/prepared/"  # NOTE(review): this name shadows the builtin `input`
train_file = "logfile_train.ndjson"
test_file = "logfile_test.ndjson"

unificated_test = UnificatedDataset(
    path=input,
    filename=test_file,                 # directory and file to read
    tokenizer=tokenizer,                # tokenizer setting
    embeddings_vector=global_vectors,   # embeddings
)
unificated_train = UnificatedDataset(
    path=input,
    filename=train_file,
    tokenizer=tokenizer,
    embeddings_vector=global_vectors,
)

In [None]:
# Wrap both datasets in DataLoaders, keeping every DataLoader option at its default
test_loader = DataLoader(dataset=unificated_test)
train_loader = DataLoader(dataset=unificated_train)

In [None]:
# Run on the first CUDA device when torch can see one, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

In [None]:
# check on data loaded: print the shapes of the first batch of each loader
# NOTE(review): UnificatedDataset.__getitem__ returns a single embeddings object,
# so batch[1] is presumably not a target (and may not exist) — confirm how
# targets are meant to be produced.
for label, loader in (("Train Loader: ", train_loader), ("Test Loader: ", test_loader)):
    for batch in loader:
        x_batch = batch[0].to(device)
        y_batch = batch[1].to(device)
        print(label, x_batch.shape, y_batch.shape)
        break

In [None]:
# cost: cross-entropy

class LSTMMultiVariate(nn.Module):
    """Linear projection -> stacked LSTM -> linear read-out, applied per time step.

    Input is batch-first, ``(batch, seq_len, l_in_feature)``, and the output
    has the same shape.

    Args:
        input_size: Width of the LSTM input, i.e. the output width of the
            first linear layer.
        hidden_size: Number of hidden units in every LSTM layer.
        num_layers: Number of stacked LSTM layers.
        l_in_feature: Number of features per time step of the input tensor;
            it is also the width of the output. Defaults to ``input_size``.
    """

    def __init__(self, input_size, hidden_size, num_layers, l_in_feature=None):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Fall back to input_size so the three-argument constructor call works.
        self.l_in_feature = input_size if l_in_feature is None else l_in_feature

        # Per-step projection from the raw feature width to the LSTM input width
        self.l0 = nn.Linear(self.l_in_feature, self.input_size)
        # batch_first=True: forward() treats dim 0 as the batch, so the LSTM
        # must agree, otherwise the initial state shapes do not match the input.
        self.l1 = nn.LSTM(
            self.input_size, self.hidden_size, self.num_layers, batch_first=True
        )
        # Per-step read-out back to the feature width
        self.l2 = nn.Linear(self.hidden_size, self.l_in_feature)

    def forward(self, x):
        """Map ``(batch, seq_len, l_in_feature)`` to a tensor of the same shape."""
        # run the first layer
        x = self.l0(x)

        # run the LSTM layer (l1) from a zero initial state; the state is
        # created on the input's own device/dtype rather than a global device
        batch_size = x.size(0)
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        x, _ = self.l1(x, (h0, c0))

        # Run the last layer => with or without the ReLU.
        # Starting from "with": the ReLU clamps every output to be >= 0.
        return torch.relu(self.l2(x))

# Model hyper-parameters
HIDDEN_SIZE = 128
NUM_LAYERS = 4

# The constructor takes four values; the per-step feature width was missing.
# NOTE(review): the datasets yield GloVe embeddings, so the embedding width is
# used as l_in_feature here — confirm this is the intended input width, and
# whether an LSTM input width of len(FEATURE) is really wanted.
model = LSTMMultiVariate(
    input_size=len(FEATURE),
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    l_in_feature=global_vectors.dim,
)
model.to(device)

model

In [None]:
# Optimisation settings
num_epochs = 350
learning_rate = 1e-3

# Mean-squared-error objective, optimised with Adam over all model parameters
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Number of batches between two running-loss reports
WHEN_TO_SEE_INFO = 99


def train_one_epoch():
    """Run one optimisation pass over ``train_loader``.

    Uses the notebook-level globals ``model``, ``train_loader``, ``device``,
    ``loss_function``, ``optimizer`` and ``epoch`` (the zero-based loop
    variable of the surrounding training loop). After every
    ``WHEN_TO_SEE_INFO`` batches the mean loss of that window is printed and
    the accumulator is reset.
    """
    model.train(True)
    print(f"Epoch: {epoch+1}")
    running_loss = 0.0

    # Wrap the loader itself (not the enumerate object) so tqdm can see its length
    for batch_index, batch in enumerate(tqdm(train_loader)):
        x_batch, y_batch = batch[0].to(device), batch[1].to(device)

        output = model(x_batch)
        loss = loss_function(output, y_batch)
        running_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report only once a full window has been accumulated; testing
        # batch_index itself would fire on the very first batch and divide a
        # single loss by the whole window size.
        if (batch_index + 1) % WHEN_TO_SEE_INFO == 0:
            avg_loss_across_batches = running_loss / WHEN_TO_SEE_INFO
            print("Batch {0}, Loss {1:.10f}".format(
                batch_index + 1,
                avg_loss_across_batches
            ))
            running_loss = 0.0
def validate_one_epoch():
    """Evaluate the model on ``test_loader`` and print the mean batch loss.

    Uses the notebook-level globals ``model``, ``test_loader``, ``device`` and
    ``loss_function``. No gradients are computed. If the loader yields no
    batches the reported loss is ``nan``.
    """
    model.train(False)
    running_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        # Iterate over the batches themselves; the body must not rely on a
        # `batch` variable left over from another cell.
        for batch in test_loader:
            x_batch, y_batch = batch[0].to(device), batch[1].to(device)

            output = model(x_batch)
            loss = loss_function(output, y_batch)
            running_loss += loss.item()
            num_batches += 1

    # Average once, after the loop, and stay defined for an empty loader
    if num_batches:
        avg_loss_across_batches = running_loss / num_batches
    else:
        avg_loss_across_batches = float("nan")

    print("Val Loss: {0:.10f}".format(avg_loss_across_batches))
    print('***************************************************')

# Main loop: one training pass followed by one validation pass per epoch.
# The loop variable must stay named `epoch`: train_one_epoch() reads it as a
# global to print the current epoch number.
for epoch in range(num_epochs):
	train_one_epoch()
	validate_one_epoch()