# Code to generate events from a pre-trained LogNormMix-Net model

In [1]:
# Notebook setup: IPython autoreload, imports, and global plotting / tensor defaults.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell executes, so local edits
# to library code (e.g. `dpp`) take effect without restarting the kernel.
%autoreload 2
import dpp
import numpy as np
import torch
from copy import deepcopy
from pylab import rcParams
# Default matplotlib figure size as (width, height) in inches.
rcParams['figure.figsize'] = 20, 10
# Make newly created tensors default to CUDA float tensors; this requires a
# CUDA-capable GPU to be available when the notebook runs.
torch.set_default_tensor_type(torch.cuda.FloatTensor)
# NOTE(review): deepcopy, plt, cProfile and math are not used by the cells
# visible in this notebook — confirm before removing.
import matplotlib.pyplot as plt
import cProfile
import pandas as pd
import math

In [2]:
# Display the dataset names that can be used as `dataset_name` below.
dpp.data.list_datasets()

['enron_email_dataset', 'eu_email_dataset']

In [3]:
# Config
# Seed the NumPy and PyTorch global RNGs; the same seed is also passed to the
# train/val/test split further down.
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)
dataset_name = 'enron_email_dataset'  # run dpp.data.list_datasets() to see the list of available datasets

# Model config
# NOTE(review): these values presumably have to match the ones the saved
# checkpoint was trained with, otherwise load_state_dict would fail — confirm.
## Marks
use_src_marks = True              # Use source marks
src_mark_embedding_size = 24          # Size of the src mark embedding (used as RNN input)
use_dst_marks = True                  # Use destination marks
dst_mark_embedding_size = 24          # Size of the dst mark embedding (used as RNN input)
shared_mark_embedding = False          # Should the source and destination marks share an embedding layer (note, embedding sizes must be the same, and have the same range)

context_size = 64                # Size of the RNN hidden vector
num_mix_components = 30           # Number of components for a mixture model
rnn_type = "LSTM"                  # What RNN to use as an encoder {"RNN", "GRU", "LSTM"}
meta_embedding_size = 16           # presumably the size of the meta embedding — TODO confirm against LogNormMixNet
num_meta_classes = 3               # presumably the number of distinct meta values — TODO confirm
meta_type = 'basic'                # forwarded as LogNormMixNet(meta_type=...); the valid options are not visible here

# Training config
batch_size = 50       # Number of sequences in a batch

In [4]:
# Load the dataset and split it into train / validation / test parts.
dataset = dpp.data.load_dataset(dataset_name)
d_train, d_val, d_test = dataset.train_val_test_split(seed=seed)

# One dataloader per split, created in train -> val -> test order.
# Only the training loader shuffles its sequences.
dl_train, dl_val, dl_test = (
    split.get_dataloader(batch_size=batch_size, shuffle=do_shuffle)
    for split, do_shuffle in ((d_train, True), (d_val, False), (d_test, False))
)

train_end: 86
val_end: 115


In [5]:
# Build the model
print('Building model...')
# Inter-time statistics are taken from the training split and handed to the
# model constructor.
mean_log_inter_time, std_log_inter_time = d_train.get_inter_time_statistics()

# Collect the constructor arguments first so the full configuration can be
# inspected in one place before the model is instantiated.
model_kwargs = dict(
    use_src_marks=use_src_marks,
    use_dst_marks=use_dst_marks,
    num_src_marks=d_train.num_src_marks,
    num_dst_marks=d_train.num_dst_marks,
    num_meta_classes=num_meta_classes,
    meta_type=meta_type,
    mean_log_inter_time=mean_log_inter_time,
    std_log_inter_time=std_log_inter_time,
    context_size=context_size,
    src_mark_embedding_size=src_mark_embedding_size,
    dst_mark_embedding_size=dst_mark_embedding_size,
    shared_mark_embedding=shared_mark_embedding,
    rnn_type=rnn_type,
    num_mix_components=num_mix_components,
    meta_embedding_size=meta_embedding_size,
)
model = dpp.models.LogNormMixNet(**model_kwargs)

Building model...


In [6]:
## Restore the saved model weights
# Read the checkpoint from disk, then copy its parameters into `model`.
# The load_state_dict result is left as the last expression so the cell still
# displays the key-matching report.
saved_weights = torch.load('./models/enron-event-predict-model')
model.load_state_dict(saved_weights)

<All keys matched successfully>

In [7]:
# Times are in hours: 86 weeks = 86 * 7 * 24 = 14448 hours.
sampling_horizon_hours = 86 * 7 * 24
sampled_batch, t_end = model.sample(0, t_end=sampling_horizon_hours, batch_size=1)



1000 tensor(1532.3123)
2000 tensor(3345.5059)
3000 tensor(5398.7563)
4000 tensor(7416.7583)
5000 tensor(9253.)
6000 tensor(12681.4941)
7000 tensor(13927.3926)


In [8]:
# Turn the first sampled sequence into a table with one row per event and
# write it to disk.
def _first_sequence_as_numpy(field):
    """Return sequence 0 of `sampled_batch[field]` as a host-side NumPy array."""
    return sampled_batch[field][0].detach().cpu().numpy()


# Rows are built as tuples (one per event) rather than passing the arrays as
# columns, so pandas infers the column dtypes from the row values.
event_rows = list(zip(*(
    _first_sequence_as_numpy(field)
    for field in ('inter_times', 'src_marks', 'dst_marks', 'meta')
)))
generated = pd.DataFrame(event_rows, columns=['deltas', 'from', 'to', 'meta'])
# 'ts' is the cumulative sum of the inter-event times in 'deltas'.
generated['ts'] = generated['deltas'].cumsum()
generated.to_csv('Enron_generated.csv')

In [9]:
# Preview the first rows of the generated events table.
generated.head()

Unnamed: 0,deltas,from,to,meta,ts
0,0.757284,13.0,2.0,0,0.757284
1,0.093819,11.0,124.0,0,0.851103
2,0.436657,3.0,104.0,0,1.28776
3,0.159144,0.0,3.0,0,1.446904
4,0.658236,40.0,6.0,0,2.10514
