# #1: Import Packages

In [None]:
# General 
import sys, numpy as np, pandas as pd, math, matplotlib.pyplot as plt, datetime, copy, os

# PyTorch, PyTorch Geometric
import torch, torch_geometric
from torch_geometric.data import HeteroData

# Sklearn
import sklearn

sys.path.insert(1, '/home/ec2-user/SageMaker/repos/fredriks-thesis/python')
import helper_functions, graph_preprocessing, nn_models, hetero_models, graphSage_models, div_models

# #2: Settings

In [None]:
# Location of this notebook's `models/` directory.
model_file_path = "/home/ec2-user/SageMaker/repos/fredriks-thesis/notebooks/09_model_performance_script/models/"

# Run configuration:
#   dataset -- numeric dataset identifier; rendered with "{:.0f}" when building data file names
#   seed    -- value handed to torch.manual_seed
settings = dict(dataset=1e4, seed=0)

# #3: Load Dataset and Data Preprocessing

In [None]:
# Directory and file name of the serialized heterogeneous graph for the
# dataset selected in `settings`.
filepath = '/home/ec2-user/SageMaker/s3/exploration-876679093433-ew1-initiative-pop-amlanalysis/data/fredriks-thesis/heterographs_01/'
filename = "heterograph_externalnodes_{:.0f}.pt".format(settings['dataset'])

graph_file = filepath + filename
data = torch.load(graph_file)

# Seed torch's random number generator.
torch.manual_seed(settings['seed'])

# Store the feature dimension (columns of x) on every node type.
for node_type in ('ind', 'org', 'ext'):
    data[node_type].num_features = data[node_type].x.shape[1]

# Preprocessing steps from graph_preprocessing, applied in this order:
#   1. reverse all edges
#   2. apply log to node-feature and edge-feature transaction amounts
#   3. normalize node features
#   4. scale edge attributes to the range [0.01, 1]
preprocessing_steps = (
    graph_preprocessing.reverse_edges,
    graph_preprocessing.apply_log_to_txns,
    graph_preprocessing.normalize_node_features,
    graph_preprocessing.scaling_edge_attr,
)
for preprocessing_step in preprocessing_steps:
    data = preprocessing_step(data)

# Role edges get a dummy edge feature: a single column of ones, one row per edge.
for edge_type in (('ind', 'role', 'org'), ('org', 'rev_role', 'ind')):
    num_role_edges = data[edge_type].edge_index.shape[1]
    data[edge_type].edge_attr = torch.ones(num_role_edges, 1, dtype=torch.float32)

# Select the GPU when one is available. Only the device is defined here;
# moving `data` onto it is currently disabled.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Create homogeneous graphs from input

In [None]:
# Number of nodes per node type (rows of each feature matrix).
num_ind = data['ind'].x.size(0)
num_org = data['org'].x.size(0)
num_ext = data['ext'].x.size(0)

# One homogeneous graph per node type, built from the same-type 'txn'
# edges only. Node features are placeholders: a vector of ones with one
# entry per node.
txn_edges_ind = data['ind', 'txn', 'ind'].edge_index
txn_edges_org = data['org', 'txn', 'org'].edge_index

data_homo_ind = torch_geometric.data.Data(
    x=torch.ones(num_ind),
    edge_index=txn_edges_ind,
)
data_homo_org = torch_geometric.data.Data(
    x=torch.ones(num_org),
    edge_index=txn_edges_org,
)

# Node2Vec with PyG

In [None]:
# ind
# Train Node2Vec embeddings on the homogeneous ind-ind transaction graph and
# keep the resulting embedding table in `embeddings_ind`.

embedding_dim = 16
batch_size = 2 ** 15

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# sparse=True makes the embedding gradients sparse, which is why the
# optimizer below is SparseAdam.
model = torch_geometric.nn.Node2Vec(
    data_homo_ind.edge_index,
    embedding_dim=embedding_dim,
    walk_length=20,
    context_size=10,
    walks_per_node=10,
    num_negative_samples=1,
    p=1,
    q=1,
    sparse=True,
    num_nodes=num_ind,
).to(device)

optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)
loader = model.loader(batch_size=batch_size, shuffle=True, num_workers=4)

# len(loader) reports the number of batches directly; building an iterator
# first would start the worker processes only to count them.
print("Number of train-batches: {}".format(len(loader)))

## Train ##
start_time_total = helper_functions.stopwatch()

max_epochs = 200
check_progress_frequency = 1
epoch_losses = []  # summed batch loss of every finished epoch

for epoch in range(max_epochs):
    model.train()
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    ## Tracking Progress ##
    epoch_losses.append(total_loss)

    # Early stopping: stop at the first epoch whose loss is above the best
    # loss seen so far (no patience).
    # NOTE(review): the embeddings extracted below are the weights after this
    # last epoch, not those of the best epoch -- confirm this is intended.
    if total_loss > min(epoch_losses):
        print("Early stopping at epoch {}".format(epoch))
        break

    if epoch % check_progress_frequency == 0 or epoch == max_epochs - 1:
        # One timestamp, so minutes and seconds come from the same instant.
        elapsed = datetime.datetime.now() - start_time_total
        tms = divmod(int(elapsed.total_seconds()), 60)
        print("Epoch #: {} finished, Loss: {:.2f}, Time Elapsed: {} min {} sek".format(epoch, total_loss, tms[0], tms[1]))

# Loss history as a DataFrame indexed by epoch, built once after training.
train_hist = pd.DataFrame({'loss': epoch_losses})

helper_functions.stopwatch(start_time_total)
helper_functions.sound_alert()

# Plotting loss curve
lw = 1
plt.figure(figsize=(5, 5))
ax = plt.subplot(1, 1, 1)
ax.plot(train_hist, label='loss', linewidth=lw, color='blue')
plt.xlabel('Iteration')
plt.ylabel('Loss')
ax.set_yscale('log')
ax.legend()

# As before, the first model parameter is taken to be the embedding table;
# detach it from the graph and move it to the CPU.
embeddings_ind = next(model.parameters()).detach().cpu()

helper_functions.sound_alert()

In [None]:
# ORG
# Train Node2Vec embeddings on the homogeneous org-org transaction graph and
# keep the resulting embedding table in `embeddings_org`.
# `embedding_dim` is the value defined in the ind cell above.
batch_size = 2 ** 10

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# sparse=True makes the embedding gradients sparse, which is why the
# optimizer below is SparseAdam.
model = torch_geometric.nn.Node2Vec(
    data_homo_org.edge_index,
    embedding_dim=embedding_dim,
    walk_length=20,
    context_size=10,
    walks_per_node=10,
    num_negative_samples=1,
    p=1,
    q=1,
    sparse=True,
    num_nodes=num_org,
).to(device)

optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)
loader = model.loader(batch_size=batch_size, shuffle=True, num_workers=4)

# len(loader) reports the number of batches directly; building an iterator
# first would start the worker processes only to count them.
print("Number of train-batches: {}".format(len(loader)))

## Train ##
start_time_total = helper_functions.stopwatch()

max_epochs = 200
check_progress_frequency = 1
epoch_losses = []  # summed batch loss of every finished epoch

for epoch in range(max_epochs):
    model.train()
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    ## Tracking Progress ##
    epoch_losses.append(total_loss)

    # Early stopping: stop at the first epoch whose loss is above the best
    # loss seen so far (no patience).
    # NOTE(review): the embeddings extracted below are the weights after this
    # last epoch, not those of the best epoch -- confirm this is intended.
    if total_loss > min(epoch_losses):
        print("Early stopping at epoch {}".format(epoch))
        break

    if epoch % check_progress_frequency == 0 or epoch == max_epochs - 1:
        # One timestamp, so minutes and seconds come from the same instant.
        elapsed = datetime.datetime.now() - start_time_total
        tms = divmod(int(elapsed.total_seconds()), 60)
        print("Epoch #: {} finished, Loss: {:.2f}, Time Elapsed: {} min {} sek".format(epoch, total_loss, tms[0], tms[1]))

# Loss history as a DataFrame indexed by epoch, built once after training.
train_hist = pd.DataFrame({'loss': epoch_losses})

helper_functions.stopwatch(start_time_total)
helper_functions.sound_alert()

# Plotting loss curve
lw = 1
plt.figure(figsize=(5, 5))
ax = plt.subplot(1, 1, 1)
ax.plot(train_hist, label='loss', linewidth=lw, color='blue')
plt.xlabel('Iteration')
plt.ylabel('Loss')
ax.set_yscale('log')
ax.legend()

# As before, the first model parameter is taken to be the embedding table;
# detach it from the graph and move it to the CPU.
embeddings_org = next(model.parameters()).detach().cpu()

helper_functions.sound_alert()

# Writing the learned embeddings to file

In [None]:
# Save the ind embeddings into the same directory the graph was loaded from;
# the file name carries the dataset identifier from `settings`.
my_filename_ind = "deepwalk_features_ind_{:.0f}.pt".format(settings['dataset'])
target_path_ind = filepath + my_filename_ind
torch.save(embeddings_ind, target_path_ind)

# Same for the org embeddings.
my_filename_org = "deepwalk_features_org_{:.0f}.pt".format(settings['dataset'])
target_path_org = filepath + my_filename_org
torch.save(embeddings_org, target_path_org)

# Signal that the export has finished.
helper_functions.sound_alert()