# #1: Import Packages

In [None]:
# General 
import sys, numpy as np, pandas as pd, math, matplotlib.pyplot as plt, datetime, copy, os

# PyTorch, PyTorch Geometric
import torch, torch_geometric
from torch_geometric.data import HeteroData
from torch_geometric.nn import MetaPath2Vec

# Sklearn
import sklearn

sys.path.insert(1, '/home/ec2-user/SageMaker/repos/fredriks-thesis/python')
import helper_functions, graph_preprocessing, nn_models, hetero_models, graphSage_models, div_models

# #2: Settings

In [None]:
# Directory for model files.
# NOTE(review): not referenced by the cells visible in this notebook section.
model_file_path = "/home/ec2-user/SageMaker/repos/fredriks-thesis/notebooks/09_model_performance_script/models/"

# Run configuration:
#   dataset -- dataset size tag; formatted with "{:.0f}" into data file names
#   seed    -- value handed to torch.manual_seed
settings = dict(dataset=1e4, seed=0)

# #3: Load Dataset and Data Preprocessing

In [None]:
# Location of the pre-built heterogeneous graphs; the file is selected by the
# dataset size configured in `settings`.
filepath = '/home/ec2-user/SageMaker/s3/exploration-876679093433-ew1-initiative-pop-amlanalysis/data/fredriks-thesis/heterographs_01/'
filename = "heterograph_externalnodes_{:.0f}.pt".format(settings['dataset'])

data = torch.load(filepath+filename)

# Removing the attribute globalRiskScore (feature column 4 for 'ind', column 3
# for 'org'). Open-ended slices keep every remaining column of each node type;
# the previous code bounded the 'org' slice by the (already shortened) 'ind'
# width, which would drop trailing 'org' columns if 'org' had more features.
data['ind'].x = torch.cat((data['ind'].x[:, :4], data['ind'].x[:, 5:]), 1)
data['org'].x = torch.cat((data['org'].x[:, :3], data['org'].x[:, 4:]), 1)
# NOTE(review): attr_names is not updated to match the dropped column.
#data['ind'].attr_names.remove('globalRiskScore')
#data['org'].attr_names.remove('globalRiskScore')

torch.manual_seed(settings['seed']) # Setting torch random state seed

# Create num_features variables
for node_type in ('ind', 'org', 'ext'):
    data[node_type].num_features = data[node_type].x.shape[1]

# Reversing all edges 
data = graph_preprocessing.reverse_edges(data)
# Applying log to node feature transaction amounts and edge feature transaction amounts: 
data = graph_preprocessing.apply_log_to_txns(data)
# Normalizing node features
data = graph_preprocessing.normalize_node_features(data)
# Scaling edge_attributes to be in range [0.01,1]
data = graph_preprocessing.scaling_edge_attr(data)


# Adding dummy-features for role-edges; ones for all edges
for edge_type in (('ind', 'role', 'org'), ('org', 'rev_role', 'ind')):
    num_edges = data[edge_type].edge_index.shape[1]
    data[edge_type].edge_attr = torch.ones([num_edges, 1], dtype = torch.float32)

# Define the compute device (the graph itself is not moved to it in this cell)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# MetaPath2Vec Settings

In [None]:
# Size of each embedding vector learned by MetaPath2Vec.
embedding_dim = 16
# Batch size handed to the MetaPath2Vec loader: 2**13 = 8192.
batch_size = 2 ** 13
# Use the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# IND: MetaPath2Vec embeddings for 'ind' nodes

In [None]:
# Metapaths that start and end at 'ind' nodes; one MetaPath2Vec model is
# trained per metapath and its 'ind' embeddings are written to disk.
metapaths = [
[('ind', 'txn', 'org'),('org', 'txn', 'ind')],
[('ind', 'txn', 'ext'),('ext', 'txn', 'ind')],
[('ind', 'role', 'org'),('org', 'txn', 'ind')]
]

# Embeddings trained during this run, in metapath order.
my_embeddings_ind = []

for it, metapath in enumerate(metapaths):
    print("Iteration: {}".format(it))
    my_filename = "metapath_features_ind_{}_{:.0f}.pt".format(it+1,settings['dataset'])
    # Embeddings that already exist on disk are not retrained.
    # NOTE(review): skipped metapaths are not added to my_embeddings_ind, so
    # list positions may not line up with `metapaths` -- confirm intended.
    if os.path.isfile(filepath + my_filename):
        print("Skipping Iteration: {}".format(it))
        continue

    model = MetaPath2Vec(data.edge_index_dict, embedding_dim=embedding_dim,
                         metapath=metapath, walk_length=20, context_size=10,
                         walks_per_node=10, num_negative_samples=1,
                         sparse=True).to(device)

    # sparse=True embeddings produce sparse gradients, hence SparseAdam.
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)
    loader = model.loader(batch_size=batch_size, shuffle=True, num_workers=4)

    # len(loader) gives the batch count without building a worker-backed
    # iterator just to measure it.
    print("Number of train-batches: {}".format(len(loader)))


    ## Train ##
    start_time_total = helper_functions.stopwatch()

    max_epochs = 500
    check_progress_frequency = 1
    train_hist = pd.DataFrame( columns = ['loss'])

    for epoch in range(max_epochs):
        model.train()
        total_loss = 0
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()


        ## Tracking Progress ##
        train_hist.loc[epoch] = pd.Series({
            'loss': total_loss
         })

        # Early Stopping: stop at the first epoch whose summed loss is higher
        # than the lowest summed loss seen so far.
        # NOTE(review): the embeddings saved below are those after this
        # worse epoch, not a checkpoint of the best epoch.
        if train_hist.loc[epoch].loss > min(train_hist.loss):
            print("Early stopping at epoch {}".format(epoch))
            break

        if epoch%check_progress_frequency == 0 or epoch==max_epochs-1:
            # Read the clock once so minutes and seconds come from one instant.
            elapsed = datetime.datetime.now() - start_time_total
            tms = divmod(elapsed.days * 86400 + elapsed.seconds, 60)
            print("Epoch #: {} finished, Loss: {:.2f}, Time Elapsed: {} min {} sek".format(epoch, total_loss, tms[0], tms[1]))
    else:
        print("Reached max_epochs ({}) without early stopping".format(max_epochs))

    # Persist the embeddings however training ended; previously they were only
    # written on early stopping, so a full-length run left nothing on disk.
    ind_embedding = model('ind')
    torch.save(ind_embedding.cpu(), filepath+my_filename)

    helper_functions.stopwatch(start_time_total)
    helper_functions.sound_alert()

    # Plotting loss curve
    lw = 1
    plt.figure(figsize = (5,5))
    ax = plt.subplot(1, 1, 1)
    ax.plot(train_hist, label = 'loss', linewidth = lw, color = 'blue')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    ax.set_yscale('log')
    ax.legend()

    my_embeddings_ind.append(ind_embedding)

helper_functions.sound_alert()

# ORG: MetaPath2Vec embeddings for 'org' nodes

In [None]:
from torch_geometric.nn import MetaPath2Vec

# Metapaths that start and end at 'org' nodes; one MetaPath2Vec model is
# trained per metapath and its 'org' embeddings are written to disk.
metapaths = [
[('org', 'txn', 'ind'),('ind', 'txn', 'org')],
[('org', 'txn', 'ext'),('ext', 'txn', 'org')],
[('org', 'txn', 'ind'),('ind', 'role', 'org')]
]

# Embeddings trained during this run, in metapath order.
my_embeddings_org = []

for it, metapath in enumerate(metapaths):
    print("Iteration: {}".format(it))
    my_filename = "metapath_features_org_{}_{:.0f}.pt".format(it+1,settings['dataset'])
    # Embeddings that already exist on disk are not retrained.
    # NOTE(review): skipped metapaths are not added to my_embeddings_org, so
    # list positions may not line up with `metapaths` -- confirm intended.
    if os.path.isfile(filepath + my_filename):
        print("Skipping Iteration: {}".format(it))
        continue


    model = MetaPath2Vec(data.edge_index_dict, embedding_dim=embedding_dim,
                         metapath=metapath, walk_length=20, context_size=10,
                         walks_per_node=10, num_negative_samples=1,
                         sparse=True).to(device)

    # sparse=True embeddings produce sparse gradients, hence SparseAdam.
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)
    loader = model.loader(batch_size=batch_size, shuffle=True, num_workers=4)

    # len(loader) gives the batch count without building a worker-backed
    # iterator just to measure it.
    print("Number of train-batches: {}".format(len(loader)))


    ## Train ##
    start_time_total = helper_functions.stopwatch()

    max_epochs = 500
    check_progress_frequency = 1
    train_hist = pd.DataFrame( columns = ['loss'])

    for epoch in range(max_epochs):
        model.train()
        total_loss = 0
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()


        ## Tracking Progress ##
        train_hist.loc[epoch] = pd.Series({
            'loss': total_loss
         })

        # Early Stopping: stop at the first epoch whose summed loss is higher
        # than the lowest summed loss seen so far.
        # NOTE(review): the embeddings saved below are those after this
        # worse epoch, not a checkpoint of the best epoch.
        if train_hist.loc[epoch].loss > min(train_hist.loss):
            print("Early stopping at epoch {}".format(epoch))
            break

        if epoch%check_progress_frequency == 0 or epoch==max_epochs-1:
            # Read the clock once so minutes and seconds come from one instant.
            elapsed = datetime.datetime.now() - start_time_total
            tms = divmod(elapsed.days * 86400 + elapsed.seconds, 60)
            print("Epoch #: {} finished, Loss: {:.2f}, Time Elapsed: {} min {} sek".format(epoch, total_loss, tms[0], tms[1]))
    else:
        print("Reached max_epochs ({}) without early stopping".format(max_epochs))

    # Persist the embeddings however training ended; previously they were only
    # written on early stopping, so a full-length run left nothing on disk.
    org_embedding = model('org')
    torch.save(org_embedding.cpu(), filepath+my_filename)

    helper_functions.stopwatch(start_time_total)
    helper_functions.sound_alert()

    # Plotting loss curve
    lw = 1
    plt.figure(figsize = (5,5))
    ax = plt.subplot(1, 1, 1)
    ax.plot(train_hist, label = 'loss', linewidth = lw, color = 'blue')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    ax.set_yscale('log')
    ax.legend()

    my_embeddings_org.append(org_embedding)

helper_functions.sound_alert()