In [31]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, average_precision_score, roc_auc_score

In [2]:
# Location of the DGraphFin .npz archive that the loading cell passes to np.load;
# the path is relative, so it resolves against the notebook's working directory.
DATA_PATH = "../data/DGraphFin/raw/dgraphfin.npz"

In [7]:
# Read the node features, node labels and edge arrays out of the archive,
# then record the largest edge timestamp found in the data.
with np.load(DATA_PATH) as loader:
    x_np, y_np = loader['x'], loader['y']
    edge_index_np, edge_timestamp_np = loader['edge_index'], loader['edge_timestamp']
    edge_type_np = loader['edge_type']
max_timestamp = edge_timestamp_np.max()

In [4]:
# Display the loaded edge array; the repr below shows one pair of node ids per row.
edge_index_np

array([[ 476699, 2915516],
       [ 347800, 1271242],
       [ 154317, 2104635],
       ...,
       [1894383, 1147595],
       [1895741, 1314434],
       [1206795, 2072636]])

In [32]:
def get_data_snapshot(start_time_index, end_time_index, x, y, edge_index, edge_timestamp):
    """Build a table of the nodes that take part in edges inside a time window.

    An edge belongs to the window when its timestamp satisfies
    ``start_time_index <= timestamp < end_time_index`` (the end is exclusive).
    Every distinct node id found in either column of the selected edges is
    looked up in ``x`` and ``y``; only nodes whose label is 0 or 1 are kept.

    Parameters
    ----------
    start_time_index, end_time_index
        Inclusive lower bound and exclusive upper bound of the window.
    x
        Array indexed by node id; ``x[node_id]`` is stored per row in column ``'x'``.
    y
        Array indexed by node id holding the node labels.
    edge_index
        Edge array indexed as ``edge_index[mask, :]``, i.e. one edge per row,
        aligned with ``edge_timestamp`` along the first axis.
    edge_timestamp
        One timestamp per edge.

    Returns
    -------
    pandas.DataFrame
        Columns ``'node_id'``, ``'x'`` and ``'y'``, ordered by ascending node id.
        The index is not reset after filtering, so it can contain gaps.
    """
    # Boolean selector over edges: half-open interval [start, end).
    in_window = (edge_timestamp >= start_time_index) & (edge_timestamp < end_time_index)

    # np.unique flattens the selected rows, so endpoints from both columns are
    # merged into one sorted array of distinct node ids.
    node_ids = np.unique(edge_index[in_window, :])

    snapshot = pd.DataFrame({'node_id': node_ids}).assign(
        x=list(x[node_ids]),  # one feature row per cell
        y=y[node_ids],
    )

    # Keep only the rows labelled 0 or 1; every other label value is dropped.
    has_binary_label = snapshot['y'].isin([0, 1])
    return snapshot[has_binary_label]

In [34]:
# Single-window split: train on the nodes active in window [1, 2) and
# evaluate on the nodes active in the following window [2, 3).
train_df = get_data_snapshot(1,2,x_np,y_np,edge_index_np,edge_timestamp_np)
test_df = get_data_snapshot(2,3,x_np,y_np,edge_index_np,edge_timestamp_np)

# A node can have edges in both windows. Its row is looked up from the same
# x/y arrays each time, so keeping it in both sets would put an identical
# sample in train and test. Drop such nodes from the training side, the same
# rule the rolling evaluation loop in this notebook applies.
train_df = train_df[~train_df['node_id'].isin(test_df['node_id'])]

# Stack the per-node feature rows into 2-D matrices for the classifier.
X_train = np.vstack(train_df['x'].values)
y_train = train_df['y'].values
X_test = np.vstack(test_df['x'].values)
y_test = test_df['y'].values

In [36]:
# Display the stacked training feature matrix (one row per node kept in train_df).
X_train

array([[ 1.        ,  6.        ,  0.08      , ...,  0.33333333,
         0.0625    ,  0.33333333],
       [ 1.        ,  4.        ,  0.6       , ...,  0.6       ,
         0.025     ,  0.6       ],
       [ 0.        ,  3.        , -1.        , ..., -1.        ,
        -1.        , -1.        ],
       ...,
       [-1.        , -1.        ,  0.975     , ...,  0.75      ,
         0.01025641,  0.5       ],
       [-1.        , -1.        ,  0.27      , ...,  1.        ,
        -1.        , -1.        ],
       [-1.        , -1.        , -1.        , ..., -1.        ,
        -1.        , -1.        ]])

In [40]:
# Rolling evaluation: for each sampled start index i, fit a classifier on the
# one-step window [i, i+1) and score it on the next window [i+1, i+2).
for i in range(1, 70, 7):
    print("="*20)
    print(f"index:{i}")

    train_df = get_data_snapshot(
        start_time_index=i,
        end_time_index=i + 1,
        x=x_np,
        y=y_np,
        edge_index=edge_index_np,
        edge_timestamp=edge_timestamp_np,
    )
    test_df = get_data_snapshot(
        start_time_index=i + 1,
        end_time_index=i + 2,
        x=x_np,
        y=y_np,
        edge_index=edge_index_np,
        edge_timestamp=edge_timestamp_np,
    )

    # Nodes that show up in the test window are removed from the training
    # window so that no node is present in both sets.
    test_nodes = set(test_df['node_id'])
    train_df = train_df.loc[~train_df['node_id'].isin(test_nodes)]

    # Stack the per-node feature rows into 2-D matrices.
    X_train, y_train = np.vstack(train_df['x'].to_numpy()), train_df['y'].to_numpy()
    X_test, y_test = np.vstack(test_df['x'].to_numpy()), test_df['y'].to_numpy()

    model = xgb.XGBClassifier(eval_metric='logloss')
    model.fit(X_train, y_train)

    # Positive-class probabilities feed the ranking metrics; hard labels feed accuracy.
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    auc_roc = roc_auc_score(y_test, y_prob)
    avg_precision = average_precision_score(y_test, y_prob)
    accuracy = accuracy_score(y_test, y_pred)

    print(f'XGBoost Model Accuracy: {accuracy:.4f}')
    print(f'Average Precision Score: {avg_precision:.4f}')
    print(f'AUC-ROC Score: {auc_roc:.4f}')



index:1
XGBoost Model Accuracy: 0.9765
Average Precision Score: 0.0409
AUC-ROC Score: 0.7075
index:2
XGBoost Model Accuracy: 0.9777
Average Precision Score: 0.0438
AUC-ROC Score: 0.7297
index:3
XGBoost Model Accuracy: 0.9810
Average Precision Score: 0.0479
AUC-ROC Score: 0.7351
index:4
XGBoost Model Accuracy: 0.9739
Average Precision Score: 0.0435
AUC-ROC Score: 0.6876
index:5
XGBoost Model Accuracy: 0.9756
Average Precision Score: 0.0409
AUC-ROC Score: 0.6882
index:6
XGBoost Model Accuracy: 0.9794
Average Precision Score: 0.0389
AUC-ROC Score: 0.7140
index:7
XGBoost Model Accuracy: 0.9759
Average Precision Score: 0.0431
AUC-ROC Score: 0.7005
index:8
XGBoost Model Accuracy: 0.9760
Average Precision Score: 0.0346
AUC-ROC Score: 0.6745
index:9
XGBoost Model Accuracy: 0.9715
Average Precision Score: 0.0560
AUC-ROC Score: 0.7169
