In [1]:
import networkx as nx
import pickle
from tqdm import tqdm
import os.path as osp
import pandas as pd
import numpy as np

In [2]:
path = osp.join("../data/EthereumPhishing/raw/", "MulDiGraph.pkl")
with open(path, 'rb') as f:
    G = pickle.load(f)


In [3]:
nodes = list(G.nodes())
node2idx = {n: i for i, n in enumerate(nodes)}
num_nodes = len(nodes)

n2i = node2idx
get = dict.get
data = [
    (n2i[u], n2i[v], int(get(attrs, "timestamp")), float(get(attrs, "amount", 0.0)))
    for u, v, _k, attrs in G.edges(keys=True, data=True)
    if get(attrs, "timestamp") is not None
]

df = pd.DataFrame(data, columns=["u", "v", "timestamp", "amount"])
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
df["amount"] = pd.to_numeric(df["amount"], errors="coerce").fillna(0.0)

In [4]:
df

Unnamed: 0,u,v,timestamp,amount
0,0,1,2018-05-16 07:01:26+00:00,2.344623
1,0,55647,2017-09-03 18:06:05+00:00,0.070000
2,0,55647,2017-09-03 21:17:00+00:00,0.052111
3,0,286310,2017-10-12 11:03:39+00:00,5.068543
4,0,286310,2018-05-23 17:36:54+00:00,0.992500
...,...,...,...,...
13551298,2973482,1845941,2018-05-20 09:32:24+00:00,0.400000
13551299,2973482,1845941,2018-05-20 10:31:24+00:00,0.750000
13551300,2973486,127998,2018-07-26 12:58:54+00:00,0.010960
13551301,2973488,1143020,2017-12-20 19:25:57+00:00,0.100000


In [5]:
isp_by_idx = pd.Series(
    {node2idx[n]: bool(attrs.get("isp", False)) for n, attrs in G.nodes(data=True)}
)
df["month"] = df["timestamp"].dt.to_period("M").dt.to_timestamp()
nodes_month = (
    df.loc[:, ["month", "u", "v"]]
      .melt(id_vars="month", value_vars=["u", "v"], value_name="node")
      .drop(columns="variable")
)
phish_nodes_month = nodes_month[nodes_month["node"].map(isp_by_idx).fillna(False)]
monthly_counts = (
    phish_nodes_month.drop_duplicates(["month", "node"])
                     .groupby("month").size()
                     .rename("phishing_nodes")
)
full_months = pd.period_range(df["month"].min(), df["month"].max(), freq="M").to_timestamp()
monthly_counts = monthly_counts.reindex(full_months, fill_value=0)

print(monthly_counts)

  df["month"] = df["timestamp"].dt.to_period("M").dt.to_timestamp()


2015-08-01      0
2015-09-01      0
2015-10-01      0
2015-11-01      0
2015-12-01      0
2016-01-01      0
2016-02-01      0
2016-03-01      0
2016-04-01      0
2016-05-01      0
2016-06-01      0
2016-07-01      0
2016-08-01      0
2016-09-01      0
2016-10-01      0
2016-11-01      1
2016-12-01      0
2017-01-01      0
2017-02-01      0
2017-03-01      1
2017-04-01      0
2017-05-01      1
2017-06-01      5
2017-07-01     19
2017-08-01     29
2017-09-01     83
2017-10-01     73
2017-11-01     76
2017-12-01     82
2018-01-01    117
2018-02-01    163
2018-03-01    203
2018-04-01    246
2018-05-01    326
2018-06-01    309
2018-07-01    225
2018-08-01    155
2018-09-01     80
2018-10-01     57
2018-11-01     30
2018-12-01     35
2019-01-01     30
Freq: MS, Name: phishing_nodes, dtype: int64


In [6]:
cutoff = pd.Timestamp("2017-07-01", tz="UTC")
cutoff_s = int(cutoff.timestamp())

edges_iter = G.edges(keys=True, data=True)
ef = []
append = ef.append
get = dict.get

for u, v, k, attrs in edges_iter:
    ts = get(attrs, "timestamp")
    if ts is None:
        continue
    try:
        ts_i = ts if isinstance(ts, (int, float)) else int(ts)
    except (TypeError, ValueError):
        continue
    if ts_i >= cutoff_s:
        append((u, v, k, attrs))

edges_filtered = ef


In [7]:
nodes = {u for u, v, _, _ in edges_filtered} | {v for u, v, _, _ in edges_filtered}
nodes = list(nodes)
node2idx = {n: i for i, n in enumerate(nodes)}
num_nodes = len(nodes)

n2i = node2idx
get = dict.get
data = [
    (n2i[u], n2i[v], int(attrs["timestamp"]), float(get(attrs, "amount", 0.0)))
    for (u, v, _k, attrs) in edges_filtered
]
arr = np.asarray(data, dtype=np.float64)  # fast to construct; cast ints after
df = pd.DataFrame(arr, columns=["u", "v", "timestamp", "amount"])
df[["u", "v", "timestamp"]] = df[["u", "v", "timestamp"]].astype("int64")
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)


In [8]:
df

Unnamed: 0,u,v,timestamp,amount
0,1707469,155739,2018-05-16 07:01:26+00:00,2.344623
1,1707469,428105,2017-09-03 18:06:05+00:00,0.070000
2,1707469,428105,2017-09-03 21:17:00+00:00,0.052111
3,1707469,1203363,2017-10-12 11:03:39+00:00,5.068543
4,1707469,1203363,2018-05-23 17:36:54+00:00,0.992500
...,...,...,...,...
12417587,1850367,2227944,2018-05-20 09:32:24+00:00,0.400000
12417588,1850367,2227944,2018-05-20 10:31:24+00:00,0.750000
12417589,2572310,69465,2018-07-26 12:58:54+00:00,0.010960
12417590,1733629,2429926,2017-12-20 19:25:57+00:00,0.100000


In [9]:
print(num_nodes)

2737308


In [10]:
print("sdf")

sdf
