In [2]:
import os
from pathlib import Path
import random
import sys
from typing import Optional, Callable, List, Dict, Tuple

import dgmc
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import torch
from torch import Tensor
import funcs
#import pytorch_lightning as pl
from torch.nn import Linear
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import torch_geometric

%reload_ext autoreload
%autoreload 2

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
# (Assign 'cpu' here by hand to force CPU execution.)
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [3]:
file_name = "sizmek_zync_cleaned_793k.csv"
# pathlib does not expand "~" by itself; without expanduser() the is_file()
# check below looks for a literal "~" directory under the working directory
# and the local copy is never picked up.
local_file = Path(f"~/{file_name}").expanduser()
remote_file = f"s3://drose-sandbox/{file_name}"

# Prefer the copy in the home directory; otherwise read the same file from S3.
data_source = local_file if local_file.is_file() else remote_file
# Every column is read as a string.
df_raw = pd.read_csv(data_source, dtype="str")


print(f"Shape: {df_raw.shape[0]:,} Memory: {df_raw.memory_usage(deep=True).sum()/1e9:.2f}GB")

Shape: 793,073 Memory: 1.55GB


In [4]:
# Show the first row transposed, so every column name appears on its own line.
df_raw.head(1).T

Unnamed: 0,0
Unnamed: 0,0
account_id,19967
url,tvguide.com
city_code,5731371
state_code,OR
dma_code,820
country_code,US
user_id,5314600042548863
session_id,d1b6ae4f-7582-42d9-90bc-9b793bcc9ca7:162949223...
referrer,https://www.google.com/


In [5]:
# Columns taken from each side of the combined Sizmek/Zync export.
sizmek_cols = ["user_id", "url", "account_id", "city_code",
               "state_code", "dma_code", "country_code", "sizmek_ip"]
zync_cols = ["session_id", "referrer", "client", "user_agent_platform",
             "user_agent_language", "user_agent_browser", "zync_ip", "zync_country",
             "zync_state", "zync_city", "zync_lat", "zync_long"]

# The shape is printed after every step so row losses are visible.
df = df_raw[sizmek_cols + zync_cols]
print(df.shape)
df = df.fillna("missing")
print(df.shape)
df = df.drop_duplicates()
print(df.shape)
df = df.reset_index(drop=True)
print(df.shape)

# Collapse rows that agree on every column except url/referrer into one row
# whose url and referrer cells hold the list of collapsed values.
group_list = [e for e in (sizmek_cols + zync_cols) if e not in ("url", "referrer")]
df = df.groupby(group_list)[["url", "referrer"]].agg(list)
df = df.reset_index()
print(df.shape)

# Keep only rows whose user_id occurs on more than one row.
df = df[df["user_id"].duplicated(keep=False)]
print(df.shape)
# The filter above leaves gaps in the row labels. Reset them so labels and
# positions agree, because the feature matrices and label tensors built from
# these frames are indexed by position.
df = df.drop("user_id", axis=1).reset_index(drop=True)

# Per-source column subsets; both frames share the same row order.
sizmek_features = ["url", "account_id", "city_code", "state_code",
               "dma_code", "country_code", "sizmek_ip"]
zync_features = ["referrer", "client", "user_agent_platform", "user_agent_language", 
             "user_agent_browser", "zync_ip", "zync_country",
             "zync_state", "zync_city", "zync_lat", "zync_long"]

sizmek_df = df[sizmek_features]
zync_df = df[zync_features]

(793073, 20)
(793073, 20)
(793073, 20)
(793073, 20)
(44928, 20)
(30818, 20)


In [6]:
# Preview the first three rows of the Sizmek-side frame.
sizmek_df.head(3)

Unnamed: 0,url,account_id,city_code,state_code,dma_code,country_code,sizmek_ip
0,[foxnews.com],19967,4167147,FL,534,US,other
1,[https://www.dailymail.co.uk/tvshowbiz/article...,35927,4167147,FL,534,US,other
2,[buzzaboutbees.net],19967,5816861,WY,758,US,other


In [7]:
# Preview the first three rows of the Zync-side frame.
zync_df.head(3)

Unnamed: 0,referrer,client,user_agent_platform,user_agent_language,user_agent_browser,zync_ip,zync_country,zync_state,zync_city,zync_lat,zync_long
0,[https://www.giantfreakinrobot.com/ent/snake-e...,sizmek,android,10,chrome,other,US,FL,missing,28.6344,-81.6221
1,[https://www.giantfreakinrobot.com/ent/snake-e...,sizmek,android,10,chrome,other,US,FL,missing,28.6344,-81.6221
2,[https://www.wral.com/wake-county-assistant-pr...,sizmek,iphone,missing,safari,other,US,NC,Wake Forest,35.9825,-78.5376


In [8]:
# Build the identity labelling 0..29,999, then permute it in place,
# printing the first four entries before and after the shuffle.
y_base = [*range(30_000)]
print(y_base[0:4])
random.shuffle(y_base)
print(y_base[0:4])

[0, 1, 2, 3]
[1954, 16632, 27518, 20002]


# Define Dataset

In [9]:
class ZetaDataset(torch_geometric.data.InMemoryDataset):
    """Two-graph dataset built from in-memory Sizmek and Zync frames.

    ``process`` turns each frame into a one-hot feature matrix plus an edge
    list (via ``funcs.count_edge_frequency`` / ``funcs.connect_edges``) and
    stores both graphs, together with a shuffled train/test index split, in a
    single ``Data`` object with the fields ``x1``, ``edge_index1``, ``x2``,
    ``edge_index2``, ``train_y`` and ``test_y``.

    Args:
        root: Directory handed to ``InMemoryDataset``.
        sizmek_df: Frame the first graph is built from.
        zync_df: Frame the second graph is built from.
        column: Two column names; ``column[0]`` drives the Sizmek edge list
            and ``column[1]`` the Zync edge list.
        label: Stored on the instance; no method of this class reads it.
        feature_cols: Mapping that provides ``["sizmek"]["categorical"]`` and
            ``["zync"]["categorical"]`` column lists for one-hot encoding.
        parse_url: When truthy, ``url``/``referrer`` columns are reduced to
            their network location before the edge list is built.
        expand_x: Optional width to zero-pad the Sizmek feature matrix to.
            The Zync matrix is always padded to the resulting Sizmek width.
        transform: Forwarded to ``InMemoryDataset``.
        pre_transform: Forwarded to ``InMemoryDataset``.
        seed: Optional seed for the train/test shuffle. ``None`` keeps the
            previous behaviour of using the global ``random`` state.
    """

    def __init__(self, root: str, sizmek_df: pd.DataFrame, zync_df: pd.DataFrame,
                 column: List[str], label: List[str], feature_cols=None, parse_url=False,
                 expand_x=None, transform=None, pre_transform=None,
                 seed: Optional[int] = None):
        # These attributes are set before super().__init__() because
        # process(), which reads them, is invoked from inside it (see the
        # "Processing..." line in the cell output).
        self.root = root
        self.sizmek_df = sizmek_df
        self.zync_df = zync_df
        self.column = column
        self.label = label
        self.feature_cols = feature_cols
        self.parse_url = parse_url
        self.expand_x = expand_x
        self.seed = seed
        super(ZetaDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])
        # The processed file is deleted right after loading.
        # NOTE(review): presumably so a stale file is never reused when
        # different frames are passed in next time - confirm.
        print("Removing processed file. . .")
        os.remove(self.processed_paths[0])

    @property
    def raw_file_names(self) -> List[str]:
        """File names reported to ``InMemoryDataset`` as the raw inputs."""
        return [
            "sizmek_bidstream_raw_20210625_10k.csv", 
            "zync_session_tracking_orc_20210625_10k.csv"
        ]

    @property
    def processed_file_names(self):
        """Name of the single processed file written by ``process``."""
        return ["ZetaDataset.pt"]

    @property
    def edge_count_1(self):
        """Number of edges in the Sizmek graph."""
        # Read from this instance, not from a notebook-level variable.
        return self.data.edge_index1.shape[1]

    @property
    def edge_count_2(self):
        """Number of edges in the Zync graph."""
        return self.data.edge_index2.shape[1]

    def download(self):
        """Nothing to download; the data arrives as in-memory frames."""
        pass

    def process(self):
        """Build both graphs and the label split, then save them to disk."""
        print("Creating Sizmek graph. . .")
        x1, edge_index1 = self.process_graph(
            self.sizmek_df, 
            self.column[0], 
            self.feature_cols["sizmek"]["categorical"],
            expand_x=self.expand_x
        )
        print("Creating Zync graph. . .")
        # Pad to the actual Sizmek width instead of a hand-copied constant so
        # both feature matrices keep the same width when the data changes.
        x2, edge_index2 = self.process_graph(
            self.zync_df, 
            self.column[1], 
            self.feature_cols["zync"]["categorical"], 
            expand_x=x1.size(1)
        )
        # If the Zync encoding turned out wider, widen the Sizmek matrix too.
        if x2.size(1) > x1.size(1):
            x1 = self._pad_features(x1, x2.size(1))

        train_y, test_y = self.process_y(x1, seed=self.seed)

        data = Data(x1=x1, edge_index1=edge_index1, x2=x2,
                    edge_index2=edge_index2, train_y=train_y,
                    test_y=test_y)
        torch.save(self.collate([data]), self.processed_paths[0])

    @staticmethod
    def _pad_features(x: Tensor, width: int) -> Tensor:
        """Right-pad ``x`` with zero columns up to ``width`` (no-op if wide enough)."""
        missing = width - x.size(1)
        if missing <= 0:
            return x
        return torch.cat([x, x.new_zeros(x.size(0), missing)], dim=1)

    def process_graph(self, df, column: str, feature_cols: List, expand_x: int=None):
        """Encode one frame into a feature matrix and an edge index.

        Args:
            df: Source frame.
            column: Column handed to the ``funcs`` edge helpers.
            feature_cols: Columns to one-hot encode.
            expand_x: Optional width to zero-pad the feature matrix to.

        Returns:
            ``(x, edge_index)``: a float feature tensor with one row per row
            of ``df`` and a ``long`` tensor with two rows (source, target).
        """
        # parse URLs
        if self.parse_url and column in ["url", "referrer"]:
            print("Parsing URL")
            # Note: this writes the parsed values back into the passed frame.
            df[column] = df[column].apply(
                lambda x:funcs.urlparse(x).netloc if pd.notnull(x) else x
            )

        # Encode features
        print("Encoding features")
        feature_enc = OneHotEncoder(handle_unknown="ignore")
        # Build the tensor straight from the encoded array. The column names
        # were never used, and skipping the DataFrame avoids both the
        # version-dependent feature-name accessor and inserting thousands of
        # padding columns one by one.
        encoded = feature_enc.fit_transform(df[feature_cols]).toarray()
        x = torch.tensor(encoded, dtype=torch.float)

        if expand_x is not None:
            print(f"Expanding X to {expand_x}")
            x = self._pad_features(x, expand_x)

        # Create edge list
        print(f"Creating edge list on {column}")
        blocklist = funcs.count_edge_frequency(df, column)
        print(f"Unique URLs: {len(blocklist):,}")
        
        # Values counted more than 10 times are passed on as the blocklist.
        # NOTE(review): the "url" key is used even when column is "referrer";
        # presumably count_edge_frequency always names its value column
        # "url" - confirm against funcs.
        blocklist = blocklist[blocklist["count"] > 10]["url"]
        print(f"Common URLs blocked: {len(blocklist):,}")

        edges = funcs.connect_edges(df, column, blocklist)
        edge_index = torch.tensor(
            edges[["source", "target"]].T.values, dtype=torch.long
        )

        return x, edge_index

    def process_y(self, x1, train_frac=.8, seed: Optional[int] = None) -> Tuple[Tensor, Tensor]:
        """Shuffle the row indices of ``x1`` and split them into train/test.

        Args:
            x1: Feature matrix; only its row count is used.
            train_frac: Fraction of the indices placed in the train split.
            seed: Optional seed for a private RNG; ``None`` uses the global
                ``random`` module state.

        Returns:
            ``(y_train, y_test)``, each a 2-row tensor whose two rows are
            identical (the same index is stacked twice).
        """
        y = list(range(0,x1.shape[0]))
        rng = random if seed is None else random.Random(seed)
        rng.shuffle(y)
        
        train_samples = int(len(y)*train_frac)
        y_train = torch.tensor(y[:train_samples])
        y_test = torch.tensor(y[train_samples:])
        
        y_train = torch.stack([y_train, y_train], dim=0)
        y_test = torch.stack([y_test, y_test], dim=0)
        return y_train, y_test
    
    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.edge_count_1:,}/{self.edge_count_2:,} edges)"
    
class SumEmbedding:
    """Transform that replaces ``x1`` and ``x2`` by their sums over dim 1."""

    def __call__(self, data):
        # Compute both reductions before writing either attribute back.
        summed_x1 = data.x1.sum(dim=1)
        summed_x2 = data.x2.sum(dim=1)
        data.x1 = summed_x1
        data.x2 = summed_x2
        return data

In [10]:
#sizmek_small = sizmek_df.sample(n=20_000, random_state=0)
#zync_small = zync_df.iloc[sizmek_small.index,:]

# Use the full frames, copied so later writes do not reach the originals.
sizmek_small, zync_small = sizmek_df.copy(), zync_df.copy()

In [11]:
# Show one randomly chosen row of the grouped frame.
df.sample()

Unnamed: 0,account_id,city_code,state_code,dma_code,country_code,sizmek_ip,session_id,client,user_agent_platform,user_agent_language,user_agent_browser,zync_ip,zync_country,zync_state,zync_city,zync_lat,zync_long,url,referrer
26517,19967,4170688,FL,539,US,other,9ebeee1a-d450-40d1-a1de-d820a31eaf67:157279852...,sizmek,android,missing,chrome,other,US,FL,missing,28.6344,-81.6221,"[mail.yahoo.com, finance.yahoo.com, https://fi...",[https://89f9b0ac4b8797faca2566d56f04c137.safe...


# Create Dataset

In [12]:
# Feature columns per source. ZetaDataset.process reads only the
# "categorical" lists; the "numeric" lists are declared but not consumed there.
# client, user_agent_platform and user_agent_language are left out of the
# Zync categorical set.
feature_cols = dict(
    sizmek=dict(
        categorical=[
            "account_id", "city_code", "state_code",
            "dma_code", "country_code", "sizmek_ip",
        ],
        numeric=[],
    ),
    zync=dict(
        categorical=[
            "user_agent_browser", "zync_ip", "zync_country",
            "zync_state", "zync_city",
        ],
        numeric=["zync_lat", "zync_long"],
    ),
)

# Build the dataset from the in-memory frames; edges come from url / referrer.
zeta_data = ZetaDataset(
    root="./data/",
    sizmek_df=sizmek_small,
    zync_df=zync_small,
    column=["url", "referrer"],
    label=["zeta_user_id", "client_id"],
    feature_cols=feature_cols,
    parse_url=False,
    transform=SumEmbedding(),
)
zeta_data

Creating Sizmek graph. . .
Encoding features


Processing...


Creating edge list on url
Unique URLs: 88,218
Common URLs blocked: 10,156
Creating Zync graph. . .
Encoding features
Expanding X to 8410


  features[col] = 0


Creating edge list on referrer
Unique URLs: 18,801
Common URLs blocked: 9,685


Done!


Removing processed file. . .


ZetaDataset(848,363/296,109 edges)

In [13]:
import argparse

# Hyper-parameters are declared through argparse but parsed from an empty
# argument string, so every option takes its default value.
parser = argparse.ArgumentParser()
for flag, default in (
    ('--dim', 256),
    ('--rnd_dim', 32),
    ('--num_layers', 2),
    ('--num_steps', 10),
    ('--k', 10),
):
    parser.add_argument(flag, type=int, default=default)
args = parser.parse_args("")


# Both networks share every option except their input/output sizes.
relcnn_options = dict(batch_norm=False, cat=True, lin=True, dropout=0.5)
psi_1 = dgmc.models.RelCNN(
    zeta_data.data.x1.size(-1), args.dim, args.num_layers, **relcnn_options
)
psi_2 = dgmc.models.RelCNN(
    args.rnd_dim, args.rnd_dim, args.num_layers, **relcnn_options
)

for item in (psi_1, psi_2, zeta_data.data.x1.size(), zeta_data.data.x2.size()):
    print(item)

RelCNN(8410, 256, num_layers=2, batch_norm=False, cat=True, lin=True, dropout=0.5)
RelCNN(32, 32, num_layers=2, batch_norm=False, cat=True, lin=True, dropout=0.5)
torch.Size([30818, 8410])
torch.Size([30818, 8410])


In [14]:
# Matching model on the chosen device, its optimizer, and a short alias for
# the dataset's underlying Data object (used by train() and test()).
model = dgmc.models.DGMC(psi_1, psi_2, num_steps=None, k=args.k)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
data = zeta_data.data

def train():
    """Run one optimisation step on the whole graph pair and return the loss."""
    model.train()
    optimizer.zero_grad()

    # The train labels are passed to the forward call as well as to the loss.
    forward_args = (
        data.x1, data.edge_index1, None, None,
        data.x2, data.edge_index2, None, None,
        data.train_y,
    )
    S_L = model(*forward_args)[1]

    loss = model.loss(S_L, data.train_y)
    loss.backward()
    optimizer.step()
    return loss

@torch.no_grad()
def test():
    """Score the model on both splits.

    Returns a 6-tuple: accuracy, hits@10 and hits@100 for the train labels,
    followed by the same three metrics for the test labels.
    """
    model.eval()

    # No labels are passed to the forward call during evaluation.
    S_L = model(data.x1, data.edge_index1, None, None,
                data.x2, data.edge_index2, None, None)[1]

    metrics = []
    for labels in (data.train_y, data.test_y):
        metrics.append(model.acc(S_L, labels))
        for top_k in (10, 100):
            metrics.append(model.hits_at_k(top_k, S_L, labels))

    return tuple(metrics)

In [15]:
torch.cuda.empty_cache()
# Move model and graph data to the `device` selected in the first cell instead
# of calling .cuda() unconditionally, which raises on machines without a GPU.
model.to(device)
data = zeta_data.data.to(device)

# Start with num_steps = 0; it is raised to args.num_steps at epoch 100 below.
model.num_steps = 0
# Profile a short window of steps (skip 2, warm up 2, record 6, once) and
# write the trace for TensorBoard.
with torch.profiler.profile(
    schedule=torch.profiler.schedule(
        wait=2,
        warmup=2,
        active=6,
        repeat=1),
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./logs/gnn_v5"),
    with_stack=True
) as profiler:
    for epoch in range(1, 400):
        if epoch == 100:
            print('Refine correspondence matrix...')
            model.num_steps = args.num_steps
            model.detach = True

        loss = train()
        # Advance the profiler schedule once per training step.
        profiler.step()
        
        # Evaluate and report every fifth epoch.
        if epoch % 5 == 0:
            train_hits1, train_hits10, train_hits100, test_hits1, test_hits10, test_hits100 = test()
            print((
                f"{epoch:03d}: Loss: {loss:.2f}, "
                f"Train: {train_hits1:.4f}, {train_hits10:.4f}, {train_hits100:.4f}, "
                f"Test: {test_hits1:.4f}, {test_hits10:.4f}, {test_hits100:.4f}"
            ))

005: Loss: 2.99, Train: 0.0003, 0.0012, 0.0012, Test: 0.0000, 0.0011, 0.0011
010: Loss: 2.99, Train: 0.0004, 0.0022, 0.0022, Test: 0.0002, 0.0011, 0.0011
015: Loss: 2.98, Train: 0.0005, 0.0026, 0.0026, Test: 0.0002, 0.0013, 0.0013
020: Loss: 2.97, Train: 0.0004, 0.0028, 0.0028, Test: 0.0003, 0.0016, 0.0016
025: Loss: 2.97, Train: 0.0008, 0.0028, 0.0028, Test: 0.0002, 0.0018, 0.0018
030: Loss: 2.94, Train: 0.0007, 0.0028, 0.0028, Test: 0.0002, 0.0019, 0.0019
035: Loss: 2.92, Train: 0.0004, 0.0026, 0.0026, Test: 0.0002, 0.0019, 0.0019
040: Loss: 2.91, Train: 0.0004, 0.0023, 0.0023, Test: 0.0003, 0.0019, 0.0019
045: Loss: 2.88, Train: 0.0003, 0.0019, 0.0019, Test: 0.0003, 0.0011, 0.0011
050: Loss: 2.81, Train: 0.0002, 0.0015, 0.0015, Test: 0.0000, 0.0010, 0.0010
055: Loss: 2.72, Train: 0.0002, 0.0013, 0.0013, Test: 0.0000, 0.0010, 0.0010
060: Loss: 2.65, Train: 0.0002, 0.0014, 0.0014, Test: 0.0000, 0.0010, 0.0010
065: Loss: 2.76, Train: 0.0003, 0.0014, 0.0014, Test: 0.0000, 0.0010, 0.0010

KeyboardInterrupt: 

# Debug

In [None]:
model.eval()
# Inference only: run the forward pass without autograd bookkeeping so no
# graph is kept alive for the full-size inputs.
with torch.no_grad():
    _, S_L = model(
        data.x1, data.edge_index1, None, None, data.x2,
        data.edge_index2, None, None
    )

model.acc(S_L, data.test_y)

In [None]:
S, y = S_L, data.test_y
# NOTE(review): __idx__ / __val__ look like the candidate-index and score
# tensors attached to the model output - confirm against dgmc.
source_rows = y[0]
best_slot = S.__val__[source_rows].argmax(dim=-1)
pred = S.__idx__[source_rows, best_slot]
pred

In [None]:
# Inspect the test label tensor.
data.test_y

In [None]:
# Count the positions where the predicted index equals the second row of y.
(pred == y[1]).sum().item()
