In [1]:
import os
from pathlib import Path
import random
import sys
from typing import Optional, Callable, List, Dict, Tuple

import dgmc
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler

import torch
from torch import Tensor
import funcs
#import pytorch_lightning as pl
from torch.nn import Linear
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import torch_geometric

# Enable IPython's autoreload extension so edits to imported local modules
# are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#device = 'cpu'
# Bare expression: shows the selected device as the cell output.
device

'cuda'

In [2]:
# Load the cleaned Sizmek/Zync extract, preferring a local copy over S3, and
# keep a reproducible 20% sample of it.
file_name = "sizmek_zync_cleaned_793k.csv"
# NOTE(review): "/me/ubuntu" may be a typo for "/home/ubuntu" -- confirm; the
# path is left unchanged here.
local_file = Path(f"/me/ubuntu/{file_name}")
remote_file = f"s3://drose-sandbox/{file_name}"

SAMPLE_FRAC = .2
SAMPLE_SEED = 0  # fixed seed so Restart & Run All draws the same rows every time

if local_file.is_file():
    print("Loading local file")
    data_source = local_file
else:
    print("Reading from S3")
    data_source = remote_file

# Every column is read as a string.
df_raw = pd.read_csv(data_source, dtype="str")


# Shape and memory are reported for the full file, i.e. before sampling.
print(f"Shape: {df_raw.shape[0]:,} Memory: {df_raw.memory_usage(deep=True).sum()/1e9:.2f}GB")

df_raw = df_raw.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)

Reading from S3
Shape: 806,255 Memory: 1.57GB


In [3]:
# Show the first sampled row transposed, one field per line.
df_raw.iloc[:1].T

Unnamed: 0,779995
Unnamed: 0,779995
account_id,35927
url,https://screenrant.com/most-important-upgrades...
city_code,5391811
state_code,CA
dma_code,825
country_code,US
user_id,2019934806039952643
session_id,a1060640-46cc-45b7-a23e-4f5b6ebd116d:160503854...
referrer,https://recipefy.net/how-to-make-simple-hunter...


In [4]:
###############################
##### Filter and clean up #####
###############################

sizmek_cols = [
    "user_id", "url", "account_id", "city_code",
    "state_code", "dma_code", "country_code", "sizmek_ip",
]
zync_cols = [
    "session_id", "referrer", "client", "user_agent_platform",
    "user_agent_language", "user_agent_browser", "zync_ip", "zync_country",
    "zync_state", "zync_city",
]

# Keep only the modelling columns, then clean in three steps. The shape is
# printed after the selection and again after every cleaning step.
df = df_raw[sizmek_cols + zync_cols]
print(df.shape)
for clean_step in (
    lambda frame: frame.fillna("missing"),
    lambda frame: frame.drop_duplicates(),
    lambda frame: frame.reset_index(drop=True),
):
    df = clean_step(df)
    print(df.shape)

###############################
##### Feature engineering #####
###############################

# Sizmek coordinates, looked up from the Sizmek IP column.
print("Finding Sizmek coordinates. . .")
sizmek_coords = funcs.ip_to_coords(df["sizmek_ip"])
sizmek_coords.columns = ["sizmek_lat", "sizmek_long"]
for col, values in sizmek_coords.items():
    df[col] = values

# Zync coordinates, looked up from the Zync IP column.
print("Finding Zync coordinates. . .")
zync_coords = funcs.ip_to_coords(df["zync_ip"])
zync_coords.columns = ["zync_lat", "zync_long"]
for col, values in zync_coords.items():
    df[col] = values

coord_cols = [*sizmek_coords.columns, *zync_coords.columns]

# Pass these columns through funcs.replace_low_counts with a threshold of 5.
# This runs after the coordinate lookup, which reads the original IP values.
cols = ["state_code", "country_code", "zync_country", "zync_state", "sizmek_ip", "zync_ip"]
for col in cols:
    df[col] = funcs.replace_low_counts(df[col], 5)

# Collapse rows that agree on every column except url/referrer; the url and
# referrer values of each group are gathered into lists.
# NOTE(review): pandas groupby drops rows whose key contains NaN by default --
# if ip_to_coords leaves unlocated IPs as NaN, those rows disappear here.
# Confirm that this is intended.
group_cols = sizmek_cols + zync_cols + coord_cols
group_list = [name for name in group_cols if name != "url" and name != "referrer"]
df = (
    df.groupby(group_list)[["url", "referrer"]]
    .agg(lambda x: list(x))
    .reset_index()
)
print(df.shape)

# Column sets for the two per-source views of the grouped frame.
sizmek_features = [
    "url", "account_id", "city_code", "state_code", "dma_code",
    "country_code", "sizmek_ip", "sizmek_lat", "sizmek_long",
]

zync_features = [
    "referrer", "client", "user_agent_platform", "user_agent_language",
    "user_agent_browser", "zync_ip", "zync_country", "zync_state",
    "zync_city", "zync_lat", "zync_long",
]

# Both views are taken from the same frame, so row i of one corresponds to
# row i of the other.
sizmek_df = df[sizmek_features]
zync_df = df[zync_features]

(161251, 18)
(161251, 18)
(161251, 18)
(161251, 18)
Finding Sizmek coordinates. . .
Located 156,973/161,251 coordinates
Finding Zync coordinates. . .
Located 161,251/161,251 coordinates
(25460, 22)


In [5]:
# Preview the first three rows of the Sizmek view.
sizmek_df.iloc[:3]

Unnamed: 0,url,account_id,city_code,state_code,dma_code,country_code,sizmek_ip,sizmek_lat,sizmek_long
0,[foxnews.com],19967,4167147,FL,534,US,other,28.527781,-81.523132
1,[https://www.dailymail.co.uk/tvshowbiz/article...,35927,4167147,FL,534,US,other,28.527781,-81.523132
2,[buzzaboutbees.net],19967,5816861,WY,758,US,other,47.561195,-122.153412


In [6]:
# Preview the first three rows of the Zync view.
zync_df.iloc[:3]

Unnamed: 0,referrer,client,user_agent_platform,user_agent_language,user_agent_browser,zync_ip,zync_country,zync_state,zync_city,zync_lat,zync_long
0,[https://www.giantfreakinrobot.com/ent/snake-e...,sizmek,android,10,chrome,other,US,FL,missing,28.527781,-81.523132
1,[https://www.giantfreakinrobot.com/ent/snake-e...,sizmek,android,10,chrome,other,US,FL,missing,28.527781,-81.523132
2,[https://www.wral.com/wake-county-assistant-pr...,sizmek,iphone,missing,safari,other,US,NC,Wake Forest,35.97987,-78.50972


# Define Dataset

In [7]:
class ZetaDataset(torch_geometric.data.InMemoryDataset):
    """Paired Sizmek/Zync graph dataset held in memory.

    One graph is built per input frame. Node features are one-hot encoded
    categorical columns concatenated with standardised coordinates; edges come
    from ``funcs.connect_edges`` on the column named in ``column``. The stored
    ``Data`` object carries ``x1``/``edge_index1`` (Sizmek), ``x2``/``edge_index2``
    (Zync) and the ``train_y``/``test_y`` index pairs.

    ``process_y`` pairs node ``i`` of the first graph with node ``i`` of the
    second, so the two frames are expected to be row-aligned.
    """

    def __init__(self, root: str, sizmek_df: pd.DataFrame, zync_df: pd.DataFrame,
                 column: List[str], label: str, feature_cols=None, parse_url=False,
                 expand_x=None, transform=None, pre_transform=None):
        """Store the inputs, build the dataset, then delete the processed file.

        Args:
            root: Dataset root directory handed to ``InMemoryDataset``.
            sizmek_df: Frame used for the first graph.
            zync_df: Frame used for the second graph.
            column: Two column names; ``column[0]`` drives the Sizmek edges and
                ``column[1]`` the Zync edges.
            label: Stored on the instance; not read anywhere in this class.
            feature_cols: Columns to one-hot encode inside ``process_graph``
                when it is called without pre-encoded features.
            parse_url: When true, ``url``/``referrer`` values are passed through
                ``funcs.urlparse(...).netloc`` before edges are built.
            expand_x: Optional minimum feature width; narrower feature matrices
                are right-padded with zero columns.
            transform: Forwarded to ``InMemoryDataset``.
            pre_transform: Forwarded to ``InMemoryDataset``.
        """
        self.root = root
        self.sizmek_df = sizmek_df
        self.zync_df = zync_df
        self.column = column
        self.label = label
        self.feature_cols = feature_cols
        self.parse_url = parse_url
        self.expand_x = expand_x
        super(ZetaDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])
        # The processed file is deleted right after loading -- presumably so the
        # next construction rebuilds from the frames instead of reusing a stale
        # file (TODO confirm intent).
        print("Removing processed file. . .")
        os.remove(self.processed_paths[0])

    @property
    def raw_file_names(self) -> List[str]:
        """File names reported to the base class; nothing in this class reads them."""
        return [
            "sizmek_bidstream_raw_20210625_10k.csv", 
            "zync_session_tracking_orc_20210625_10k.csv"
        ]

    @property
    def processed_file_names(self):
        """Name of the single file written by ``process``."""
        return ["ZetaDataset.pt"]

    @property
    def edge_count_1(self):
        """Number of edges in the first (Sizmek) graph."""
        # Read from this instance, not from a notebook-level variable.
        return self.data.edge_index1.shape[1]

    @property
    def edge_count_2(self):
        """Number of edges in the second (Zync) graph."""
        return self.data.edge_index2.shape[1]

    def download(self):
        """Nothing to download; the data arrives as in-memory frames."""
        pass

    def encode_features(self, sizmek_df, zync_df):
        """One-hot encode the paired categorical columns of both frames.

        Each Sizmek column is paired with a Zync column. One encoder per pair is
        fitted on the values of both columns, so both frames share the same
        one-hot columns for that pair.

        Returns:
            ``(sizmek_x, zync_x)``: dense arrays with the per-pair encodings
            concatenated column-wise.
        """
        print("Encoding features. . .")

        sizmek_cols = ["country_code", "state_code", "sizmek_ip"]
        zync_cols = ["zync_country", "zync_state", "zync_ip"]

        sizmek_x = []
        zync_x = []
        for sizmek_col, zync_col in zip(sizmek_cols, zync_cols):
            print(f"Encoding columns: {sizmek_col}, {zync_col}")

            # Plain (n, 1) arrays are used for fit and transform alike, so the
            # differing column names of the two frames never reach the encoder.
            sizmek_values = sizmek_df[[sizmek_col]].to_numpy()
            zync_values = zync_df[[zync_col]].to_numpy()

            # Declare encoder for new column set and fit it on the combined data
            enc = OneHotEncoder(handle_unknown="ignore")
            enc.fit(np.concatenate((sizmek_values, zync_values), axis=0))

            siz_enc = enc.transform(sizmek_values).toarray()
            print("sizmek:",siz_enc.size)
            zync_enc = enc.transform(zync_values).toarray()
            print("zync:",zync_enc.size)

            # append to dataset list
            sizmek_x.append(siz_enc)
            zync_x.append(zync_enc)

        # Combine to a single array for each dataset
        sizmek_x = np.concatenate(sizmek_x, axis=1)
        zync_x = np.concatenate(zync_x, axis=1)
        print(f"Sizmek encoded array shape: {sizmek_x.shape}")
        print(f"Zync encoded array shape: {zync_x.shape}")
        return sizmek_x, zync_x

    def process(self):
        """Build both graphs and the train/test pairs, then save them.

        Uses the frames given to the constructor, not notebook globals.
        """
        sizmek_df, zync_df = self.sizmek_df, self.zync_df

        # Encode categorical features
        sizmek_cat, zync_cat = self.encode_features(sizmek_df, zync_df)

        # Normalize numeric features with one scaler fitted on both frames
        print("Normalizing numeric features. . .")
        sizmek_numeric_cols = ["sizmek_lat", "sizmek_long"]
        zync_numeric_cols = ["zync_lat", "zync_long"]
        sizmek_raw_num = sizmek_df[sizmek_numeric_cols].to_numpy()
        zync_raw_num = zync_df[zync_numeric_cols].to_numpy()
        x_num = np.concatenate((sizmek_raw_num, zync_raw_num), axis=0)
        print("Numeric shape: ", x_num.shape)
        scaler = StandardScaler()
        scaler.fit(x_num)
        print(f"Scaler mean: {scaler.mean_.mean()}")
        # Transform the same arrays the scaler was fitted on (no column names),
        # keeping fit and transform inputs consistent.
        sizmek_num = scaler.transform(sizmek_raw_num)
        zync_num = scaler.transform(zync_raw_num)
        print("Scaled numeric features")

        # Merge categorical and numeric features
        print("Combining Sizmek categorical and numeric features")
        sizmek_x = np.concatenate((sizmek_cat, sizmek_num), axis=1)

        print("Combining Zync categorical and numeric features")
        zync_x = np.concatenate((zync_cat, zync_num), axis=1)

        print("Creating Sizmek graph. . .")
        x1, edge_index1 = self.process_graph(
            sizmek_df,
            self.column[0],
            encoded_features=sizmek_x,
            expand_x=self.expand_x
        )

        print("Creating Zync graph. . .")
        x2, edge_index2 = self.process_graph(
            zync_df,
            self.column[1],
            encoded_features=zync_x,
            expand_x=self.expand_x
        )

        train_y, test_y = self.process_y(x1)

        data = Data(x1=x1, edge_index1=edge_index1, x2=x2,
                    edge_index2=edge_index2, train_y=train_y,
                    test_y=test_y)
        torch.save(self.collate([data]), self.processed_paths[0])

    def process_graph(self, df, column: str, encoded_features=None, expand_x: int=None):
        """Turn one frame into a node-feature tensor and an edge index.

        Args:
            df: Source frame; it is not modified.
            column: Column whose values drive edge creation.
            encoded_features: Pre-encoded feature matrix. When ``None`` the
                columns in ``self.feature_cols`` are one-hot encoded instead.
            expand_x: Optional minimum feature width; narrower matrices are
                right-padded with zero columns.

        Returns:
            ``(x, edge_index)``: a float feature tensor and a long tensor built
            from the ``source``/``target`` columns of ``funcs.connect_edges``.

        Raises:
            ValueError: If neither ``encoded_features`` nor ``self.feature_cols``
                is available.
        """
        # parse URLs
        if self.parse_url and column in ["url", "referrer"]:
            print("Parsing URL")
            # Work on a copy so the frame handed in by the caller is untouched.
            df = df.copy()
            df[column] = df[column].apply(
                lambda x:funcs.urlparse(x).netloc if pd.notnull(x) else x
            )

        # Encode features
        if encoded_features is None:
            feature_cols = getattr(self, "feature_cols", None)
            if feature_cols is None:
                raise ValueError(
                    "process_graph needs either encoded_features or feature_cols"
                )
            print("Encoding categorical features")
            feature_enc = OneHotEncoder(handle_unknown="ignore")
            encoded = feature_enc.fit_transform(df[feature_cols]).toarray()
            # get_feature_names was replaced by get_feature_names_out in newer
            # scikit-learn releases; use whichever this version provides.
            names_fn = (
                getattr(feature_enc, "get_feature_names_out", None)
                or feature_enc.get_feature_names
            )
            features = pd.DataFrame(encoded, columns=names_fn(feature_cols))
        else:
            print("Features already encoded")
            features = encoded_features

        # From here on, work with a plain array whether the features arrived
        # as a DataFrame or as an ndarray.
        features = np.asarray(features)

        if expand_x is not None:
            print(f"Expanding X to {expand_x}")
            missing_cols = max(expand_x - features.shape[1], 0)
            if missing_cols:
                features = np.pad(
                    features, ((0, 0), (0, missing_cols)), mode="constant"
                )

        x = torch.tensor(features, dtype=torch.float) 

        # Create edge list
        print(f"Creating edge list on {column}")
        blocklist = funcs.count_edge_frequency(df, column)
        print(f"Unique URLs: {len(blocklist):,}")

        # Keep the "url" entries whose "count" exceeds 10; the result is handed
        # to funcs.connect_edges -- presumably to exclude very common values
        # from edge creation (confirm in funcs).
        blocklist = blocklist[blocklist["count"] > 10]["url"]
        print(f"Common URLs blocked: {len(blocklist):,}")

        edges = funcs.connect_edges(df, column, blocklist)
        edge_index = torch.tensor(
            edges[["source", "target"]].T.values, dtype=torch.long
        )

        return x, edge_index

    def process_y(self, x1, train_frac=.8) -> Tuple[Tensor, Tensor]:
        """Build identity ground-truth pairs and split them in order.

        Node ``i`` of the first graph is paired with node ``i`` of the second.
        The first ``train_frac`` of the pairs (unshuffled) is the training set
        and the remainder the test set.

        Returns:
            ``(y_train, y_test)``: each a 2-row tensor of stacked index pairs.
        """
        y = list(range(0, x1.shape[0]))

        train_samples = int(len(y)*train_frac)
        y_train = torch.tensor(y[:train_samples])
        y_test = torch.tensor(y[train_samples:])

        y_train = torch.stack([y_train, y_train], dim=0)
        y_test = torch.stack([y_test, y_test], dim=0)
        return y_train, y_test

    def __repr__(self) -> str:
        """Class name followed by the edge counts of both graphs."""
        return f"{self.__class__.__name__}({self.edge_count_1:,}/{self.edge_count_2:,} edges)"
    
class SumEmbedding(object):
    """Transform that replaces ``x1`` and ``x2`` with their sums over dimension 1."""

    def __call__(self, data):
        # Both sums are computed before either attribute is overwritten.
        # Note: on a 2-D feature matrix this collapses every row to one value.
        summed_x1 = data.x1.sum(dim=1)
        summed_x2 = data.x2.sum(dim=1)
        data.x1 = summed_x1
        data.x2 = summed_x2
        return data

In [8]:
# Work on copies, so changes made to them do not touch sizmek_df / zync_df.
# (To subsample instead, draw rows from sizmek_df and take the same index
# from zync_df so the two frames stay row-aligned.)
sizmek_small, zync_small = sizmek_df.copy(), zync_df.copy()

In [9]:
#%history -g -f history.txt

In [10]:
# Show one randomly chosen row of the grouped frame.
df.sample(n=1)

Unnamed: 0,user_id,account_id,city_code,state_code,dma_code,country_code,sizmek_ip,session_id,client,user_agent_platform,...,zync_ip,zync_country,zync_state,zync_city,sizmek_lat,sizmek_long,zync_lat,zync_long,url,referrer
276,1173539549219780327,35927,5394409,CA,803,US,other,5959b91c-abf2-4540-b14a-0807ab7ada82:151743738...,sizmek,windows,...,other,US,CA,North Hills,33.919182,-118.416473,33.919182,-118.416473,[https://mrfz.fandom.com/wiki/Ceylon/Trivia],[https://www.creativeuncut.com/art_nier-automa...


# Create Dataset

In [11]:
# Build the paired graph dataset from the two row-aligned frames.
# No `transform` is passed. The rest of the notebook reads `zeta_data.data`
# directly, which bypasses transforms, so results are unchanged. SumEmbedding
# sums over dim=1, which on these 2-D feature matrices would collapse every
# node's feature vector to a single number if it were ever applied through
# item access.
zeta_data = ZetaDataset(
    root="./data/",
    sizmek_df=sizmek_small,
    zync_df=zync_small,
    column=["url", "referrer"],
    label=["zeta_user_id", "client_id"],
    parse_url=False,
)
zeta_data

Processing...


Encoding features. . .
Encoding columns: country_code, zync_country
sizmek: 1196620
zync: 1196620
Encoding columns: state_code, zync_state
sizmek: 6441380
zync: 6441380
Encoding columns: sizmek_ip, zync_ip
sizmek: 156528080
zync: 156528080
Sizmek encoded array shape: (25460, 6448)
Zync encoded array shape: (25460, 6448)
Normalizing numeric features. . .
Numeric shape:  (50920, 2)
Scaler mean: -25.971679671582866
Scaled numeric features
Combining Sizmek categorical and numeric features
Combining Zync categorical and numeric features
Creating Sizmek graph. . .
Features already encoded
Creating edge list on url
Unique URLs: 47,189
Common URLs blocked: 2,241
Creating Zync graph. . .
Features already encoded
Creating edge list on referrer
Unique URLs: 22,258
Common URLs blocked: 2,724


Done!


Removing processed file. . .


ZetaDataset(284,192/244,246 edges)

In [12]:
import argparse

# Hyper-parameters. They are parsed from an empty argument string, so each
# option takes its default value.
parser = argparse.ArgumentParser()
for flag, default in (
    ("--dim", 256),
    ("--rnd_dim", 32),
    ("--num_layers", 3),
    ("--num_steps", 10),
    ("--k", 10),
):
    parser.add_argument(flag, type=int, default=default)
args = parser.parse_args("")

# Both RelCNN networks share these keyword settings.
relcnn_kwargs = dict(batch_norm=False, cat=False, lin=True, dropout=0.3)

# psi_1 takes the node-feature width as its input size; psi_2 works on
# rnd_dim-sized inputs and outputs.
psi_1 = dgmc.models.RelCNN(zeta_data.data.x1.size(-1), args.dim, args.num_layers,
                           **relcnn_kwargs)
psi_2 = dgmc.models.RelCNN(args.rnd_dim, args.rnd_dim, args.num_layers,
                           **relcnn_kwargs)

for shown in (psi_1, psi_2, zeta_data.data.x1.size(), zeta_data.data.x2.size()):
    print(shown)

RelCNN(6450, 256, num_layers=3, batch_norm=False, cat=False, lin=True, dropout=0.3)
RelCNN(32, 32, num_layers=3, batch_norm=False, cat=False, lin=True, dropout=0.3)
torch.Size([25460, 6450])
torch.Size([25460, 6450])


In [13]:
# Matching model wrapping psi_1 / psi_2, moved to the selected device before
# the optimizer is created.
model = dgmc.models.DGMC(psi_1, psi_2, num_steps=None, k=args.k)
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
# Short alias for the single stored Data object used by train() and test().
data = zeta_data.data

def train():
    """Run one full-batch optimisation step and return the loss tensor.

    Reads the notebook-level ``model``, ``optimizer`` and ``data``.
    """
    model.train()
    optimizer.zero_grad()

    # The training pairs are passed to the forward call as its last argument.
    outputs = model(data.x1, data.edge_index1, None, None, data.x2,
                    data.edge_index2, None, None, data.train_y)
    S_L = outputs[1]  # only the second output is used for the loss

    loss = model.loss(S_L, data.train_y)
    loss.backward()
    optimizer.step()
    return loss

@torch.no_grad()
def test():
    """Evaluate the model on the train and test pairs without gradients.

    Returns a 6-tuple: accuracy, hits@10 and hits@10000 for ``data.train_y``,
    followed by the same three metrics for ``data.test_y``.
    """
    model.eval()

    # No ground-truth pairs are passed to the forward call here.
    S_L = model(data.x1, data.edge_index1, None, None, data.x2,
                data.edge_index2, None, None)[1]

    def _scores(y):
        # NOTE(review): the caller labels the third value "hits100" although
        # k is 10000 here -- confirm which one is intended.
        return (
            model.acc(S_L, y),
            model.hits_at_k(10, S_L, y),
            model.hits_at_k(10000, S_L, y),
        )

    return _scores(data.train_y) + _scores(data.test_y)

In [None]:
torch.cuda.empty_cache()
# Move model and data to the `device` chosen in the first cell instead of
# forcing CUDA, so the notebook also runs on a CPU-only machine.
model.to(device)
zeta_data.data.to(device)

# Start without refinement steps; they are switched on at epoch 100 below.
model.num_steps = 0
with torch.profiler.profile(
    schedule=torch.profiler.schedule(
        wait=2,
        warmup=2,
        active=6,
        repeat=1),
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./logs/gnn_v5"),
    with_stack=True
) as profiler:
    for epoch in range(1, 5000):
        if epoch == 100:
            print('Refine correspondence matrix...')
            model.num_steps = args.num_steps
            model.detach = True

        loss = train()
        # Advance the profiler schedule once per training step.
        profiler.step()

        # Evaluate and report every fifth epoch.
        if epoch % 5 == 0:
            train_hits1, train_hits10, train_hits100, test_hits1, test_hits10, test_hits100 = test()
            print((
                f"{epoch:03d}: Loss: {loss:.2f}, "
                f"Train: {train_hits1:.4f}, {train_hits10:.4f}, {train_hits100:.4f}, "
                f"Test: {test_hits1:.4f}, {test_hits10:.4f}, {test_hits100:.4f}"
            ))

005: Loss: 3.02, Train: 0.0007, 0.0047, 0.0047, Test: 0.0004, 0.0018, 0.0018
010: Loss: 3.01, Train: 0.0011, 0.0069, 0.0069, Test: 0.0000, 0.0010, 0.0010
015: Loss: 3.01, Train: 0.0007, 0.0038, 0.0038, Test: 0.0000, 0.0010, 0.0010
020: Loss: 3.01, Train: 0.0022, 0.0160, 0.0160, Test: 0.0004, 0.0020, 0.0020
025: Loss: 3.00, Train: 0.0053, 0.0244, 0.0244, Test: 0.0006, 0.0037, 0.0037
030: Loss: 3.00, Train: 0.0067, 0.0331, 0.0331, Test: 0.0008, 0.0045, 0.0045
035: Loss: 3.00, Train: 0.0104, 0.0412, 0.0412, Test: 0.0008, 0.0049, 0.0049
040: Loss: 3.00, Train: 0.0126, 0.0525, 0.0525, Test: 0.0016, 0.0055, 0.0055
045: Loss: 3.00, Train: 0.0166, 0.0633, 0.0633, Test: 0.0014, 0.0090, 0.0090
050: Loss: 3.00, Train: 0.0201, 0.0731, 0.0731, Test: 0.0031, 0.0126, 0.0126
055: Loss: 3.00, Train: 0.0232, 0.0818, 0.0818, Test: 0.0045, 0.0179, 0.0179
060: Loss: 3.00, Train: 0.0258, 0.0889, 0.0889, Test: 0.0053, 0.0238, 0.0238
065: Loss: 3.00, Train: 0.0295, 0.0947, 0.0947, Test: 0.0059, 0.0295, 0.0295

# Debug

In [None]:
torch.cuda.empty_cache()
#model.to('cpu')
#zeta_data.data.to('cpu')

# Inference only: run the forward pass without recording an autograd graph.
model.eval()
with torch.no_grad():
    _, S_L = model(data.x1, data.edge_index1, None, None, data.x2,
                   data.edge_index2, None, None)

k = 1000  # not used below; the slice keeps at most 20 candidates
S = S_L
y = data.train_y

# For every training source node, order its candidate scores from highest to
# lowest, keep up to 20, and map them back to candidate indices.
perm = S.__val__[y[0]].argsort(dim=-1, descending=True)[:, :20]
pred = torch.gather(S.__idx__[y[0]], -1, perm)
pred[0,:]

In [None]:
# Display the correspondence object assigned to S in the debug cell above.
S

In [None]:
# Size of the score values stored on the correspondence object.
S.__val__.size()

In [None]:
# Inference only: run the forward pass without recording an autograd graph.
model.eval()
with torch.no_grad():
    _, S_L = model(
        data.x1, data.edge_index1, None, None, data.x2,
        data.edge_index2, None, None
    )

# Accuracy on the held-out pairs, shown as the cell output.
model.acc(S_L, data.test_y)

In [None]:
S = S_L
y = data.test_y
# For each test source node, take the position of its highest score and look
# up the candidate index stored at that position.
best_position = S.__val__[y[0]].argmax(dim=-1)
pred = S.__idx__[y[0], best_position]
pred

In [None]:
# Held-out index pairs produced by ZetaDataset.process_y (two stacked rows).
data.test_y

In [None]:
# Number of test pairs whose top prediction equals the expected target index.
torch.eq(pred, y[1]).sum().item()
