# Graph Matching Consensus

## Initialize and Import

In [1]:
from typing import Optional, Callable, List, Dict, Tuple
import os

import dgmc
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import torch
from torch import Tensor
import funcs
import pytorch_lightning as pl
from torch.nn import Linear
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import torch_geometric

%reload_ext autoreload
%autoreload 2

device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Load Data

In [2]:
class GCN(pl.LightningModule):
    def __init__(self, dataset):
        super(GCN, self).__init__()
        torch.manual_seed(0)
        #self.conv1 = GCNConv(dataset.num_features, 4)
        self.conv1 = GCNConv(596, 4)
        self.conv2 = GCNConv(4, 4)
        self.conv3 = GCNConv(4, 2)
        self.classifier = Linear(2, dataset.num_classes)

        self.in_channels = dataset.num_features
        self.out_channels = dataset.num_classes

    def forward(self, x, edge_index, edge_attr=None):
        print("x: ", x.shape)
        print("edge_index: ", edge_index.shape)
        #print("self: ", self.)
        # mat1 and mat2 shapes cannot be multiplied (10000x9070 and 596x4)
        h = self.conv1(x, edge_index)
        h = h.tanh()
        h = self.conv2(h, edge_index)
        h = h.tanh()
        h = self.conv3(h, edge_index)
        h = h.tanh()  # Final GNN embedding space.

        # Apply a final (linear) classifier.
        out = self.classifier(h)

        return out, h


In [3]:
sizmek_path = "data/sizmek_bidstream_raw_20210625_10k.csv"
zync_path = "data/zync_session_tracking_orc_20210625_10k.csv"

sizmek_cols = ["account_id", "referrer_url", "city_code",
               "state_code", "dma_code", "country_code"]
zync_cols = ["client", "user_agent_platform",
             "user_agent_language", "user_agent_browser"]

# Load and Create Sizmek Data
sizmek_data = funcs.ZetaData(sizmek_path, "url", "zeta_user_id", sizmek_cols, parse_url=False)
sizmek_model = GCN(sizmek_data)

# Load and Create Zync Data
zync_data = funcs.ZetaData(zync_path, "referrer", "client_id", zync_cols, parse_url=False, expand_x=596)
zync_model = GCN(zync_data)


print(f"Sizmek: (nodes:{sizmek_data.node_count:,}, edges:{sizmek_data.edge_count:,})")
print(f"Zync: (nodes:{zync_data.node_count:,}, edges:{zync_data.edge_count:,})")

  features[column] = 0


Sizmek: (nodes:10,000, edges:2,212,814)


  features[column] = 0


Zync: (nodes:10,000, edges:110,023)


In [4]:
psi_1 = dgmc.models.GIN(
    in_channels=sizmek_data.num_features,
    out_channels=256,
    num_layers=3
)

psi_2 = dgmc.models.GIN(
    in_channels=32,
    out_channels=32,
    num_layers=3
)

model = dgmc.DGMC(psi_1, psi_2, num_steps=None, k=10)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

# Create fake labels for testing
y_1 = torch.tensor([range(0,2000)])[0]
y_2 = torch.tensor([range(0,2000)])[0]
train_y = torch.stack([y_1, y_2], dim=0)


def train():
    model.train()
    optimizer.zero_grad()
    _, out = model.forward(
        x_s=sizmek_data.x,
        edge_index_s=sizmek_data.edge_index,
        edge_attr_s=None,
        batch_s=None,
        x_t=zync_data.x,
        edge_index_t=zync_data.edge_index,
        edge_attr_t=None,
        batch_t=None,
        y=None
    )
    loss = model.loss(out, train_y)
    loss.backward()
    optimizer.step()
    return loss

@torch.no_grad()
def test():
    model.eval()

    _, out = model.forward(
        x_s=sizmek_data.x,
        edge_index_s=sizmek_data.edge_index,
        edge_attr_s=None,
        batch_s=None,
        x_t=zync_data.x,
        edge_index_t=zync_data.edge_index,
        edge_attr_t=None,
        batch_t=None,
        y=None
    )

    hits1 = model.acc(out, train_y)
    hits10 = model.hits_at_k(10, out, train_y)

    return hits1, hits10

In [16]:
print('Optimize initial feature matching...')
model.num_steps = 0
for epoch in range(1, 201):
    print("Epoch: ", epoch)
    if epoch == 5:
        print('Refine correspondence matrix...')
        model.num_steps = 3
        model.detach = True

    loss = train()

    #if epoch % 10 == 0 or epoch > 100:
    if True:
        hits1, hits10 = test()
        print((f'{epoch:03d}: Loss: {loss:.4f}, Hits@1: {hits1:.4f}, '
               f'Hits@10: {hits10:.4f}'))

Optimize initial feature matching...
Epoch:  1


  self[col] = value


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

------------

# Sratch Pad

In [None]:
class LightningDGMC(dgmc.DGMC):
    def __init__(self, psi_1, psi_2, num_steps=None, k=10):
        super(LightningDGMC, self).__init__(psi_1, psi_2, num_steps)

    def training_step(self, train_batch, batch_idx):
        _, out = self.forward(
            x_s=train_batch["sizmek"].x,
            edge_index_s=train_batch["sizmek"].edge_index,
            edge_attr_s=None,
            batch_s=None,
            x_t=train_batch["zync"].x,
            edge_index_t=train_batch["zync"].edge_index,
            edge_attr_t=None,
            batch_t=None,
            y=None
        )
        loss = model.loss(out, train_batch.train_y)
        return loss

    def validation_step(self, val_batch):
        _, out = self.forward(
            x_s=val_batch["sizmek"].x,
            edge_index_s=val_batch["sizmek"].edge_index,
            edge_attr_s=None,
            batch_s=None,
            x_t=val_batch["zync"].x,
            edge_index_t=val_batch["zync"].edge_index,
            edge_attr_t=None,
            batch_t=None,
            y=None
        )

        hits1 = self.acc(out, val_batch["val_y"])
        hits10 = self.hits_at_k(10, out, val_batch["val_y"])

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.0001)
        return optimizer

In [None]:
y_1 = torch.tensor([range(0,2000)])[0].to(device)
y_2 = torch.tensor([range(0,2000)])[0].to(device)
train_y = torch.stack([y_1, y_2], dim=0).to(device)

hm = torch_geometric.data.Data(
    x=sizmek_data.x, edge_index=sizmek_data.edge_index, 
    edge_attr=None, 
    y=train_y, 
    pos=None, 
    normal=None, 
    face=None
)
hm

In [None]:
class ZetaDataset(torch_geometric.data.InMemoryDataset):
    def __init__(self, root: str, column: str, label: str, feature_cols=None,
                 parse_url=False, expand_x=None, transform=None, pre_transform=None
        ):
        self.root = root
        self.column = column
        self.label = label
        self.feature_cols = feature_cols
        self.parse_url = parse_url
        self.expand_x = expand_x
        super(ZetaDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self) -> List[str]:
        return [
            "sizmek_bidstream_raw_20210625_10k.csv", 
            "zync_session_tracking_orc_20210625_10k.csv"
        ]

    @property
    def processed_file_names(self):
        return ["ZetaDataset.pt"]

    def download(self):
        pass

    def process(self):
        sizmek_path = os.path.join(self.root, self.raw_file_names[0])
        zync_path = os.path.join(self.root, self.raw_file_names[1])
        
        print("Loading", sizmek_path)
        x1, edge_index1 = self.process_graph(sizmek_path, self.column[0], self.feature_cols[0])
        x2, edge_index2 = self.process_graph(zync_path, self.column[1], self.feature_cols[1])

        train_y = self.process_y()
        test_y = self.process_y()

        data = Data(x1=x1, edge_index1=edge_index1, x2=x2,
                    edge_index2=edge_index2, train_y=train_y,
                    test_y=test_y)
        torch.save(self.collate([data]), self.processed_paths[0])

    def process_graph(self, file_path, column, feature_cols):
        print(f"Processing graph for {file_path} on {column}")
        df = pd.read_csv(file_path, low_memory=False)
        df.columns = [i.split(".")[1] for i in df.columns]

        # parse URLs
        if self.parse_url == True and column in ["url", "referrer"]:
            df[column] = df[column].apply(
                lambda x:urlparse(x).netloc if pd.notnull(x) else x
            )

        # Encode features
        feature_enc = OneHotEncoder(handle_unknown="ignore")
        features = pd.DataFrame(
            feature_enc.fit_transform(df[feature_cols]).toarray(), 
            columns=feature_enc.get_feature_names(feature_cols)
        )

        if self.expand_x is not None:
            new_cols = [f"fake_{self.expand_x-i}" for i in range(self.expand_x - features.shape[1])][::-1]
            for column in new_cols:
                features[column] = 0
        x = torch.tensor(features.values, dtype=torch.float)
        
        edges = funcs.connect_edges(df, column)
        edge_index = torch.tensor(
            edges[['source','target']].T.values, dtype=torch.long
        )

        return x, edge_index

    def process_y(self) -> Tensor:
        y_1 = torch.tensor([range(0,2000)])[0]
        y_2 = torch.tensor([range(0,2000)])[0]
        train_y = torch.stack([y_1, y_2], dim=0)
        return train_y

zeta_data = ZetaDataset(
    root="./data/",
    column=["url", "referrer"], 
    label=["zeta_user_id", "client_id"], 
    feature_cols=[sizmek_cols, zync_cols], 
    parse_url=False
)

In [None]:
print(zeta_data)

In [None]:
#sizmek_data = funcs.ZetaData(sizmek_path, "url", "zeta_user_id", sizmek_cols, parse_url=False)
sizmek_model = GCN(zeta_data)

# Load and Create Zync Data
#zync_data = funcs.ZetaData(zync_path, "referrer", "client_id", zync_cols, parse_url=False, expand_x=596)
zync_model = funcs.GCN(zync_data)


In [None]:
(self, root: str, column: str, target: str, feature_cols=None,
                 parse_url=True, expand_x=None, transform=None, pre_transform=None
        ):