# Graph Matching Concensus

## Initialize and Import

In [1]:
import dgmc
import torch
import funcs
import pytorch_lightning as pl
from torch.nn import Linear
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data

%reload_ext autoreload
%autoreload 2

## Load Data

In [2]:
class GCN(pl.LightningModule):
    def __init__(self, dataset):
        super(GCN, self).__init__()
        torch.manual_seed(0)
        #self.conv1 = GCNConv(dataset.num_features, 4)
        self.conv1 = GCNConv(596, 4)
        self.conv2 = GCNConv(4, 4)
        self.conv3 = GCNConv(4, 2)
        self.classifier = Linear(2, dataset.num_classes)

        self.in_channels = dataset.num_features
        self.out_channels = dataset.num_classes

    def forward(self, x, edge_index, edge_attr=None):
        print("x: ", x.shape)
        print("edge_index: ", edge_index.shape)
        #print("self: ", self.)
        # mat1 and mat2 shapes cannot be multiplied (10000x9070 and 596x4)
        h = self.conv1(x, edge_index)
        h = h.tanh()
        h = self.conv2(h, edge_index)
        h = h.tanh()
        h = self.conv3(h, edge_index)
        h = h.tanh()  # Final GNN embedding space.

        # Apply a final (linear) classifier.
        out = self.classifier(h)

        return out, h


In [4]:
#x=torch.tensor(sizmek_features.values, dtype=torch.float),
#edge_index=torch.tensor(sizmek_edges[['source','target']].T.values, dtype=torch.long),
#y=labels,
#train_mask=sizmek["zeta_user_id"].isna()
    
#Data(x=torch.tensor(sizmek_features.values, dtype=torch.float))

In [5]:
sizmek_path = "data/sizmek_bidstream_raw_20210625_10k.csv"
zync_path = "data/zync_session_tracking_orc_20210625_10k.csv"

sizmek_cols = ["account_id", "referrer_url", "city_code", 
               "state_code", "dma_code", "country_code"]
zync_cols = ["client", "user_agent_platform",
             "user_agent_language", "user_agent_browser"]

# Load and Create Sizmek Data
sizmek_data = funcs.ZetaData(sizmek_path, "url", "zeta_user_id", sizmek_cols, parse_url=False)
sizmek_model = GCN(sizmek_data)

# Load and Create Zync Data
zync_data = funcs.ZetaData(zync_path, "referrer", "client_id", zync_cols, parse_url=False, expand_x=596)
zync_model = funcs.GCN(zync_data)


print(f"Sizmek: (nodes:{sizmek_data.node_count:,}, edges:{sizmek_data.edge_count:,})")
print(f"Zync: (nodes:{zync_data.node_count:,}, edges:{zync_data.edge_count:,})")

  features[column] = 0


Sizmek: (nodes:10,000, edges:2,212,814)
Zync: (nodes:10,000, edges:110,023)


In [100]:
psi_1 = dgmc.models.GIN(
    in_channels=sizmek_data.num_features,
    out_channels=256,
    num_layers=3
)

psi_2 = dgmc.models.GIN(
    in_channels=32,
    out_channels=32,
    num_layers=3
)

model = dgmc.DGMC(psi_1, psi_2, num_steps=None, k=10)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

# Create fake labels for testing
y_1 = torch.tensor([range(0,2000)])[0]
y_2 = torch.tensor([range(0,2000)])[0]
train_y = torch.stack([y_1, y_2], dim=0)



def train():
    model.train()
    optimizer.zero_grad()
    _, out = model.forward(
        x_s=sizmek_data.x,
        edge_index_s=sizmek_data.edge_index,
        edge_attr_s=None,
        batch_s=None,
        x_t=zync_data.x,
        edge_index_t=zync_data.edge_index,
        edge_attr_t=None,
        batch_t=None,
        y=None
    )
    loss = model.loss(out, train_y)
    loss.backward()
    optimizer.step()
    return loss

@torch.no_grad()
def test():
    model.eval()

    _, out = model.forward(
        x_s=sizmek_data.x,
        edge_index_s=sizmek_data.edge_index,
        edge_attr_s=None,
        batch_s=None,
        x_t=zync_data.x,
        edge_index_t=zync_data.edge_index,
        edge_attr_t=None,
        batch_t=None,
        y=None
    )

    hits1 = model.acc(out, train_y)
    hits10 = model.hits_at_k(10, out, train_y)

    return hits1, hits10

In [None]:
print('Optimize initial feature matching...')
model.num_steps = 0
for epoch in range(1, 201):
    print("Epoch: ", epoch)
    if epoch == 5:
        print('Refine correspondence matrix...')
        model.num_steps = 3
        model.detach = True

    loss = train()

    #if epoch % 10 == 0 or epoch > 100:
    if True:
        hits1, hits10 = test()
        print((f'{epoch:03d}: Loss: {loss:.4f}, Hits@1: {hits1:.4f}, '
               f'Hits@10: {hits10:.4f}'))

Optimize initial feature matching...
Epoch:  1
001: Loss: 4.0926, Hits@1: 0.0015, Hits@10: 0.0055
Epoch:  2
002: Loss: 6.8640, Hits@1: 0.0005, Hits@10: 0.0055
Epoch:  3
003: Loss: 2.3019, Hits@1: 0.0010, Hits@10: 0.0045
Epoch:  4
004: Loss: 2.3026, Hits@1: 0.0005, Hits@10: 0.0045
Epoch:  5
Refine correspondence matrix...
005: Loss: 2.4160, Hits@1: 0.0000, Hits@10: 0.0060
Epoch:  6


------------

# Sratch Pad

In [48]:
sizmek_data.df[["user_id","zeta_user_id"]]

Unnamed: 0,user_id,zeta_user_id
0,163543465731194,
1,163543465731194,
2,163543465731194,
3,163543465731194,
4,163543465731194,
...,...,...
9995,57052281873774128,fa3e335a-6b70-45b0-a9fb-ac71d098a924:161377748...
9996,57052281873774128,fa3e335a-6b70-45b0-a9fb-ac71d098a924:161377748...
9997,57052281873774128,fa3e335a-6b70-45b0-a9fb-ac71d098a924:161377748...
9998,57052281873774128,fa3e335a-6b70-45b0-a9fb-ac71d098a924:161377748...


In [45]:
zync_data.df[["client","client_id"]].head(5)

Unnamed: 0,client,client_id
0,disqus,c3rj9u181f91uc2
1,sizmek,3710029932679905832
2,sizmek,875739027850277969
3,sizmek,2159827869768437528
4,sizmek,1783777305440028281


In [55]:
zync_data.df

Unnamed: 0,session_id,client,client_id,remote_addr,referrer,user_agent_platform,user_agent_language,user_agent_browser,user_agent_string,user_agent_version,timestamp,geo_data,partner_data,session,consent_data,dt
0,042daa1a-aa73-47f0-89f4-5ead6821767e:161417230...,disqus,c3rj9u181f91uc2,128.201.40.29,https://www.biorxiv.org/content/10.1101/560755...,windows,,chrome,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,91.0.4472.114,1.624644e+09,"{""country"":""BR"",""subdivision"":""RJ"",""coordinate...",,"{""clients"":{""appnexus"":null,""disqus"":""c3rj9u18...","{""gdpr"":null,""gdpr_consent"":null}",2021-06-25-18
1,61abba88-212a-408c-b274-f60d0cc738c1:161770412...,sizmek,3710029932679905832,177.76.103.102,https://123963fbfc0b24fafd1fe0c9e0b835ab.safef...,windows,,chrome,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,91.0.4472.114,1.624644e+09,"{""country"":""BR"",""subdivision"":""SP"",""coordinate...",,"{""clients"":{""appnexus"":null,""disqus"":""c743jlde...","{""gdpr"":null,""gdpr_consent"":null}",2021-06-25-18
2,bbceeddf-34a0-4d1b-916d-346ad02f0237:162464399...,sizmek,875739027850277969,87.122.112.108,https://www.intimissimi.com,iphone,,webkit,Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like M...,8611,1.624644e+09,"{""country"":""DE"",""subdivision"":""HE"",""coordinate...",,"{""clients"":null}","{""gdpr"":null,""gdpr_consent"":null}",2021-06-25-18
3,d37b370a-84ba-490e-8e9b-4b95542426c9:162456634...,sizmek,2159827869768437528,178.139.239.122,https://www.bebesymas.com/recien-nacido/ideas-...,android,11,chrome,Mozilla/5.0 (Linux; Android 11; M2007J20CG) Ap...,91.0.4472.101,1.624644e+09,"{""country"":""ES"",""subdivision"":""GA"",""coordinate...",,"{""clients"":{""appnexus"":null,""disqus"":null,""siz...","{""gdpr"":null,""gdpr_consent"":null}",2021-06-25-18
4,72e0ceb1-e459-450a-acbb-8c29e490a0ad:154663933...,sizmek,1783777305440028281,52.144.115.231,https://witl.com/listen-live/,windows,,chrome,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,91.0.4472.114,1.624644e+09,"{""country"":""US"",""subdivision"":""MI"",""coordinate...",,"{""clients"":{""appnexus"":""8496891724937171129"",""...","{""gdpr"":null,""gdpr_consent"":null}",2021-06-25-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,826d06f2-6041-4c4f-847c-eab976fee4f0:1624644002.5,sizmek,2159827871938477097,85.3.229.224,https://www.falconeri.com/ch/it/donna/summer_s...,iphone,,webkit,Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) ...,8611,1.624644e+09,"{""country"":""CH"",""subdivision"":""TI"",""coordinate...",,"{""clients"":null}","{""gdpr"":null,""gdpr_consent"":null}",2021-06-25-18
9996,f6e9667c-8d56-4066-8a4a-32ce56b28eed:161421969...,sizmek,1975180283972728695,107.214.238.17,https://secure1.xb-online.com/fe/MWP_MultiView...,chromeos,,chrome,Mozilla/5.0 (X11; CrOS x86_64 13816.55.0) Appl...,90.0.4430.86,1.624644e+09,"{""country"":""US"",""subdivision"":""CA"",""coordinate...",,"{""clients"":{""appnexus"":null,""disqus"":""c3175shm...","{""gdpr"":null,""gdpr_consent"":null}",2021-06-25-18
9997,78d3480d-9996-4898-bf7c-758fb537bc47:161171639...,disqus,c1ipjij836q47k1,189.190.24.194,https://www3.animeflv.net/ver/kobayashi-san-ch...,android,10,chrome,Mozilla/5.0 (Linux; Android 10; Nokia 5.1) App...,87.0.4280.101,1.624644e+09,"{""country"":""MX"",""subdivision"":""PUE"",""coordinat...",,"{""clients"":{""appnexus"":null,""disqus"":""c1ipjij8...","{""gdpr"":null,""gdpr_consent"":null}",2021-06-25-18
9998,189a1f3a-17ea-4164-817e-d4224074ffb4:162454615...,disqus,c46m0jf320kdmkd,167.249.168.254,https://www.animesgratisbr.biz/assistir/one-pi...,android,10,chrome,Mozilla/5.0 (Linux; Android 10; SAMSUNG SM-A30...,87.0.4280.141,1.624644e+09,"{""country"":""BR"",""subdivision"":""MA"",""coordinate...",,"{""clients"":{""appnexus"":null,""disqus"":""c46m0jf3...","{""gdpr"":null,""gdpr_consent"":null}",2021-06-25-18


In [53]:
count = 0
for sizmek_id in sizmek_data.df["user_id"]:
    break
    if str(sizmek_id) in zync_data.df["client_id"]:
        print(sizmek_id)
        count += 1


In [54]:
count

0