# Graph Matching Consensus
This version (4) uses a custom `pytorch_geometric` Dataset.

## Import and Initialize

In [1]:
from typing import Optional, Callable, List, Dict, Tuple
import os

import dgmc
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import torch
from torch import Tensor
import funcs
import pytorch_lightning as pl
from torch.nn import Linear
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import torch_geometric


%reload_ext autoreload
%autoreload 2

device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Load Dataset

In [2]:
class ZetaDataset(torch_geometric.data.InMemoryDataset):
    def __init__(self, root: str, column: str, label: str, feature_cols=None,
                 parse_url=False, expand_x=None, transform=None, pre_transform=None
        ):
        self.root = root
        self.column = column
        self.label = label
        self.feature_cols = feature_cols
        self.parse_url = parse_url
        self.expand_x = expand_x
        super(ZetaDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])
        print("Removing processed file. . .")
        os.remove(self.processed_paths[0])

    @property
    def raw_file_names(self) -> List[str]:
        return [
            "sizmek_bidstream_raw_20210625_10k.csv", 
            "zync_session_tracking_orc_20210625_10k.csv"
        ]

    @property
    def processed_file_names(self):
        return ["ZetaDataset.pt"]

    def download(self):
        pass

    def process(self):
        sizmek_path = os.path.join(self.root, self.raw_file_names[0])
        zync_path = os.path.join(self.root, self.raw_file_names[1])
        
        print("Loading", sizmek_path)
        x1, edge_index1 = self.process_graph(sizmek_path, self.column[0], self.feature_cols[0])
        x2, edge_index2 = self.process_graph(zync_path, self.column[1], self.feature_cols[1], 596)

        train_y = self.process_y()
        test_y = self.process_y()

        data = Data(x1=x1, edge_index1=edge_index1, x2=x2,
                    edge_index2=edge_index2, train_y=train_y,
                    test_y=test_y)
        torch.save(self.collate([data]), self.processed_paths[0])

    def process_graph(self, file_path, column: str, feature_cols: List, expand_x: int=None):
        print(f"Processing graph for {file_path} on {column}")
        df = pd.read_csv(file_path, low_memory=False)
        df.columns = [i.split(".")[1] for i in df.columns]

        # parse URLs
        if self.parse_url == True and column in ["url", "referrer"]:
            df[column] = df[column].apply(
                lambda x:urlparse(x).netloc if pd.notnull(x) else x
            )

        # Encode features
        feature_enc = OneHotEncoder(handle_unknown="ignore")
        features = pd.DataFrame(
            feature_enc.fit_transform(df[feature_cols]).toarray(), 
            columns=feature_enc.get_feature_names(feature_cols)
        )
        if expand_x is not None:
            print(f"Expanding X to {expand_x}")
            new_cols = [f"fake_{expand_x-i}" for i in range(expand_x - features.shape[1])][::-1]
            for col in new_cols:
                features[col] = 0
        x = torch.tensor(features.values, dtype=torch.float)
        
        edges = funcs.connect_edges(df, column)
        edge_index = torch.tensor(
            edges[['source','target']].T.values, dtype=torch.long
        )

        return x, edge_index

    def process_y(self) -> Tensor:
        y_1 = torch.tensor([range(0,2000)])[0]
        y_2 = torch.tensor([range(0,2000)])[0]
        train_y = torch.stack([y_1, y_2], dim=0)
        return train_y
    
    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(sizmek/zync)"

In [3]:
class SumEmbedding(object):
    def __call__(self, data):
        data.x1, data.x2 = data.x1.sum(dim=1), data.x2.sum(dim=1)
        return data


sizmek_cols = ["account_id", "referrer_url", "city_code",
               "state_code", "dma_code", "country_code"]
zync_cols = ["client", "user_agent_platform",
             "user_agent_language", "user_agent_browser"]


zeta_data = ZetaDataset(
    root="./data/",
    column=["url", "referrer"],
    label=["zeta_user_id", "client_id"],
    feature_cols=[sizmek_cols, zync_cols],
    parse_url=False,
    transform=SumEmbedding()
)
zeta_data

Processing...
Loading data/sizmek_bidstream_raw_20210625_10k.csv
Processing graph for data/sizmek_bidstream_raw_20210625_10k.csv on url
Create dict for url
Processing graph for data/zync_session_tracking_orc_20210625_10k.csv on referrer
Expanding X to 596


  features[col] = 0


Create dict for referrer
Done!
Removing processed file. . .


ZetaDataset(sizmek/zync)

In [4]:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--dim', type=int, default=256)
parser.add_argument('--rnd_dim', type=int, default=32)
parser.add_argument('--num_layers', type=int, default=3)
parser.add_argument('--num_steps', type=int, default=10)
parser.add_argument('--k', type=int, default=10)
args = parser.parse_args("")


psi_1 = dgmc.models.RelCNN(zeta_data.data.x1.size(-1), args.dim, args.num_layers, batch_norm=False,
               cat=True, lin=True, dropout=0.5)
psi_2 = dgmc.models.RelCNN(args.rnd_dim, args.rnd_dim, args.num_layers, batch_norm=False,
               cat=True, lin=True, dropout=0.0)

psi_1

RelCNN(596, 256, num_layers=3, batch_norm=False, cat=True, lin=True, dropout=0.5)

In [5]:
model = dgmc.models.DGMC(psi_1, psi_2, num_steps=None, k=args.k).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
data = zeta_data.data

def train():
    model.train()
    optimizer.zero_grad()

    _, S_L = model(data.x1, data.edge_index1, None, None, data.x2,
                   data.edge_index2, None, None, data.train_y)

    loss = model.loss(S_L, data.train_y)
    loss.backward()
    optimizer.step()
    return loss

@torch.no_grad()
def test():
    model.eval()

    _, S_L = model(data.x1, data.edge_index1, None, None, data.x2,
                   data.edge_index2, None, None)

    hits1 = model.acc(S_L, data.test_y)
    hits10 = model.hits_at_k(10, S_L, data.test_y)

    return hits1, hits10

In [13]:
print('Optimize initial feature matching...')
model.num_steps = 0
for epoch in range(1, 6):
    if epoch == 3:
        print('Refine correspondence matrix...')
        model.num_steps = args.num_steps
        model.detach = True

    loss = train()

    #if epoch % 10 == 0 or epoch > 100:
    if True:
        hits1, hits10 = test()
        print((f'{epoch:03d}: Loss: {loss:.4f}, Hits@1: {hits1:.4f}, '
               f'Hits@10: {hits10:.4f}'))

Optimize initial feature matching...
S: torch.Size([10000, 596]), torch.Size([2, 2212814])
T: torch.Size([10000, 596]), torch.Size([2, 110023])
S: torch.Size([10000, 596]), torch.Size([2, 2212814])
T: torch.Size([10000, 596]), torch.Size([2, 110023])
001: Loss: 3.0940, Hits@1: 0.0000, Hits@10: 0.0025
S: torch.Size([10000, 596]), torch.Size([2, 2212814])
T: torch.Size([10000, 596]), torch.Size([2, 110023])
S: torch.Size([10000, 596]), torch.Size([2, 2212814])
T: torch.Size([10000, 596]), torch.Size([2, 110023])
002: Loss: 3.0814, Hits@1: 0.0000, Hits@10: 0.0030
Refine correspondence matrix...
S: torch.Size([10000, 596]), torch.Size([2, 2212814])
T: torch.Size([10000, 596]), torch.Size([2, 110023])
S: torch.Size([10000, 596]), torch.Size([2, 2212814])
T: torch.Size([10000, 596]), torch.Size([2, 110023])
003: Loss: 2.9844, Hits@1: 0.0010, Hits@10: 0.0040
S: torch.Size([10000, 596]), torch.Size([2, 2212814])
T: torch.Size([10000, 596]), torch.Size([2, 110023])
S: torch.Size([10000, 596]), 

KeyboardInterrupt: 

## Link Cookies

In [48]:
sizmek_path = "data/sizmek_bidstream_raw_20210825.csv"
zync_path = "data/zync_session_tracking_raw_20210825.csv"

sizmek_cols = ["account_id", "referrer_url", "city_code",
               "state_code", "dma_code", "country_code"]
zync_cols = ["client", "user_agent_platform",
             "user_agent_language", "user_agent_browser"]

# Load and Create Sizmek Data
sizmek_data = funcs.ZetaData(sizmek_path, "url", "zeta_user_id", sizmek_cols, parse_url=False)

# Load and Create Zync Data
zync_data = funcs.ZetaData(zync_path, "referrer", "client_id", zync_cols, parse_url=False, expand_x=596)

Filtering Zync for sizmek clients 99,999 -> 37,999


In [63]:
def check_len(x):
    return len(str(x))

sizmek_data.df['user_id'].apply(check_len).value_counts()

18    77995
17    20141
16     1749
15      114
Name: user_id, dtype: int64

In [50]:
zync_data.df["client_id"].apply(check_len).value_counts()

19    29809
18     8164
17       23
16        3
Name: client_id, dtype: int64

In [67]:
zync_data.df["client_id"].sample(5)

52460    1649795207585121800
19353    2809753598636800729
14107    1871597494038725103
5230     1871597497705077888
12660     875739028935293343
Name: client_id, dtype: object

In [68]:
sizmek_data.df["user_id"].sample(5)

96072    478178280695061649
18415     82157053064335430
5481      33954821492091540
616        3636596773267333
27965    125194804811511992
Name: user_id, dtype: int64

In [70]:
def append_z(x):
    return "zync::"+x

new_z = zync_data.df["session_id"].apply(append_z)
new_z.to_csv("data/zync_ids.csv")

In [72]:
zync_ids = set(zync_data.df["client_id"].apply(lambda x: x[1:]))

count = 0
for sizmek_id in sizmek_data.df["user_id"]:
    if str(sizmek_id) in zync_ids:
        count += 1
    
print(count)

0


## Labels

In [6]:
file_path = "s3://drose-sandbox/join_output"
labels_zync = pd.read_parquet(file_path)

In [8]:
labels_zync[labels_zync["destination"] == "sizmek"]

Unnamed: 0,session_id,client,client_id,source,cookie_source,destination,cookie_destination,last_updated
1,0001be29-f16b-4fe3-b83a-a5535ea2804f:159457497...,sizmek,969751667156161598,zync,0001be29-f16b-4fe3-b83a-a5535ea2804f:159457497...,sizmek,969751667156161598,2021-07-14 01:04:00.959
5,0001be29-f16b-4fe3-b83a-a5535ea2804f:159457497...,sizmek,969751667156161598,zync,0001be29-f16b-4fe3-b83a-a5535ea2804f:159457497...,sizmek,969751667156161598,2021-07-14 01:04:00.959
9,0001be29-f16b-4fe3-b83a-a5535ea2804f:159457497...,sizmek,969751667156161598,zync,0001be29-f16b-4fe3-b83a-a5535ea2804f:159457497...,sizmek,969751667156161598,2021-07-14 01:04:00.959
13,0001be29-f16b-4fe3-b83a-a5535ea2804f:159457497...,sizmek,969751667156161598,zync,0001be29-f16b-4fe3-b83a-a5535ea2804f:159457497...,sizmek,969751667156161598,2021-07-14 01:04:00.959
17,0001be29-f16b-4fe3-b83a-a5535ea2804f:159457497...,sizmek,969751667156161598,zync,0001be29-f16b-4fe3-b83a-a5535ea2804f:159457497...,sizmek,969751667156161598,2021-07-14 01:04:00.959
...,...,...,...,...,...,...,...,...
999942,78afed92-90be-4620-aec6-ee4aedf510d5:156502932...,sizmek,1192116895925446876,zync,78afed92-90be-4620-aec6-ee4aedf510d5:156502932...,sizmek,1192116895925446876,2021-07-20 01:13:36.584
999954,78afed92-90be-4620-aec6-ee4aedf510d5:156502932...,sizmek,1192116895925446876,zync,78afed92-90be-4620-aec6-ee4aedf510d5:156502932...,sizmek,1192116895925446876,2021-07-20 01:13:36.584
999966,78afed92-90be-4620-aec6-ee4aedf510d5:156502932...,sizmek,1192116895925446876,zync,78afed92-90be-4620-aec6-ee4aedf510d5:156502932...,sizmek,1192116895925446876,2021-07-20 01:13:36.584
999978,78afed92-90be-4620-aec6-ee4aedf510d5:156502932...,sizmek,1192116895925446876,zync,78afed92-90be-4620-aec6-ee4aedf510d5:156502932...,sizmek,1192116895925446876,2021-07-20 01:13:36.584


## SnowFlake

In [9]:
import sqlalchemy

class Snowflake:
    def __init__(self):
        with open(".creds") as f:
            creds = f.read().splitlines()
            creds = {
                "account": creds[0],
                "user": creds[1],
                "password": creds[2]
            }
        self.snowflake_credentials = creds

    def connection(self):
        uri = 'snowflake://{user}:{password}@{account}/'.format(**self.snowflake_credentials)
        return sqlalchemy.create_engine(uri)
    
with Snowflake().connection().begin() as conn:
    result = conn.execute(
        """SELECT 6,6
        """
    )
df = pd.DataFrame(result.fetchall())
df.columns = result.keys()
df

In [None]:
# Create a function to read from s3 using python:
def read_s3(path: str) -> pd.DataFrame:
    """
    Reads a csv file from s3 into a pandas dataframe.
    
    :param path: The s3 location of the file.
    :return: A pandas dataframe.
    """
    obj = s3.get_object(Bucket='data-eng-capstone', Key=path)
    return pd.read_csv(io.BytesIO(obj['Body'].read()))
# Read in the data from s3:
df = read_s3('data/raw/df_raw.csv')

In [12]:
pd.read_parquet("s3://zeta-dcp-prod-private-tables/datacloud_cookie_cookies_relations_links_export/", engine='pyarrow')

ImportError: Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.

In [18]:
import boto3

s3 = boto3.resource('s3')
bucket = s3.Bucket('zeta-dcp-prod-private-tables')
# Iterates through all the objects, doing the pagination for you. Each obj
# is an ObjectSummary, so it doesn't contain the body. You'll need to call
# get to get the whole body.
for obj in bucket.objects.all():
    key = obj.key
    body = obj.get()['Body'].read()

InvalidObjectState: An error occurred (InvalidObjectState) when calling the GetObject operation: The operation is not valid for the object's storage class

In [19]:
def list_files(s3, bucket, prefix):
    """
    List files in specific S3 URL

    :param s3: S3 resource
    :param bucket: S3 bucket name
    :param prefix: S3 key path
    :return: list of files in the S3 URL
    """
    keys = []
    dirs = []
    next_token = ''
    base_kwargs = {
        'Bucket':bucket,
        'Prefix':prefix,
    }
    while next_token is not None:
        kwargs = base_kwargs.copy()
        if next_token != '':
            kwargs.update({'ContinuationToken': next_token})
        results = s3.list_objects_v2(**kwargs)
        contents = results.get('Contents')
        for i in contents:
            k = i.get('Key')
            if k[-1] != '/':
                keys.append(k)
            else:
                dirs.append(k)
        next_token = results.get('NextContinuationToken')
    return keys, dirs

list_files(s3, "zeta-dcp-prod-private-tables", "datacloud_cookie_cookies_relations_links_export")

AttributeError: 's3.ServiceResource' object has no attribute 'list_objects_v2'

In [None]:
s3://zeta-dcp-prod-private-tables/datacloud_cookie_cookies_relations_links_export/