### Load libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# standard libraries
import pandas as pd
import numpy as np
import shutil
import os
from tqdm import tqdm
import glob
import pickle

In [2]:
## csv files
# Root folder that holds one sub-directory per dataset.
dataset_table_dir_path = "data/datasets"

# Every entry of the root folder is treated as a dataset name. The listing is
# sorted because os.listdir returns entries in arbitrary order and later cells
# select a dataset by position (datasets_name[0]).
datasets_name = sorted(os.listdir(dataset_table_dir_path))

# Relative path '<dataset>/exp_data' for each dataset. The names above are
# taken straight from the directory listing instead of being recovered by
# splitting this path on a backslash, which only worked on Windows.
datasets_path = [os.path.join(dataset_name, 'exp_data') for dataset_name in datasets_name]

In [3]:
# Display the dataset directory names found under dataset_table_dir_path.
datasets_name

['abt_buy_exp_data',
 'dirty_dblp_acm_exp_data',
 'dirty_dblp_scholar_exp_data',
 'dirty_itunes_amazon_exp_data',
 'dirty_walmart_amazon_exp_data']

### Load data

In [4]:
# Root folder of the saved embedding files, organised as
# <dir_path>/<dataset>/<split>/*.pt
dir_path = 'embeddings/train_test'

# This notebook works on the first dataset of the list only.
dataset_name = datasets_name[0]

# glob.glob returns matches in arbitrary order, but a later cell relies on
# index 0 / index 1 of these lists being table A / table B. Sorting makes the
# order deterministic.
# NOTE(review): this assumes the table-A file name sorts before the table-B
# file name - confirm against the actual file names on disk.
train_path = os.path.join(dir_path, dataset_name, 'train')
train_files = sorted(glob.glob(f'{train_path}/*.pt'))
valid_path = os.path.join(dir_path, dataset_name, 'valid')
valid_files = sorted(glob.glob(f'{valid_path}/*.pt'))

# Labelled record pairs for each split; the root folder is the one defined in
# the dataset-discovery cell rather than a second copy of the same literal.
train = pd.read_csv(os.path.join(dataset_table_dir_path, dataset_name, 'exp_data/train.csv'))
valid = pd.read_csv(os.path.join(dataset_table_dir_path, dataset_name, 'exp_data/valid.csv'))

In [5]:
# Inspect every validation pair whose rtable_id equals 716.
valid[valid.rtable_id == 716]

Unnamed: 0,ltable_id,rtable_id,label
716,733,716,0
934,619,716,0
1764,561,716,0
1778,715,716,0
1786,546,716,0
1830,713,716,0


In [6]:
# Folder containing one pickled mapping per (split, table) combination.
MAPPINGS_DIR = 'embeddings/train_test/mappings'


def load_id_to_row_mapping(split, table, dataset='abt_buy'):
    """Load one pickled mapping and return it with keys and values swapped.

    Parameters
    ----------
    split : str
        Split name used in the file name, e.g. 'train' or 'valid'.
    table : str
        Table name used in the file name, e.g. 'tableA' or 'tableB'.
    dataset : str
        Dataset name used in the file name.

    Returns
    -------
    dict
        The stored dictionary inverted (stored values become keys). Later
        cells look up ltable_id / rtable_id values in it and use the result
        as a row index into the embedding tensors.
        NOTE(review): presumably the pickle stores row index -> record id;
        if two stored keys share a value, only the last one survives the
        inversion.
    """
    path = f'{MAPPINGS_DIR}/mapping_split_{split}_ds_{dataset}_table_{table}_with_labels_0.pickle'
    # NOTE(security): unpickling can execute arbitrary code - only load
    # mapping files that come from a trusted source.
    stored = pd.read_pickle(path)
    return {value: key for key, value in stored.items()}


train_mappingA = load_id_to_row_mapping('train', 'tableA')
train_mappingB = load_id_to_row_mapping('train', 'tableB')

valid_mappingA = load_id_to_row_mapping('valid', 'tableA')
valid_mappingB = load_id_to_row_mapping('valid', 'tableB')

In [7]:
def _load_embedding_tensor(path):
    """Read a saved NumPy array with torch.load and wrap it as a tensor."""
    return torch.from_numpy(torch.load(path))


# Position 0 of each file list is used as table A, position 1 as table B.
tableA_train = _load_embedding_tensor(train_files[0])
tableB_train = _load_embedding_tensor(train_files[1])

tableA_valid = _load_embedding_tensor(valid_files[0])
tableB_valid = _load_embedding_tensor(valid_files[1])

In [8]:
# Show the shape of each loaded embedding tensor.
tableA_train.shape, tableB_train.shape, tableA_valid.shape, tableB_valid.shape

(torch.Size([973, 4096]),
 torch.Size([956, 4096]),
 torch.Size([728, 4096]),
 torch.Size([702, 4096]))

In [9]:
def build_pair_features(pairs, left_embeddings, right_embeddings, left_row_of, right_row_of):
    """Build one feature row per record pair by joining both embeddings.

    Parameters
    ----------
    pairs : pandas.DataFrame
        Must contain the columns 'ltable_id' and 'rtable_id'.
    left_embeddings, right_embeddings : torch.Tensor
        2-D tensors with one embedding per row.
    left_row_of, right_row_of : dict
        Map an id from 'ltable_id' / 'rtable_id' to the row of the
        corresponding embedding tensor.

    Returns
    -------
    torch.Tensor
        Tensor with one row per pair: the left embedding followed by the
        right embedding.

    Raises
    ------
    KeyError
        If an id in `pairs` is missing from its mapping.
    """
    # Resolve all row indices first, then gather the embeddings with a single
    # indexing operation per table instead of building one tensor per row
    # through DataFrame.apply.
    left_rows = torch.tensor([left_row_of[i] for i in pairs['ltable_id']], dtype=torch.long)
    right_rows = torch.tensor([right_row_of[i] for i in pairs['rtable_id']], dtype=torch.long)
    return torch.cat([left_embeddings[left_rows], right_embeddings[right_rows]], dim=1)


X_tr = build_pair_features(train, tableA_train, tableB_train, train_mappingA, train_mappingB)
X_val = build_pair_features(valid, tableA_valid, tableB_valid, valid_mappingA, valid_mappingB)

# Labels keep the dtype of the CSV column.
y_tr = torch.tensor(train.label.to_numpy())
y_val = torch.tensor(valid.label.to_numpy())

In [10]:
# Show the shapes of the feature matrices and label vectors.
X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

(torch.Size([5743, 8192]),
 torch.Size([1916, 8192]),
 torch.Size([5743]),
 torch.Size([1916]))

In [95]:
class Model(nn.Module):
    """Feed-forward network: linear -> ReLU -> linear -> sigmoid."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear layers and the output activation.

        input_size is the number of input features, hidden_size the width of
        the hidden layer and output_size the number of output units.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid-squashed outputs (values in (0, 1)) for input x."""
        hidden = F.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

In [96]:
from torch.utils.data import TensorDataset, DataLoader

# Mini-batch size shared by the training and validation loaders.
BATCH_SIZE = 128
# Use at most 15 loader worker processes, but never more than the machine has
# CPUs (os.cpu_count() may return None, hence the fallback to 1).
NUM_WORKERS = min(15, os.cpu_count() or 1)

# Training pairs are reshuffled every epoch.
train_dataset = TensorDataset(X_tr, y_tr)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)

# Validation pairs are iterated in their stored order.
val_dataset = TensorDataset(X_val, y_val)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)

In [97]:
from lightning.pytorch import Trainer, seed_everything
import torch.optim as optim
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping
from sklearn.metrics import f1_score

In [103]:
class MyLitModel(pl.LightningModule):
    """Lightning wrapper that trains `Model` with binary cross-entropy.

    Logged metrics: 'train_loss' and 'train_f1' (per training batch),
    'val_loss' (per validation batch) and 'val_f1' (once per validation
    epoch, computed over all validation predictions of that epoch).
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Build the wrapped network and the loss function."""
        super().__init__()
        self.model = Model(input_size, hidden_size, output_size)
        self.loss_fn = nn.BCELoss()
        # Per-batch validation predictions / labels, collected so that F1 can
        # be computed on the whole validation set at the end of the epoch.
        self._val_preds = []
        self._val_targets = []

    def forward(self, x):
        """Return the wrapped model's output for x."""
        return self.model(x)

    def _loss_and_predictions(self, batch):
        """Return (loss, hard 0/1 predictions, labels) for one batch.

        Predictions and labels are returned as flat CPU integer tensors.
        """
        x, y = batch
        out = self(x)
        # BCELoss needs a floating-point target shaped like the prediction;
        # integer labels would raise a dtype error.
        target = y.view(-1, 1).to(out.dtype)
        loss = self.loss_fn(out, target)
        preds = (out.detach() >= 0.5).int().view(-1).cpu()
        labels = y.detach().view(-1).int().cpu()
        return loss, preds, labels

    def training_step(self, batch, batch_idx):
        """Compute and log the loss and F1 of one training batch."""
        loss, preds, labels = self._loss_and_predictions(batch)
        self.log('train_loss', loss)
        # f1_score expects (y_true, y_pred); zero_division=0 returns 0
        # silently for batches where the score is undefined.
        f1 = f1_score(labels.numpy(), preds.numpy(), zero_division=0)
        self.log("train_f1", f1)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the loss of one validation batch and store its predictions."""
        loss, preds, labels = self._loss_and_predictions(batch)
        self.log("val_loss", loss)
        self._val_preds.append(preds)
        self._val_targets.append(labels)

    def on_validation_epoch_end(self):
        """Log F1 over every validation sample seen in this epoch.

        An average of per-batch F1 scores is not the F1 of the validation
        set, so the score is computed once over all stored predictions.
        """
        if not self._val_preds:
            return
        preds = torch.cat(self._val_preds).numpy()
        labels = torch.cat(self._val_targets).numpy()
        # Reset the buffers so the next validation run starts empty.
        self._val_preds.clear()
        self._val_targets.clear()
        self.log("val_f1", f1_score(labels, preds, zero_division=0))

    def configure_optimizers(self):
        """Use Adam with a learning rate of 0.001 on all parameters."""
        return torch.optim.Adam(self.parameters(), lr=0.001)

In [104]:
# Fix the random seeds before the model is created so that weight
# initialisation and batch shuffling are repeatable across runs.
pl.seed_everything(42, workers=True)

# Stop when 'val_f1' has not improved (higher is better) for 5 validation runs.
early_stopping = EarlyStopping(monitor='val_f1', patience=5, mode='max')

# accelerator="auto" lets Lightning pick the available hardware instead of
# requiring a GPU, so the notebook also runs on machines without one.
trainer = pl.Trainer(max_epochs=50, devices=1, accelerator="auto", callbacks=[early_stopping])

# Input size is two concatenated 4096-dimensional embeddings; one output unit.
model = MyLitModel(input_size = 2*4096, hidden_size = 512, output_size = 1)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Train the model on the training loader, validating on the validation loader.
trainer.fit(model, train_dataloader, val_dataloader)