# Pre-setting if using Google Colab

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd drive/MyDrive/Graph_Link_Prediction/biggraph/

/content/drive/MyDrive/Graph_Link_Prediction/biggraph


# Training

In [None]:
! pip install torchbiggraph

In [55]:
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE.txt file in the root directory of this source tree.

import argparse
import random
from itertools import chain
from pathlib import Path

import attr
import pkg_resources

from torchbiggraph.config import add_to_sys_path, ConfigFileLoader
from torchbiggraph.converters.import_from_tsv import convert_input_data
from torchbiggraph.converters.utils import download_url, extract_gzip
from torchbiggraph.eval import do_eval
from torchbiggraph.train import train
from torchbiggraph.util import (
    set_logging_verbosity,
    setup_logging,
    SubprocessInitializer,
)


TRAIN_FILENAME = "train.txt"
TEST_FILENAME = "test.txt"
FILENAMES = [
    TRAIN_FILENAME,
    TEST_FILENAME,
]
TRAIN_FRACTION = 0.75

# Figure out the path where the sample config was installed by the package manager.
# This can be overridden with --config.
DEFAULT_CONFIG = '/content/drive/MyDrive/Graph_Link_Prediction/biggraph/config/config_50_epochs_ranking.py'


def random_split_file(fpath: Path) -> None:
    train_file = fpath.parent / TRAIN_FILENAME
    test_file = fpath.parent / TEST_FILENAME

    if train_file.exists() and test_file.exists():
        print("Found some files that indicate that the input data "
              "has already been shuffled and split, not doing it again.")
        print(f"These files are: {train_file} and {test_file}")
        return

    print('Shuffling and splitting train/test file. This may take a while.')

    print(f"Reading data from file: {fpath}")
    with fpath.open("rt") as in_tf:
        lines = in_tf.readlines()

    # The first few lines are comments
    #lines = lines[4:]
    print('Shuffling data')
    random.shuffle(lines)
    split_len = int(len(lines) * TRAIN_FRACTION)

    print('Splitting to train and test files')
    with train_file.open("wt") as out_tf_train:
        for line in lines[:split_len]:
            out_tf_train.write(line)

    with test_file.open("wt") as out_tf_test:
        for line in lines[split_len:]:
            out_tf_test.write(line)


def main():
    setup_logging()



    # download data
    path = './data/statml/'
    data_dir = Path(path)
    
    training_data = './data/statml/edges_no_duplicate.txt'
    fpath = Path(training_data)

    # random split file for train and test
    random_split_file(fpath)

    loader = ConfigFileLoader()
    config = loader.load_config(DEFAULT_CONFIG, overrides=None)
    set_logging_verbosity(config.verbose)
    subprocess_init = SubprocessInitializer()
    subprocess_init.register(setup_logging, config.verbose)
    subprocess_init.register(add_to_sys_path, loader.config_dir.name)
    input_edge_paths = [data_dir / name for name in FILENAMES]
    output_train_path, output_test_path = config.edge_paths

    convert_input_data(
        config.entities,
        config.relations,
        config.entity_path,
        config.edge_paths,
        input_edge_paths,
        lhs_col=0,
        rhs_col=1,
        rel_col=None,
        dynamic_relations=config.dynamic_relations,
    )

    train_config = attr.evolve(config, edge_paths=[output_train_path])
    train(train_config, subprocess_init=subprocess_init)

    eval_config = attr.evolve(config, edge_paths=[output_test_path])
    do_eval(eval_config, subprocess_init=subprocess_init)


if __name__ == "__main__":
    main()

Shuffling and splitting train/test file. This may take a while.
Reading data from file: data/statml/edges_no_duplicate.txt
Shuffling data
Splitting to train and test files
Using the 1 relation types given in the config
Searching for the entities in the edge files...
Entity type user_id:
- Found 3767 entities
- Removing the ones with fewer than 1 occurrences...
- Left with 3767 entities
- Shuffling them...
Preparing counts and dictionaries for entities and relation types:
- Writing count of entity type user_id and partition 0
Preparing edge path data/train_partitioned, out of the edges found in data/statml/train.txt
- Edges will be partitioned in 1 x 1 buckets.
- Processed 24052 edges in total
Preparing edge path data/test_partitioned, out of the edges found in data/statml/test.txt
- Edges will be partitioned in 1 x 1 buckets.
- Processed 8018 edges in total
2021-04-07 06:37:17,139   [Trainer-0] Loading entity counts...
2021-04-07 06:37:17,148   [Trainer-0] Creating workers...
2021-04-0

#Test

In [56]:
import json
import os
import h5py
import torch
from torchbiggraph.model import ComplexDiagonalDynamicOperator, DotComparator

with open("./data/statml/entity_names_user_id_0.json", "rt") as tf:
    dictionary = json.load(tf)

print(dictionary)
count = 0
def perform_prediction(source_id, target_id):
  try:
    offset_source = dictionary.index(source_id)
    #print("our offset for source_id " , source_id, " is: ", offset_source)
    offset_target = dictionary.index(target_id)
    #print("our offset for target_id " , target_id, " is: ", offset_target)
        
    with h5py.File("./model/statml_ranking_50_epochs_full_eval/embeddings_user_id_0.v20.h5", "r") as hf:
        embedding_source = hf["embeddings"][offset_source, :]
        embedding_target = hf["embeddings"][offset_target, :]


    comparator = DotComparator()
    comparator = DotComparator()
    score, _, _= comparator(
        comparator.prepare(torch.tensor(embedding_source.reshape([1,1,1024]))),
        comparator.prepare(torch.tensor(embedding_target.reshape([1,1,1024]))),
        torch.empty(1,0,1024),
        torch.empty(1,0,1024),
    )
    return score
  except:
    return 0.0


['3182', '117', '56', '1779', '629', '993', '461', '2837', '2900', '261', '2133', '3492', '201', '1306', '2783', '3691', '2753', '2110', '3862', '2049', '3031', '692', '3277', '3114', '1330', '958', '1794', '224', '3698', '2252', '2733', '245', '1021', '716', '372', '355', '2951', '431', '203', '4080', '1052', '1020', '3624', '1009', '792', '2939', '1', '3281', '1449', '1913', '320', '3547', '2927', '98', '906', '2439', '2157', '2907', '2574', '1661', '2854', '2852', '251', '2112', '3863', '1397', '813', '1284', '1795', '228', '2051', '470', '279', '3506', '1062', '1347', '3666', '3064', '1073', '890', '3815', '1041', '3054', '708', '4044', '3453', '2798', '2443', '2037', '2247', '1814', '1487', '2018', '4036', '3056', '753', '1067', '3069', '1260', '522', '4021', '1947', '2200', '101', '2570', '3581', '1629', '859', '3824', '1558', '3493', '23', '408', '3077', '353', '105', '1016', '2763', '1139', '742', '3582', '3967', '2150', '3386', '1454', '2933', '212', '4058', '3755', '2681', '3

In [57]:
# generate test samples:
test_samples = []
with open('data/test-public.csv', 'r') as f:
    next(f)
    for line in f:
        _, h, t = line.split(',')
        t = t[:len(t)-1]
        test_samples.append((h, t))

In [58]:
import numpy as np
def sigmoid(x):
    """ Sigmoid activation function
    :param x: scalar value
    :return: sigmoid activation
    """
    return 1 / (1 + np.exp(-x))
y_pred = []

n = len(test_samples)
i = 0
for i in range(0, n):
    sample = test_samples[i]
    #print(sample)
    pred = perform_prediction(str(sample[0]), str(sample[1]))
    #pred = pred.item()
    y_pred.append(pred)
    #print(pred)
    
    i += 1

proba = []
for element in y_pred:
    prob = sigmoid(element)
    proba.append(prob)



In [59]:

print(proba[:100])

print(proba[10])
final_res = []
for i in range(len(proba)):
    if proba[i] == 0.5:
        final_res.append(0.5)
        continue
    sz_test= proba[i][0][0].tolist()
    final_res.append(sz_test)

print(final_res)


[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
0.5
[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,

In [60]:


import csv
with open("DCN_2nd.csv","w",newline="") as csvfile:
    writer=csv.writer(csvfile)
    writer.writerow(["Id","Predicted"])
    test_id=1
    for prediction in final_res:
        writer.writerow([test_id,prediction])
        test_id+=1