In [1]:
import pandas as pd
import numpy as np
import random

from ampligraph.evaluation import train_test_split_no_unseen

import config

In [2]:
# Fractions of the full dataset assigned to each split.
# Only VAL_SIZE and TEST_SIZE are passed to the splitter below; the training
# split is whatever remains after those two are carved out.
VAL_SIZE = 0.05
TEST_SIZE = 0.1
TRAIN_SIZE = 0.85

#### Loading dataset .csv file

In [3]:
# Read every column as a generic object (no numeric inference), rename the
# three columns to subject/predicate/object, and tag each row with the
# positive label '1'.
data_path = "./data/vn_dataset_version/vn_all_triple.csv"
dataset = (
    pd.read_csv(data_path, dtype=object)
    .set_axis(['subject', 'predicate', 'object'], axis=1)
    .assign(label='1')
)
dataset.head(5)


Unnamed: 0,subject,predicate,object,label
0,hỗ trợ phí vận chuyển combo 2 thuốc chữa cảm c...,có sản phẩm,thuốc chữa cảm cúm,1
1,fenty beauty - bắt sáng dạng lỏng liquid diamo...,có sản phẩm,bắt sáng,1
2,tinh dầu massage baby oil sesam street nhập kh...,có sản phẩm,tinh dầu massage,1
3,flycam,có tên gọi,jjrc x11,1
4,"bikini, đồ bơi nữ, bộ bikini phong cách hàn cạ...",có họa tiết,họa tiết hình học,1


#### Splitting into train, test, and validation sets
###### Using the train_test_split_no_unseen function, which ensures that the entities in the test and validation sets are "seen" in the training set

In [4]:
DATASET_SIZE = len(dataset)

# Both hold-out sizes are absolute row counts measured against the FULL
# dataset, not against what is left after the previous split.
n_valid_rows = int(VAL_SIZE * DATASET_SIZE)
n_test_rows = int(TEST_SIZE * DATASET_SIZE)
all_rows = dataset.to_numpy(dtype=str)

# First carve out the validation rows, then take the test rows from the rest.
test_train_set, valid_set = train_test_split_no_unseen(
    all_rows,
    test_size=n_valid_rows,
    seed=config.SEED,
    allow_duplication=True,
)
train_set, test_set = train_test_split_no_unseen(
    test_train_set,
    test_size=n_test_rows,
    seed=config.SEED,
    allow_duplication=True,
)

In [5]:
# Number of rows (triples) in the loaded dataset.
dataset.shape[0]

173021

##### Save to the .csv file

In [6]:
# Wrap each split array in a DataFrame with the same four named columns.
split_columns = ['subject', 'predicate', 'object', 'label']

raw_train_df = pd.DataFrame(train_set, columns=split_columns)
raw_test_df = pd.DataFrame(test_set, columns=split_columns)
raw_val_df = pd.DataFrame(valid_set, columns=split_columns)


In [7]:
# Persist the raw (positive-only) splits as CSV files without the index column.
#
# The training split goes to its own raw_train.csv. It must NOT be written to
# entity_and_title.txt: that file is read further down as the entity
# vocabulary (one entity per line), and overwriting it with CSV rows would
# corrupt the vocabulary used for sampling corruptions.
raw_train_file_path = "./data/vn_dataset_version/raw_train.csv"
raw_train_df.to_csv(raw_train_file_path, index=False)

raw_test_file_path = "./data/vn_dataset_version/raw_test.csv"
raw_test_df.to_csv(raw_test_file_path, index=False)

raw_val_file_path = "./data/vn_dataset_version/raw_val.csv"
raw_val_df.to_csv(raw_val_file_path, index=False)

#### Creating corruptions

In [8]:
# Number of corrupted (negative) triples generated per true triple, per split.
VAL_CORRUPTION_RATIO = 1
TEST_CORRUPTION_RATIO = 1
TRAIN_CORRUPTION_RATIO = 5


In [9]:
# Every (subject, predicate, object) triple in the full dataset, label dropped.
# Used as the lookup for "this triple is a known true fact".
npdataset = dataset.to_numpy(dtype=str)
all_triple = {(subj, pred, obj) for subj, pred, obj, _label in npdataset}

In [10]:
# All entities: one per line of the file, surrounding whitespace stripped,
# blank lines ignored.
with open("./data/vn_dataset_version/entity_and_title.txt", "r", encoding='utf8') as f:
    entities = {raw_line.strip() for raw_line in f}
    entities.discard('')

In [11]:
def create_corruption(data_arr, entities, all_triple, ratio):
    """Generate corrupted (negative) triples for every row of ``data_arr``.

    For each input row, ``ratio`` corruptions are produced. Each corruption
    replaces either the head or the tail (chosen with ``random.random() <= 0.5``
    meaning "head") by a randomly drawn entity, keeping the relation fixed.
    A candidate is rejected and redrawn when it equals the entity being
    replaced or when the resulting triple is present in ``all_triple``.

    Parameters
    ----------
    data_arr : iterable of 4-item rows ``(head, rel, tail, label)``
        The true triples to corrupt. The incoming label is ignored.
    entities : set of str
        Pool of entities to draw replacements from.
    all_triple : set of ``(head, rel, tail)`` tuples
        Known true triples; no returned row matches one of these.
    ratio : int
        Number of corruptions to generate per input row.

    Returns
    -------
    numpy.ndarray
        String array of shape ``(len(data_arr) * ratio, 4)`` whose rows are
        ``[head, rel, tail, '0']``. An empty input (or ``ratio == 0``) yields
        an empty array of shape ``(0, 4)`` so it can still be concatenated
        with a 2-D split array.

    Raises
    ------
    ValueError
        If the pool contains no entity other than the one being replaced.

    Notes
    -----
    Uses the global ``random`` module state; call ``random.seed(...)``
    beforehand for repeatable output. Sampling loops until a valid candidate
    is found, so it does not terminate if every possible replacement yields a
    known triple.
    """
    # Build the candidate list ONCE. Sorting gives a process-independent order
    # (set iteration order of strings varies between interpreter runs), so a
    # seeded RNG maps to the same entities every time.
    entity_pool = sorted(entities)
    pool_size = len(entity_pool)

    # Stored as a string: the rows are all-string arrays, matching what NumPy
    # would produce anyway when coercing a mixed str/int row.
    corrupt_label = '0'

    corrupt_rows = []
    for head, rel, tail, _label in data_arr:
        for _ in range(ratio):
            corrupt_head = random.random() <= 0.5
            original = head if corrupt_head else tail

            # Without this guard the rejection loop below could never exit.
            if pool_size == 0 or (pool_size == 1 and entity_pool[0] == original):
                raise ValueError(
                    "entity pool has no replacement candidate for %r" % (original,)
                )

            # Rejection sampling: uniform over the pool, skipping the original
            # entity and any candidate that would recreate a known triple.
            # `original` need not be a member of the pool.
            while True:
                candidate = random.choice(entity_pool)
                if candidate == original:
                    continue
                if corrupt_head:
                    triple = (candidate, rel, tail)
                else:
                    triple = (head, rel, candidate)
                if triple not in all_triple:
                    break

            corrupt_rows.append([triple[0], triple[1], triple[2], corrupt_label])

    if not corrupt_rows:
        # Keep the result 2-D so np.concatenate with a split array still works.
        return np.empty((0, 4), dtype=str)
    return np.array(corrupt_rows)
                    

##### Creating corruptions for each "set" with different true triple/corruption ratio


In [12]:
# Seed the stdlib RNG that create_corruption draws from, so the sequence of
# random draws is repeatable across runs of this cell.
random.seed(config.SEED)


def _merge_and_sort(corruptions, positives):
    """Stack corrupted rows on top of the true rows and order by column 0 (subject)."""
    merged = np.concatenate((corruptions, positives))
    return merged[merged[:, 0].argsort()]


train_corrupt_arr = create_corruption(train_set, entities, all_triple, TRAIN_CORRUPTION_RATIO)
train_with_corr = _merge_and_sort(train_corrupt_arr, train_set)

test_corrupt_arr = create_corruption(test_set, entities, all_triple, TEST_CORRUPTION_RATIO)
test_with_corr = _merge_and_sort(test_corrupt_arr, test_set)

val_corrupt_arr = create_corruption(valid_set, entities, all_triple, VAL_CORRUPTION_RATIO)
val_with_corr = _merge_and_sort(val_corrupt_arr, valid_set)

In [13]:
# Write each split (true triples plus corruptions) to its own CSV, no index column.
corr_columns = ['subject', 'predicate', 'object', 'label']

train_with_corr_file_path = "./data/vn_dataset_version/train_with_corr.csv"
test_with_corr_file_path = "./data/vn_dataset_version/test_with_corr.csv"
val_with_corr_file_path = "./data/vn_dataset_version/val_with_corr.csv"

train_with_corr_df = pd.DataFrame(train_with_corr, columns=corr_columns)
train_with_corr_df.to_csv(train_with_corr_file_path, index=False)

test_with_corr_df = pd.DataFrame(test_with_corr, columns=corr_columns)
test_with_corr_df.to_csv(test_with_corr_file_path, index=False)

val_with_corr_df = pd.DataFrame(val_with_corr, columns=corr_columns)
val_with_corr_df.to_csv(val_with_corr_file_path, index=False)