In [1]:
import pandas as pd
import numpy as np
import pathlib
import os
import random
from sklearn.utils import check_random_state

In [2]:
# Read every raw split as a headerless, space-separated table.
# In the visible notebook these frames are only inspected for their shapes.
_csv_opts = {"sep": ' ', "header": None}
df_train = pd.read_csv("training_1000.nt", **_csv_opts)
df_unseen = pd.read_csv("unseen_entities_1000.txt", **_csv_opts)
df_val = pd.read_csv("validation_1000.nt", **_csv_opts)
df_test = pd.read_csv("test_1000.nt", **_csv_opts)
df_aux = pd.read_csv("auxiliary_1000.nt", **_csv_opts)

In [3]:
# Report the (rows, columns) shape of every raw split.
for _label, _frame in (
    ("df_train.shape", df_train),
    ("df_unseen.shape", df_unseen),
    ("df_val.shape", df_val),
    ("df_test.shape", df_test),
    ("df_aux.shape", df_aux),
):
    print(_label, _frame.shape)

df_train.shape (129743, 4)
df_unseen.shape (1000, 1)
df_val.shape (5000, 4)
df_test.shape (1000, 4)
df_aux.shape (10529, 4)


In [4]:
def create_mappings(X):
    """Build label-to-index lookup tables for relations and entities.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array of statements; column 0 holds subjects, column 1
        relations and column 2 objects.

    Returns
    -------
    (dict, dict)
        ``rel_to_idx`` and ``ent_to_idx`` (in that order). Indices are
        assigned following the sorted order returned by ``np.unique``.
    """
    subjects_and_objects = np.concatenate((X[:, 0], X[:, 2]))
    entities = np.unique(subjects_and_objects)
    relations = np.unique(X[:, 1])

    ent_to_idx = {label: position for position, label in enumerate(entities)}
    rel_to_idx = {label: position for position, label in enumerate(relations)}
    return rel_to_idx, ent_to_idx

In [5]:
def to_idx(X, ent_to_idx=None, rel_to_idx=None):
    """Convert labelled (subject, predicate, object) statements to indices.

    Parameters
    ----------
    X : 2-D array-like
        One statement per row; columns 0, 1 and 2 hold the subject,
        predicate and object labels. Any further columns are ignored.
    ent_to_idx : dict
        Maps entity labels (subjects and objects) to indices. Required.
    rel_to_idx : dict
        Maps relation labels (predicates) to indices. Required.

    Returns
    -------
    numpy.ndarray of shape (n, 3)
        Rows of ``[subject_idx, predicate_idx, object_idx]``. An empty
        input yields an empty ``(0, 3)`` integer array.

    Raises
    ------
    KeyError
        If a label in ``X`` is missing from the corresponding mapping.
    """
    X = np.asarray(X)
    # np.vectorize cannot be called on size-0 input, so handle it up front.
    if X.size == 0:
        return np.empty((0, 3), dtype=int)

    def _lookup(labels, mapping, kind):
        # Plain indexing (not dict.get) so an unknown label fails loudly
        # instead of silently becoming None inside the index array.
        try:
            return [mapping[label] for label in labels]
        except KeyError as err:
            raise KeyError(
                "unknown {} label: {!r}".format(kind, err.args[0])
            ) from None

    x_idx_s = _lookup(X[:, 0], ent_to_idx, "entity")
    x_idx_p = _lookup(X[:, 1], rel_to_idx, "relation")
    x_idx_o = _lookup(X[:, 2], ent_to_idx, "entity")

    return np.column_stack((x_idx_s, x_idx_p, x_idx_o))

In [6]:
def load_from_csv(folder_name, file_name, sep=' ', header=None):
    """Load a delimited text file as an array of unique string rows.

    Parameters
    ----------
    folder_name : str
        Directory that contains the file.
    file_name : str
        Name of the file inside ``folder_name``.
    sep : str, default ' '
        Column separator passed to ``pandas.read_csv``.
    header : int or None, default None
        Header row passed to ``pandas.read_csv``; ``None`` means the file
        has no header line.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per distinct line of the file; every column
        is read as ``str``.
    """
    # Use the folder argument rather than a notebook-level global, so the
    # function works regardless of which cells ran before it.
    df = pd.read_csv(os.path.join(folder_name, file_name),
                     sep=sep,
                     header=header,
                     names=None,
                     dtype=str)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same array and exists in old and new pandas alike.
    return df.drop_duplicates().values

In [7]:
# Load the de-duplicated string triples of each split from the working directory.
path = os.getcwd()
train, aux, val, test = (
    load_from_csv(path, _file_name)
    for _file_name in (
        'training_1000.nt',
        'auxiliary_1000.nt',
        'validation_1000.nt',
        'test_1000.nt',
    )
)
# unseen = load_from_csv(path,'unseen_entities_1000.txt')

  if __name__ == '__main__':


In [8]:
# Display the loaded test rows (string labels; the fourth column is the '.' terminator).
test

array([['<http://dbpedia.org/resource/Norris_Brown>',
        '<http://dbpedia.org/ontology/birthPlace>',
        '<http://dbpedia.org/resource/Iowa>', '.'],
       ['<http://dbpedia.org/resource/Julia_(Beatles_song)__Julia__1>',
        '<http://dbpedia.org/ontology/previousWork>',
        '<http://dbpedia.org/resource/Got_to_Get_You_into_My_Life>', '.'],
       ['<http://dbpedia.org/resource/William_Toomer>',
        '<http://dbpedia.org/ontology/restingPlace>',
        '<http://dbpedia.org/resource/Chicago>', '.'],
       ...,
       ['<http://dbpedia.org/resource/I_Will_Follow>',
        '<http://dbpedia.org/ontology/recordLabel>',
        '<http://dbpedia.org/resource/Columbia_Records>', '.'],
       ['<http://dbpedia.org/resource/Laurie_Cunningham>',
        '<http://dbpedia.org/ontology/team>',
        '<http://dbpedia.org/resource/Wimbledon_F.C.>', '.'],
       ['<http://dbpedia.org/resource/Eugene_Goldwasser>',
        '<http://dbpedia.org/ontology/birthPlace>',
        '<http

In [9]:
# Stack the train, auxiliary and validation rows into one array that the
# label-to-index mappings are built from (test is left out).
vocab = np.concatenate([train, aux, val], axis=0)

In [10]:
# Build the lookup tables once, then replace every split's string labels
# with integer indices. NOTE: this rebinds train/aux/val/test, so the cell
# cannot be re-run without reloading the string arrays first.
rel_to_idx, ent_to_idx = create_mappings(vocab)

train, aux, val, test = (
    to_idx(_split, ent_to_idx, rel_to_idx)
    for _split in (train, aux, val, test)
)

In [11]:
def generate_corruptions_for_fit(X, ent_to_idx=None, eta=1, rnd=None):
    """Create corrupted (negative) triples by replacing subject or object.

    For every triple in ``X``, ``eta`` corruptions are produced. Each one
    keeps the relation and swaps either the subject or the object (chosen
    with equal probability) for an entity index drawn uniformly from
    ``ent_to_idx``. The replacement is not checked against the original
    triple or against known true triples.

    Parameters
    ----------
    X : iterable of index triples
        Rows of ``[subject_idx, relation_idx, object_idx]``.
    ent_to_idx : dict
        Entity label to index mapping; its values are the sampling pool.
    eta : int, default 1
        Number of corruptions generated per input triple.
    rnd : numpy.random.RandomState or None, default None
        Random generator. ``None`` creates a fresh, unseeded
        ``RandomState``.

    Returns
    -------
    numpy.ndarray of shape (len(X) * eta, 3)
        The corrupted triples, in input order.
    """
    if rnd is None:
        rnd = np.random.RandomState()

    all_entities = list(ent_to_idx.values())
    n_entities = len(all_entities)
    X_corr = []
    for x in X:
        for _ in range(eta):
            # randint's upper bound is exclusive: pass n_entities itself,
            # otherwise the last entity could never be selected.
            e = all_entities[rnd.randint(0, n_entities)]
            # rnd.rand() draws one float, replacing the removed
            # np.asscalar(rnd.rand(1, 1)).
            if rnd.rand() > 0.5:
                X_corr.append([e, x[1], x[2]])
            else:
                X_corr.append([x[0], x[1], e])

    if not X_corr:
        # Keep an integer dtype for the empty case as well.
        return np.empty((0, 3), dtype=int)
    return np.asarray(X_corr).reshape(-1, 3)

In [12]:
# Wrap the indexed test triples in a one-element list of batches.
X_batches = np.array_split(test, indices_or_sections=1)
X_batches

[array([[50276,    32, 33012],
        [37808,   279, 28351],
        [74408,   305, 15365],
        ...,
        [32060,   293, 16957],
        [40874,   359, 74561],
        [23768,    32, 12779]])]

In [13]:
# Draw one corrupted (negative) triple per test triple, batch by batch,
# from a generator seeded with 50.
rnd = check_random_state(50)
neg_test = [
    generate_corruptions_for_fit(_batch, eta=1, rnd=rnd, ent_to_idx=ent_to_idx)
    for _batch in X_batches[:1]
]
neg_test[0]

array([[50276,    32, 14000],
       [37808,   279, 22637],
       [74408,   305, 55366],
       ...,
       [ 8269,   293, 16957],
       [40874,   359, 35044],
       [23768,    32, 49095]])

In [14]:
# Wrap the indexed validation triples in a one-element list of batches.
Y_batches = np.array_split(val, indices_or_sections=1)
Y_batches

[array([[26138,   142, 54185],
        [19658,    13,  5618],
        [28082,    18, 67403],
        ...,
        [25329,   299, 25331],
        [51581,    89, 49535],
        [20113,   193, 45131]])]

In [15]:
# Draw one corrupted (negative) triple per validation triple, batch by batch.
# NOTE(review): the generator is re-created with the same seed (50) used for
# the test negatives, so the leading draws repeat -- the recorded outputs show
# the same replacement ids (14000, 22637, 55366) in both sets. Confirm this
# is intended.
rnd = check_random_state(50)
neg_val = [
    generate_corruptions_for_fit(_batch, eta=1, rnd=rnd, ent_to_idx=ent_to_idx)
    for _batch in Y_batches[:1]
]
neg_val[0]

array([[26138,   142, 14000],
       [19658,    13, 22637],
       [28082,    18, 55366],
       ...,
       [25329,   299, 46392],
       [51581,    89, 19586],
       [20113,   193, 35223]])

In [16]:
# Ensure the output folder exists below the working directory
# (no error is raised when it is already there).
path = os.getcwd()
pathlib.Path(path).joinpath('with_negatives_test_val').mkdir(parents=True, exist_ok=True)
print("A New Directory is created, named: 'with_negatives_test_val'")

A New Directory is created, named: 'with_negatives_test_val'


In [17]:
# Display the working directory currently stored in `path`.
path

'/Users/gurpreet.ag.singh/Desktop/dbpedia_1k_s (1)/schema and not schama'

In [18]:
# Check the container type of the first (and only) batch of test negatives.
type(neg_test[0])

numpy.ndarray

In [19]:
# Wrap each batch of negatives in a DataFrame (default integer index and
# column labels) so it can be written with to_csv.
# NOTE: this rebinds df_test / df_val, which earlier held the raw CSV frames.
df_test, df_val = (
    pd.DataFrame(data=_negatives[0]) for _negatives in (neg_test, neg_val)
)

In [20]:
# Write both negative sets as tab-separated files, without header or index,
# into the output folder.
path = os.getcwd()+'/with_negatives_test_val/'
for _frame, _stem in ((df_test, 'test_neg'), (df_val, 'val_neg')):
    _frame.to_csv(path + _stem, sep='\t', header=False, index=False, encoding="utf-8")

## Adding a 0 label column to mark every triple in these sets as a negative (false) example.

In [21]:
# Display the output folder path that the negatives were written to.
path

'/Users/gurpreet.ag.singh/Desktop/dbpedia_1k_s (1)/schema and not schama/with_negatives_test_val/'

In [22]:
# Read the written negatives back as integer arrays.
val_neg, test_neg = (
    np.genfromtxt(path + _stem, dtype=int) for _stem in ("val_neg", "test_neg")
)

In [23]:
# Report the shape of each reloaded negative set.
for _tag, _array in (("val shape:", val_neg), ("test shape:", test_neg)):
    print(_tag, _array.shape)

val shape: (5000, 3)
test shape: (1000, 3)


In [24]:
# One zero label per negative triple. The row counts come from the loaded
# arrays rather than hard-coded 5000 / 1000, so the cell keeps working when
# the number of validation or test triples changes.
val_zeros = np.zeros((val_neg.shape[0], 1), dtype=int)
test_zeros = np.zeros((test_neg.shape[0], 1), dtype=int)

In [25]:
# Append the zero label as a fourth column to each negative set.
val_n = np.hstack((val_neg, val_zeros))
test_n = np.hstack((test_neg, test_zeros))

In [26]:
# Save the labelled negatives as tab-separated integer files in the
# current working directory.
for _target, _rows in (('val_negatives', val_n), ('test_negatives', test_n)):
    np.savetxt(_target, _rows, delimiter="\t", fmt="%d")