In [1]:
import pandas as pd
import numpy as np
import pathlib
import os
import random
from sklearn.utils import check_random_state

In [3]:
# Input directory: the 'Original_db1k' folder under the kernel's current
# working directory. The trailing slash matters because file names are
# appended to this string by plain concatenation in later cells.
path = os.getcwd()+'/Original_db1k/'
path

'/Users/gurpreet.ag.singh/Desktop/Py/Success Tests/Original_db1k/'

In [4]:
# Read every split as space-separated, header-less text; columns are
# therefore labelled 0, 1, 2, ... by pandas.
df_train, df_unseen, df_val, df_test, df_aux = (
    pd.read_csv(path + _file_name, sep=' ', header=None)
    for _file_name in (
        "training_1000.nt",
        "unseen_entities_1000.txt",
        "validation_1000.nt",
        "test_1000.nt",
        "auxiliary_1000.nt",
    )
)

In [5]:
# Report the (rows, columns) of each raw split.
for _label, _frame in (
    ("df_train.shape", df_train),
    ("df_unseen.shape", df_unseen),
    ("df_val.shape", df_val),
    ("df_test.shape", df_test),
    ("df_aux.shape", df_aux),
):
    print(_label, _frame.shape)

df_train.shape (129743, 4)
df_unseen.shape (1000, 1)
df_val.shape (5000, 4)
df_test.shape (1000, 4)
df_aux.shape (10529, 4)


In [6]:
# Predicate IRI used to pick out the "schema" (rdf:type) triples of each split.
string = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
# regex=False: match the IRI literally -- by default `str.contains` treats the
#   pattern as a regular expression, in which every '.' matches any character.
# na=False: a missing predicate counts as "no match" instead of producing a
#   NaN in the mask, which boolean indexing would reject.
df_train_s = df_train[df_train[1].str.contains(string, regex=False, na=False)]
df_aux_s = df_aux[df_aux[1].str.contains(string, regex=False, na=False)]
df_val_s = df_val[df_val[1].str.contains(string, regex=False, na=False)]

In [7]:
# Report how many rdf:type rows were selected from each split.
for _label, _frame in (
    ("df_train_s.shape", df_train_s),
    ("df_val_s.shape", df_val_s),
    ("df_aux_s.shape", df_aux_s),
):
    print(_label, _frame.shape)

df_train_s.shape (24779, 4)
df_val_s.shape (1723, 4)
df_aux_s.shape (4474, 4)


In [8]:
# Non-schema triples (df_*_ns): anti-join each split against its schema subset.
# A left merge with indicator=True tags every row in a `_merge` column; rows
# tagged 'left_only' have no counterpart in the schema subset and are kept.
# The helper column is dropped with `columns=` because pandas 2.0 no longer
# accepts the axis as a positional argument (`.drop('_merge', 1)`).
df_train_ns = (
    df_train.merge(df_train_s, indicator=True, how="left")
    [lambda x: x._merge == 'left_only']
    .drop(columns='_merge')
)
df_aux_ns = (
    df_aux.merge(df_aux_s, indicator=True, how="left")
    [lambda x: x._merge == 'left_only']
    .drop(columns='_merge')
)
df_val_ns = (
    df_val.merge(df_val_s, indicator=True, how="left")
    [lambda x: x._merge == 'left_only']
    .drop(columns='_merge')
)

In [9]:
# Report how many rows remain in each split once the rdf:type rows are removed.
for _label, _frame in (
    ("df_train_ns.shape", df_train_ns),
    ("df_val_ns.shape", df_val_ns),
    ("df_aux_ns.shape", df_aux_ns),
):
    print(_label, _frame.shape)

df_train_ns.shape (104964, 4)
df_val_ns.shape (3277, 4)
df_aux_ns.shape (6055, 4)


#### Serialising now

In [10]:
def create_mappings(X):
    """Build label-to-index dictionaries for relations and entities.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array of triples. Column 0 and column 2 are read as entity
        labels, column 1 as relation labels; further columns are ignored.

    Returns
    -------
    (rel_to_idx, ent_to_idx) : tuple of dict
        Each dictionary maps a distinct label to its position in the sorted
        array of distinct labels returned by ``np.unique``.
    """
    subjects, predicates, objects = X[:, 0], X[:, 1], X[:, 2]

    # Subjects and objects share a single entity vocabulary.
    entity_labels = np.unique(np.concatenate((subjects, objects)))
    relation_labels = np.unique(predicates)

    rel_to_idx = {label: position for position, label in enumerate(relation_labels)}
    ent_to_idx = {label: position for position, label in enumerate(entity_labels)}
    return rel_to_idx, ent_to_idx

In [11]:
def to_idx(X, ent_to_idx=None, rel_to_idx=None):
    """Translate labelled triples into integer index triples.

    Parameters
    ----------
    X : array-like
        2-D collection of triples. Column 0 (subject) and column 2 (object)
        are looked up in ``ent_to_idx``; column 1 (predicate) is looked up in
        ``rel_to_idx``. Any further columns are ignored.
    ent_to_idx : dict
        Entity label -> index mapping.
    rel_to_idx : dict
        Relation label -> index mapping.

    Returns
    -------
    numpy.ndarray of shape (n, 3)
        One ``[subject_idx, predicate_idx, object_idx]`` row per input triple.
        An empty input yields an empty ``(0, 3)`` integer array.

    Raises
    ------
    ValueError
        If either mapping is not supplied.
    KeyError
        If a label in ``X`` is absent from the relevant mapping.
    """
    if ent_to_idx is None or rel_to_idx is None:
        raise ValueError("both ent_to_idx and rel_to_idx must be provided")

    X = np.asarray(X)
    # np.vectorize (used previously) cannot be called on size-0 input, so an
    # empty split is answered directly.
    if X.shape[0] == 0:
        return np.empty((0, 3), dtype=int)

    def _encode(labels, mapping, kind):
        # Strict lookup: dict.get would turn an unknown label into None, which
        # either leaks into the output or fails later with an unrelated error.
        try:
            return np.array([mapping[label] for label in labels])
        except KeyError as err:
            raise KeyError(
                "{} label {!r} is not present in the mapping".format(kind, err.args[0])
            ) from None

    x_idx_s = _encode(X[:, 0], ent_to_idx, "entity")
    x_idx_p = _encode(X[:, 1], rel_to_idx, "relation")
    x_idx_o = _encode(X[:, 2], ent_to_idx, "entity")

    return np.dstack([x_idx_s, x_idx_p, x_idx_o]).reshape((-1, 3))

In [12]:
# Pull the underlying numpy arrays out of every DataFrame so the triples can
# be handled with plain array indexing from here on.
(train_ns, train_s, aux_ns, aux_s, val_ns, val_s, unseen, test) = (
    _frame.values
    for _frame in (
        df_train_ns,
        df_train_s,
        df_aux_ns,
        df_aux_s,
        df_val_ns,
        df_val_s,
        df_unseen,
        df_test,
    )
)

In [13]:
# unseen

In [14]:
# Stack every labelled split row-wise; the label -> index mappings are built
# from this combined array.
# Alternative without the auxiliary triples:
# vocab = np.concatenate((train_ns,train_s,val_ns,val_s,test))
vocab = np.concatenate(
    [train_ns, train_s, aux_ns, aux_s, val_ns, val_s, test],
    axis=0,
)

In [15]:
# Show the size of the combined array that the mappings are built from.
vocab.shape

(146272, 4)

In [16]:
# Build the vocabularies once from the combined triples, then replace every
# labelled split by its index-encoded counterpart (same variable names).
# NOTE: because the inputs are overwritten, this cell cannot be re-run on its
# own without first re-running the cell that extracts the raw arrays.
rel_to_idx, ent_to_idx = create_mappings(vocab)
train_ns, train_s, aux_ns, aux_s, val_ns, val_s, test = (
    to_idx(_triples, ent_to_idx, rel_to_idx)
    for _triples in (train_ns, train_s, aux_ns, aux_s, val_ns, val_s, test)
)
# `unseen` is not encoded here; its conversion call was left disabled:
# unseen = to_idx1(unseen, ent_to_idx, rel_to_idx)

In [17]:
# Report the shape of every index-encoded split.
for _label, _triples in (
    ("shape of train_ns", train_ns),
    ("shape of train_s", train_s),
    ("shape of val_ns", val_ns),
    ("shape of val_s", val_s),
    ("shape of aux_ns", aux_ns),
    ("shape of aux_s", aux_s),
    ("shape of test", test),
):
    print(_label, _triples.shape)

shape of train_ns (104964, 3)
shape of train_s (24779, 3)
shape of val_ns (3277, 3)
shape of val_s (1723, 3)
shape of aux_ns (6055, 3)
shape of aux_s (4474, 3)
shape of test (1000, 3)


In [18]:
# Display the index-encoded test triples (subject, predicate, object indices).
test

array([[50276,    32, 33012],
       [37808,   279, 28351],
       [74408,   305, 15365],
       ...,
       [32060,   293, 16957],
       [40874,   359, 74561],
       [23768,    32, 12779]])

### Generating corruptions for test (only; validation is parked) and labeling

In [19]:
def generate_corruptions_for_fit(X, ent_to_idx=None, eta=1, rnd=None):
    """Create ``eta`` corrupted copies of every triple in ``X``.

    For each corruption one entity index is drawn uniformly from the values
    of ``ent_to_idx`` and substituted for either the subject or the object of
    the triple (each side with probability 0.5). The predicate is never
    changed. The drawn entity is not checked against the original triple, so
    a corruption may coincide with its source triple.

    Parameters
    ----------
    X : iterable of triples
        Index-encoded triples; elements 0, 1 and 2 of each row are read.
    ent_to_idx : dict
        Mapping whose values form the pool of candidate entity indices.
    eta : int, default 1
        Number of corruptions generated per input triple.
    rnd : numpy.random.RandomState, int or None
        Random source. An int is used as a seed; ``None`` creates a fresh,
        unseeded ``RandomState``.

    Returns
    -------
    numpy.ndarray of shape (len(X) * eta, 3)
        Corrupted triples, in input order.

    Raises
    ------
    ValueError
        If ``ent_to_idx`` is missing or empty.
    """
    if not ent_to_idx:
        raise ValueError("ent_to_idx must be a non-empty entity-to-index mapping")
    if rnd is None or isinstance(rnd, (int, np.integer)):
        rnd = np.random.RandomState(rnd)

    all_entities = list(ent_to_idx.values())
    n_entities = len(all_entities)

    X_corr = []
    for x in X:
        for _ in range(eta):
            # randint's upper bound is exclusive: passing n_entities (not
            # n_entities - 1) keeps the last entity reachable and also works
            # for a single-entity vocabulary.
            e = all_entities[rnd.randint(0, n_entities)]
            # rnd.rand() yields one Python float; this replaces the removed
            # np.asscalar(rnd.rand(1, 1)).
            if rnd.rand() > 0.5:
                X_corr.append([e, x[1], x[2]])
            else:
                X_corr.append([x[0], x[1], e])
    return np.asarray(X_corr).reshape(-1, 3)

In [20]:
# Split the encoded test triples into batches. With a split count of 1 the
# result is a one-element list holding all test triples.
X_batches = np.array_split(test, 1)
X_batches

[array([[50276,    32, 33012],
        [37808,   279, 28351],
        [74408,   305, 15365],
        ...,
        [32060,   293, 16957],
        [40874,   359, 74561],
        [23768,    32, 12779]])]

In [21]:
# Generate one corrupted (negative) triple per test triple, batch by batch.
# The fixed seed makes the negatives reproducible across runs.
# Iterating over X_batches itself (rather than a hard-coded range(1)) keeps
# this cell correct if the number of batches is ever changed.
rnd = check_random_state(50)
neg_test = [
    generate_corruptions_for_fit(batch, eta=1, rnd=rnd, ent_to_idx=ent_to_idx)
    for batch in X_batches
]
neg_test[0]

array([[50276,    32, 14000],
       [37808,   279, 22637],
       [74408,   305, 55366],
       ...,
       [ 8269,   293, 16957],
       [40874,   359, 35044],
       [23768,    32, 49095]])

### Appending 0 to negative and 1 to positive set.

In [22]:
# Number of negative triples generated for the first (only) batch.
len(neg_test[0])

1000

In [23]:
# Label every negative triple with 0 by appending a fourth column.
# The label column is sized from the data rather than a hard-coded 1000, so
# the concatenation stays valid for any test-set size.
test_zeros = np.zeros((len(neg_test[0]), 1), dtype=int)
test_n = np.concatenate((neg_test[0], test_zeros), axis=1)
test_n

array([[50276,    32, 14000,     0],
       [37808,   279, 22637,     0],
       [74408,   305, 55366,     0],
       ...,
       [ 8269,   293, 16957,     0],
       [40874,   359, 35044,     0],
       [23768,    32, 49095,     0]])

In [24]:
# Re-display the positive test batches before labelling them.
X_batches

[array([[50276,    32, 33012],
        [37808,   279, 28351],
        [74408,   305, 15365],
        ...,
        [32060,   293, 16957],
        [40874,   359, 74561],
        [23768,    32, 12779]])]

In [25]:
# Label every positive (original) test triple with 1 by appending a fourth
# column. The label column is sized from the data rather than a hard-coded
# 1000, so the concatenation stays valid for any test-set size.
test_ones = np.ones((len(X_batches[0]), 1), dtype=int)
test_p = np.concatenate((X_batches[0], test_ones), axis=1)
test_p

array([[50276,    32, 33012,     1],
       [37808,   279, 28351,     1],
       [74408,   305, 15365,     1],
       ...,
       [32060,   293, 16957,     1],
       [40874,   359, 74561,     1],
       [23768,    32, 12779,     1]])

### Writing it into file now (NOT)

In [26]:
# Wrap the index-encoded arrays back into DataFrames for concatenation and
# CSV export. NOTE: the df_* names that previously held the labelled triples
# are overwritten with their encoded versions here.
(
    df_train_s,
    df_train_ns,
    df_aux_s,
    df_aux_ns,
    df_val_s,
    df_val_ns,
    df_test_p,
    df_test_n,
) = (
    pd.DataFrame(_encoded, index=None, columns=None)
    for _encoded in (train_s, train_ns, aux_s, aux_ns, val_s, val_ns, test_p, test_n)
)
# Left disabled:
# df_test = pd.DataFrame(test,index=None, columns=None)
# df_unseen = pd.DataFrame(unseen,index=None, columns=None)

In [27]:
# Report the shape of each encoded DataFrame.
for _label, _frame in (
    ("shape of train_ns", df_train_ns),
    ("shape of train_s", df_train_s),
    ("shape of val_ns", df_val_ns),
    ("shape of val_s", df_val_s),
    ("shape of aux_ns", df_aux_ns),
    ("shape of aux_s", df_aux_s),
):
    print(_label, _frame.shape)
# print("shape of test", test.shape)

shape of train_ns (104964, 3)
shape of train_s (24779, 3)
shape of val_ns (3277, 3)
shape of val_s (1723, 3)
shape of aux_ns (6055, 3)
shape of aux_s (4474, 3)


### Combining the train + validate

In [28]:
# Fold the validation triples into the training frames and stack the labelled
# positive and negative test triples into one test frame.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed; as with
# append, the original row indices are kept.
# NOTE: this cell grows df_train_s / df_train_ns in place, so re-running it
# appends the validation rows again.
df_train_s = pd.concat([df_train_s, df_val_s])
df_train_ns = pd.concat([df_train_ns, df_val_ns])
df_test = pd.concat([df_test_p, df_test_n])

In [29]:
# Create the output folder 'comb' under the current working directory.
# `path` is re-pointed from the input directory to the working directory here.
path = os.getcwd()
# exist_ok=True: an already existing folder is accepted silently, so the
# message below is printed whether or not the folder was newly created.
pathlib.Path(path+'/comb').mkdir(parents=True, exist_ok=True)
print("A New Directory is created, named: 'comb'")

A New Directory is created, named: 'comb'


In [30]:
# To write with train + validate

# path = os.getcwd()+'/comb/'
# df_train_s.to_csv(path+'train_s',sep='\t', header=False, index=False, encoding = "utf-8")
# df_train_ns.to_csv(path+'train_ns',sep='\t', header=False, index=False, encoding = "utf-8")

# # NOT USING NOW
# # df_aux_s.to_csv(path+'aux_s',sep='\t', header=False, index=False, encoding = "utf-8")
# # df_aux_ns.to_csv(path+'aux_ns',sep='\t', header=False, index=False, encoding = "utf-8")
# # df_val_s.to_csv(path+'val_s',sep='\t', header=False, index=False, encoding = "utf-8")
# # df_val_ns.to_csv(path+'val_ns',sep='\t', header=False, index=False, encoding = "utf-8")

# df_test.to_csv(path+'test',sep='\t', header=False, index=False, encoding = "utf-8")
# # df_unseen.to_csv(path+'unseen',sep='\t', header=False, index=False, encoding = "utf-8")

# print("All the seralised files are now saved inside comb folder")

### Combining train + val + aux = train_all_ns and train_all_s

In [31]:
# Add the auxiliary triples to the (train + validation) frames.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed; as with
# append, the original row indices are kept.
# NOTE: this cell grows the frames in place, so re-running it appends the
# auxiliary rows again.
df_train_s = pd.concat([df_train_s, df_aux_s])
df_train_ns = pd.concat([df_train_ns, df_aux_ns])

In [32]:
# Compare the combined training frames with the untouched validation and
# auxiliary frames.
for _label, _frame in (
    ("shape of train_ns all combined", df_train_ns),
    ("shape of train_s all combined", df_train_s),
    ("shape of val_ns old", df_val_ns),
    ("shape of val_s old", df_val_s),
    ("shape of aux_ns old", df_aux_ns),
    ("shape of aux_s old", df_aux_s),
):
    print(_label, _frame.shape)

shape of train_ns all combined (114296, 3)
shape of train_s all combined (30976, 3)
shape of val_ns old (3277, 3)
shape of val_s old (1723, 3)
shape of aux_ns old (6055, 3)
shape of aux_s old (4474, 3)


In [76]:
# Manual cross-check of the combined row counts: each sum adds the train,
# validation and auxiliary row counts printed earlier in the notebook.
print("For non schema", sum((104964, 3277, 6055)))
print("For Schema", sum((24779, 1723, 4474)))

For non schema 114296
For Schema 30976


#### Making a complete set of Train:

In [34]:
# Complete training set: non-schema rows followed by schema rows.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
df_train_all = pd.concat([df_train_ns, df_train_s])
df_train_all.shape

(145272, 3)

In [35]:
# Write every output frame into the 'comb' folder as a tab-separated file
# without header or index column.
path = os.getcwd()+'/comb/'
for _frame, _file_name in (
    (df_train_s, 'train_s'),
    (df_train_ns, 'train_ns'),
    (df_test, 'test'),
    (df_train_all, 'train_all'),
):
    _frame.to_csv(path+_file_name, sep='\t', header=False, index=False, encoding="utf-8")
print("All the seralised files are now saved inside comb folder")

All the seralised files are now saved inside comb folder


In [36]:
# Show the working directory that the 'comb' output folder lives under.
os.getcwd()

'/Users/gurpreet.ag.singh/Desktop/Py/Success Tests'