In [1]:
import numpy as np
import pandas as pd
import pathlib
import os

In [2]:
def create_mappings(X):
    """Build label-to-integer-ID lookup tables for entities and relations.

        IDs are consecutive integers starting at 0, assigned in the order
        returned by ``np.unique``. Entities and relations are numbered
        independently and kept in two separate dictionaries.

    Parameters
    ----------
    X : ndarray, shape [n, 3]
        The triples to extract mappings from. Only columns 0 (subject),
        1 (relation) and 2 (object) are read.

    Returns
    -------
    rel_to_idx : dict
        The relation-to-internal-id associations.
    ent_to_idx : dict
        The entity-to-internal-id associations.

    """
    subjects, relations, objects = X[:, 0], X[:, 1], X[:, 2]

    # Subjects and objects share a single entity ID space.
    entity_labels = np.unique(np.concatenate((subjects, objects)))
    relation_labels = np.unique(relations)

    ent_to_idx = {label: idx for idx, label in enumerate(entity_labels)}
    rel_to_idx = {label: idx for idx, label in enumerate(relation_labels)}
    return rel_to_idx, ent_to_idx

In [3]:
def to_idx(X, ent_to_idx=None, rel_to_idx=None):
    """Convert statements (triples) into integer IDs.

    Parameters
    ----------
    X : ndarray
        The statements to be converted. Columns 0, 1 and 2 are read as
        subject, relation and object; any further columns are ignored.
    ent_to_idx : dict
        The mappings between entity strings and internal IDs.
    rel_to_idx : dict
        The mappings between relation strings and internal IDs.

    Returns
    -------
    X : ndarray, shape [n, 3]
        The ndarray of converted statements. An input with zero rows
        yields an empty integer array of shape [0, 3].

    Notes
    -----
    Labels are looked up with ``dict.get``, which returns ``None`` for a
    label missing from the mapping instead of raising ``KeyError``.
    NOTE(review): validate coverage upstream if unmapped labels are possible.
    """
    X = np.asarray(X)
    if X.shape[0] == 0:
        # np.vectorize (without otypes) cannot be called on size-0 input,
        # so an empty split is answered directly.
        return np.empty((0, 3), dtype=int)

    lookup_entity = np.vectorize(ent_to_idx.get)
    lookup_relation = np.vectorize(rel_to_idx.get)

    x_idx_s = lookup_entity(X[:, 0])
    x_idx_p = lookup_relation(X[:, 1])
    x_idx_o = lookup_entity(X[:, 2])

    return np.dstack([x_idx_s, x_idx_p, x_idx_o]).reshape((-1, 3))

In [4]:
def to_idx1(X, ent_to_idx=None, rel_to_idx=None):
    """Convert the first column of ``X`` into integer entity IDs.

    Parameters
    ----------
    X : ndarray
        Array whose column 0 holds entity strings; other columns are ignored.
    ent_to_idx : dict
        The mappings between entity strings and internal IDs.
    rel_to_idx : dict
        Unused; accepted so the call signature mirrors ``to_idx``.

    Returns
    -------
    ndarray, shape [n, 1]
        The converted entity IDs as a single column. An input with zero
        rows yields an empty integer array of shape [0, 1].
    """
    X = np.asarray(X)
    if X.shape[0] == 0:
        # np.vectorize (without otypes) cannot be called on size-0 input.
        return np.empty((0, 1), dtype=int)

    x_idx_s = np.vectorize(ent_to_idx.get)(X[:, 0])
    return x_idx_s.reshape((-1, 1))

In [122]:
def load_from_csv(folder_name, file_name, sep=' ', header=None):
    """Load a delimited text file as a de-duplicated array of strings.

    Parameters
    ----------
    folder_name : str
        Directory that contains the file.
    file_name : str
        Name of the file inside ``folder_name``.
    sep : str
        Column separator passed to ``pandas.read_csv``.
    header : int or None
        Header row passed to ``pandas.read_csv``; ``None`` means the file
        has no header line.

    Returns
    -------
    ndarray
        One row per distinct line of the file, every value read as ``str``.
    """
    # Build the location from the arguments rather than from a notebook
    # global, so the function does not depend on hidden kernel state.
    df = pd.read_csv(os.path.join(folder_name, file_name),
                     sep=sep,
                     header=header,
                     names=None,
                     dtype=str)

    df = df.drop_duplicates()
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent accessor available in old and new versions alike.
    return df.values

In [123]:
# Folder with the raw splits, resolved against the current working directory.
path = os.path.join(os.getcwd(), 'Schema_and_Non')

train_ns = load_from_csv(path, 'train_ns')
train_s = load_from_csv(path, 'train_s')

aux_ns = load_from_csv(path, 'aux_ns')
aux_s = load_from_csv(path, 'aux_s')

val_ns = load_from_csv(path, 'val_ns')
val_s = load_from_csv(path, 'val_s')

unseen = load_from_csv(path, 'unseen_entities_1000.txt')
test = load_from_csv(path, 'test_1000.nt')


In [124]:
# Peek at the raw triples: in the recorded output each row has four string
# columns, the last one being the N-Triples terminator '.'.
train_ns

array([['<http://dbpedia.org/resource/Albert_G._Burr>',
        '<http://dbpedia.org/ontology/restingPlace>',
        '<http://dbpedia.org/resource/Illinois>', '.'],
       ['<http://dbpedia.org/resource/Baggy_Trousers>',
        '<http://dbpedia.org/ontology/recordLabel>',
        '<http://dbpedia.org/resource/Stiff_Records>', '.'],
       ['<http://dbpedia.org/resource/Filipino_Americans>',
        '<http://dbpedia.org/ontology/populationPlace>',
        '<http://dbpedia.org/resource/Washington_(state)>', '.'],
       ...,
       ['<http://dbpedia.org/resource/Garret_FitzGerald>',
        '<http://dbpedia.org/ontology/termPeriod>',
        '<http://dbpedia.org/resource/Garret_FitzGerald__8>', '.'],
       ['<http://dbpedia.org/resource/Harold_Halse__4>',
        '<http://dbpedia.org/ontology/team>',
        '<http://dbpedia.org/resource/Manchester_United_F.C.>', '.'],
       ['<http://dbpedia.org/resource/Lucien_Muller>',
        '<http://dbpedia.org/ontology/careerStation>',
       

In [104]:
# Pool the train/aux/val splits row-wise; `test` and `unseen` are not included.
vocab = np.concatenate(
    [train_ns, train_s, aux_ns, aux_s, val_ns, val_s],
    axis=0,
)

In [105]:
# One shared ID space for every split, derived from the pooled triples.
rel_to_idx, ent_to_idx = create_mappings(vocab)

# Replace each triple split with its integer-ID version. The right-hand
# side is evaluated against the original string arrays before any name is
# rebound.
# NOTE(review): this cell overwrites its own inputs, so it must be run
# exactly once after the load cell.
train_ns, train_s, aux_ns, aux_s, val_ns, val_s, test = (
    to_idx(split, ent_to_idx, rel_to_idx)
    for split in (train_ns, train_s, aux_ns, aux_s, val_ns, val_s, test)
)

# `unseen` is converted from its first column only (entity labels).
unseen = to_idx1(unseen, ent_to_idx, rel_to_idx)

In [106]:
# train_ns.shape

(104964, 3)

In [107]:
# train_s.shape

(24779, 3)

In [108]:
path = os.getcwd()

# parents/exist_ok make the call succeed even when the folder already exists.
seralised_dir = pathlib.Path(path + '/Seralised')
seralised_dir.mkdir(parents=True, exist_ok=True)
print("A New Directory is created, named: 'Seralised'")

A New Directory is created, named: 'Seralised'


In [109]:
# path = path+'/Seralised/'

In [110]:
# Show the base path currently in use.
# NOTE(review): the recorded output exposes an absolute local path
# (including a user name); clear it before sharing the notebook.
path

'/Users/gurpreet.ag.singh/Desktop/dbpedia_1k_s (1)/schema and not schama'

In [111]:
# type(train_s)

In [112]:
# Wrap every converted ID array in a DataFrame with the default integer
# row index and column labels.
df_train_s, df_train_ns = pd.DataFrame(train_s), pd.DataFrame(train_ns)
df_aux_s, df_aux_ns = pd.DataFrame(aux_s), pd.DataFrame(aux_ns)
df_val_s, df_val_ns = pd.DataFrame(val_s), pd.DataFrame(val_ns)
df_test, df_unseen = pd.DataFrame(test), pd.DataFrame(unseen)

In [113]:
# df_train_s.head()

Unnamed: 0,0,1,2
0,62119,380,250
1,9383,380,75797
2,14325,380,54
3,47268,380,600
4,35865,380,40


In [114]:
# df_train_ns.head()

Unnamed: 0,0,1,2
0,5952,305,32343
1,9267,293,63482
2,24794,269,72969
3,72444,249,47892
4,38302,39,13554


In [115]:
path = os.getcwd()+'/Seralised/'

# (output file name, frame) pairs, listed in the order they are written.
frames_to_save = [
    ('train_s', df_train_s),
    ('train_ns', df_train_ns),
    ('aux_s', df_aux_s),
    ('aux_ns', df_aux_ns),
    ('val_s', df_val_s),
    ('val_ns', df_val_ns),
    ('test', df_test),
    ('unseen', df_unseen),
]

# Space-separated, no header row and no index column.
for out_name, frame in frames_to_save:
    frame.to_csv(path+out_name, sep=' ', header=False, index=False, encoding="utf-8")

print("All the seralised files are now saved inside Seralised folder")

All the seralised files are now saved inside Seralised folder


In [116]:
# Per-column summary statistics of df_train_s; in the recorded output
# column 1 is constant (std 0.0, min == max == 380).
df_train_s.describe()

Unnamed: 0,0,1,2
count,24779.0,24779.0,24779.0
mean,38067.581944,380.0,18460.1946
std,21496.097299,0.0,32282.420443
min,1.0,380.0,1.0
25%,19607.0,380.0,85.0
50%,37119.0,380.0,405.0
75%,56672.0,380.0,643.0
max,75670.0,380.0,75797.0


In [117]:
# True if any cell of df_train_s is missing (NaN/None), False otherwise.
df_train_s.isnull().values.any()

False