In [1]:
import torch
import hyperjson as json
import os

In [13]:
from csv import DictWriter
import hyperjson as json
import os
import pickle
import torch
from torch.utils.data import DataLoader, Dataset
from typing import Dict, List


class GenwikiEntities:
    scopes = ['fine', 'full']
    modes = ['train', 'test']

    def __init__(self):
        self.file_dir = os.getcwd() + '/genwiki'
        self.entities = {}
        self.relationships = {}
        self.r_idx = 0
        self.e_idx = 0
        self.triples = set()

    def update_relationships(self, rels: List[str]):
        for r in rels:
            if r not in self.relationships:
                self.relationships.update({r: self.r_idx})
                self.r_idx += 1
        
    def update_entities(self, ents: List[str]):
        for e in ents:
            if e not in self.entities:
                self.entities.update({e: self.e_idx})
                self.e_idx += 1

    def _load_data(self, mode, scope=None):
        data = []
        fp = f'{self.file_dir}/{mode}'
        if scope:
            fp += f'/{scope}' 
        for file in os.scandir(fp):
            with open(file.path) as json_file:
                data += json.load(json_file)
        return data

    def triple_tensor(self, triple: List[str]):
        h_idx = self.entities[triple[0]]
        r_idx = self.relationships[triple[1]]
        t_idx = self.entities[triple[2]]
        return torch.tensor([h_idx, r_idx, t_idx], dtype=torch.int32)

    def write_entity_csv(self):
        with open(f'{self.file_dir}/entity.csv', mode='w') as csv_file:
            writer = DictWriter(csv_file, fieldnames=['entity', 'id'])
            writer.writeheader()
            writer.writerows([{'entity': e, 'id': i} for e, i in self.entities.items()])
    
    def write_relationship_csv(self):
        with open(f'{self.file_dir}/relationship.csv', mode='w') as csv_file:
            writer = DictWriter(csv_file, fieldnames=['relationship', 'id'])
            writer.writeheader()
            writer.writerows([{'relationship': e, 'id': i} for e, i in self.relationships.items()])

    def write_triple_tensors(self):
        with open(f'{self.file_dir}/triples.p', mode='wb') as pkl_file:
            pickle.dump(self.triples, pkl_file)

    def process_data(self):
        for mode in self.modes:
            if mode == 'train':
                for scope in self.scopes:
                    print(f'Processing the {mode} and {scope} data')
                    for data in self._load_data(mode, scope):
                        for triple in data['graph']:
                            self.update_relationships([triple[1]])
                            self.update_entities([triple[0], triple[2]])
                            self.triples.add(self.triple_tensor(triple))
            else:
                for data in self._load_data(mode):
                    for triple in data['graph']:
                        self.update_relationships([triple[1]])
                        self.update_entities([triple[0], triple[2]])
                        self.triples.add(self.triple_tensor(triple))
                
                
        self.write_entity_csv()
        self.write_relationship_csv()
        self.write_triple_tensors()

In [14]:
genwiki = GenwikiEntities()
genwiki.process_data()

Processing the train and fine data
Processing the train and full data


In [13]:
data[4504]

{'entities': ['Ukrainian',
  'Russian',
  'Sloviansk',
  'USSR',
  '13 January 1929',
  'Viktor Trokhymovych Fomin',
  '29 December 2007',
  'the year',
  '1970',
  '1950',
  'first'],
 'graph': [['Viktor Fomin', 'birthPlace', 'USSR'],
  ['Viktor Fomin', 'birthPlace', 'Sloviansk']],
 'id_long': {'graph_set_index': 2,
  'text_paragraph_index': 0,
  'text_sentence_index_end': 3,
  'text_sentence_index_start': 0,
  'wikipage': 'Viktor_Fomin'},
 'id_short': '["Viktor_Fomin", 2, [0, 0, 3]]',
 'text': '<ENT_5> ( <ENT_0> : , <ENT_1> : ; born <ENT_4> in <ENT_2> ; died <ENT_6> ) was a <ENT_0> football player . Master of Sports of the <ENT_3> ( <ENT_8> ) . The <ENT_10> <ENT_0> Player of <ENT_7> ( <ENT_9> ) .'}

In [14]:
relations = set()
for d in data:
    for g in d['graph']:
        relations.add(g[1])
        
for r in relations:
    print(r)

pushpinMapCaption
PopulatedPlace/areaTotal
division
data
w/l
candidate
recorded
militaryCommand
isPartOf
knownFor
Work/runtime
state
number
foundingYear
modes
attend
postalCodeType
album
club
recordedIn
creator
last
location
rd3Team
founded
race
manager
directedby
deathYear
network
opponent
time
rd1Score
strength
nationalteam
series
subsequentWork
region
address
language
established
aux
postalCode
demographics1Info
originalairdate
numberOfStudents
date
spouse
rd3Score
home
rd4Team
instrument
literaryGenre
architect
height
mile
city
ship
note
builder
score
body
coordDisplay
locatedInArea
team
previousWork
road
almaMater
siteStadium
Person/height
rd1Team
guest
owner
blankName
releaseDate
timezone
distributor
extra
clubs
cinematography
term
debutTeam
motto
areaCode
rebounds
birthDate
affiliation
product
class
added
conservationStatusSystem
birthPlace
source
party
goals
unitPref
overall
recordLabel
genre
wickets
label
editing
activeYearsEndYear
artist
rd2Score
company
routeEnd
developer
le

In [17]:
import torch

x = torch.arange(1.0, 5.0)
y = torch.arange(2.0, 6.0)
z = torch.arange(5.0, 9.0)


In [27]:
torch.outer(y, z)

tensor([[10., 12., 14., 16.],
        [15., 18., 21., 24.],
        [20., 24., 28., 32.],
        [25., 30., 35., 40.]])

In [34]:
x @ torch.outer(y, z)

tensor([200., 240., 280., 320.])

In [33]:
torch.matmul(x, torch.outer(x, y))

tensor([ 60.,  90., 120., 150.])

In [42]:
x @ torch.outer(y, z)

tensor([200., 240., 280., 320.])

In [41]:
torch.outer(x, y) @ torch.t(z)

tensor([ 96., 192., 288., 384.])

In [43]:
torch.outer(x, y) @ torch.diag(z)

tensor([[ 10.,  18.,  28.,  40.],
        [ 20.,  36.,  56.,  80.],
        [ 30.,  54.,  84., 120.],
        [ 40.,  72., 112., 160.]])

In [44]:
torch.diag(x) @ torch.outer(y, z)

tensor([[ 10.,  12.,  14.,  16.],
        [ 30.,  36.,  42.,  48.],
        [ 60.,  72.,  84.,  96.],
        [100., 120., 140., 160.]])

In [53]:
a = torch.tensor([[0], [0]])
torch.hstack((a, torch.vstack((x, y))))

tensor([[0., 1., 2., 3., 4.],
        [0., 2., 3., 4., 5.]])

In [87]:
a = torch.hstack((torch.tensor([10, 9, 8, 7, 6]).reshape((5, 1)), torch.tile(torch.as_tensor([1, 2, 3]), (5, 1))))

In [90]:
for i, j in enumerate(a):
    print(j)

tensor([10,  1,  2,  3])
tensor([9, 1, 2, 3])
tensor([8, 1, 2, 3])
tensor([7, 1, 2, 3])
tensor([6, 1, 2, 3])


In [92]:
a[3] = torch.tensor([0, 0, 0, 0])

In [97]:
torch.tile(torch.tensor([0, 0, 0, 0]), (1, 1))

tensor([[0, 0, 0, 0]])

In [96]:
torch.movedim(a, 0, 1)

tensor([[10,  9,  8,  0,  6],
        [ 1,  1,  1,  0,  1],
        [ 2,  2,  2,  0,  2],
        [ 3,  3,  3,  0,  3]])

In [102]:
torch.cat(torch.tensor([-1]), torch.tensor([0, 1, 2, 3, 4, 5]))

TypeError: cat() received an invalid combination of arguments - got (Tensor, Tensor), but expected one of:
 * (tuple of Tensors tensors, name dim, *, Tensor out)
 * (tuple of Tensors tensors, int dim, *, Tensor out)


In [104]:
torch.hstack((torch.tensor([-1]), torch.tensor([0, 2, 3, 4])))

tensor([-1,  0,  2,  3,  4])

In [107]:
torch.tensor([1, 2, 3, 4]).broadcast_to((5, 4))

tensor([[1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4]])

In [117]:
b = torch.arange(0, 25).reshape((5, -1))

In [118]:
torch.randperm(5)

tensor([3, 4, 1, 0, 2])

In [123]:
torch.index_select(b, 1, torch.randperm(5))

tensor([[ 1,  4,  3,  0,  2],
        [ 6,  9,  8,  5,  7],
        [11, 14, 13, 10, 12],
        [16, 19, 18, 15, 17],
        [21, 24, 23, 20, 22]])

In [124]:
torch.index_select(torch.tensor([1, 2, 3, 4, 5]), 0, torch.tensor([2, 3]))

tensor([3, 4])

In [133]:
torch.split(b, 1, dim=1)

(tensor([[ 0],
         [ 5],
         [10],
         [15],
         [20]]),
 tensor([[ 1],
         [ 6],
         [11],
         [16],
         [21]]),
 tensor([[ 2],
         [ 7],
         [12],
         [17],
         [22]]),
 tensor([[ 3],
         [ 8],
         [13],
         [18],
         [23]]),
 tensor([[ 4],
         [ 9],
         [14],
         [19],
         [24]]))

In [127]:
b

tensor([[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14],
        [15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24]])