# Create graph

In [None]:
# Setup
!pip install -U pyTigerGraph
!pip install -U pandas


In [None]:
# Imports
import pyTigerGraph as tg
import json
import pandas as pd
import os

# Connection parameters
# Each value can be supplied through an environment variable (TG_HOST,
# TG_USERNAME, TG_PASSWORD) so that real credentials never have to be saved
# inside the notebook. When a variable is unset, the placeholder below is
# used, exactly as before.
hostName = os.environ.get("TG_HOST", "https://") # REPLACE WITH YOUR HOSTNAME
userName = os.environ.get("TG_USERNAME", "") # REPLACE WITH YOUR USERNAME
password = os.environ.get("TG_PASSWORD", "") # REPLACE WITH YOUR PASSWORD
# Connection without a graph name or token; later cells rebind `conn` once
# the graph exists.
conn = tg.TigerGraphConnection(host=hostName, username=userName, password=password)

# Printed once the connection object has been constructed.
print("Connected")

In [None]:
def erase():
  """Delete the Wikidata graph's data, then drop the graph and its global types.

  Sends two GSQL scripts through the module-level ``conn`` and prints the
  output of each, so that a failed delete or drop is visible to the reader:

  1. inside graph Wikidata: DELETE FROM Statement, subject, object, predicate;
  2. at global scope: DROP GRAPH Wikidata followed by the vertex and edge
     type drops.

  Returns None.
  """
  # Step 1: clear the contents of the graph.
  results = conn.gsql('''
  USE GRAPH Wikidata
  DELETE FROM Statement
  DELETE FROM subject
  DELETE FROM object
  DELETE FROM predicate
  ''')
  print(results)

  # Step 2: drop the graph and the global types.
  # The script also names Object, Entity, Relation and Subject, which create()
  # in this notebook does not define - presumably leftovers from an earlier
  # schema; TODO confirm.
  # NOTE(review): vertex types are dropped before the subject/object edge
  # types - confirm the server accepts this order.
  results = conn.gsql('''
  USE GLOBAL
  DROP GRAPH Wikidata
  DROP EDGE predicate
  DROP VERTEX Object
  DROP VERTEX Entity
  DROP VERTEX Relation
  DROP VERTEX Subject
  DROP VERTEX Resource
  DROP VERTEX Statement
  DROP VERTEX Predicate
  DROP EDGE subject
  DROP EDGE object
  ''')
  print(results)

def create():
  """Create the Wikidata schema, graph and CSV loading jobs on the server.

  Runs five GSQL commands through the module-level ``conn`` and prints the
  output of every one of them, so that a failure in any step is visible:

  1. global vertex types Resource, Statement and Predicate plus the directed
     edge types object, subject and predicate;
  2. the Wikidata graph built from those types;
  3. loading job ``load_predicate`` (CSV columns 0, 1 -> Predicate);
  4. loading job ``load_entities`` (CSV columns 0, 1 -> Resource);
  5. loading job ``load_statements`` (column 0 -> Statement with an empty
     Name; columns 1, 2 and 3 -> subject, predicate and object edges).

  Every loading job reads comma-separated files with a header row through
  the ``MyDataSource`` file variable.

  Returns None.
  """
  schema = '''
    USE GLOBAL
    CREATE VERTEX Resource(PRIMARY_ID id UINT, Name STRING) WITH primary_id_as_attribute="true"
    CREATE VERTEX Statement(PRIMARY_ID id UINT, Name STRING) WITH primary_id_as_attribute="true"
    CREATE VERTEX Predicate(PRIMARY_ID id UINT, Name STRING) WITH primary_id_as_attribute="true"
    CREATE DIRECTED EDGE object (FROM Statement, TO Resource)
    CREATE DIRECTED EDGE subject (FROM Statement, TO Resource)
    CREATE DIRECTED EDGE predicate (FROM Statement, TO Predicate)
  '''
  results = conn.gsql(schema)
  print(results)

  results = conn.gsql('CREATE GRAPH Wikidata(Resource, Statement, Predicate, object, subject, predicate)')
  print(results)

  # Predicates: one vertex per CSV row (id, name).
  results = conn.gsql('''
    USE GRAPH Wikidata
    BEGIN
    CREATE LOADING JOB load_predicate FOR GRAPH Wikidata {
    DEFINE FILENAME MyDataSource;  
    LOAD MyDataSource TO VERTEX Predicate VALUES($0, $1) USING SEPARATOR=",", HEADER="true", EOL="\\n", QUOTE="double";
    }
    END
    ''')
  print(results)

  # Entities: one Resource vertex per CSV row (id, name).
  results = conn.gsql('''
    USE GRAPH Wikidata
    BEGIN
    CREATE LOADING JOB load_entities FOR GRAPH Wikidata {
    DEFINE FILENAME MyDataSource;  
    LOAD MyDataSource TO VERTEX Resource VALUES($0, $1) USING SEPARATOR=",", HEADER="true", EOL="\\n", QUOTE="double";
    }
    END
    ''')
  print(results)

  # Statements: each CSV row yields a Statement vertex and its three edges.
  results = conn.gsql('''
    USE GRAPH Wikidata
    BEGIN
    CREATE LOADING JOB load_statements FOR GRAPH Wikidata {
    DEFINE FILENAME MyDataSource;  
    LOAD MyDataSource TO VERTEX Statement VALUES($0, "") USING SEPARATOR=",", HEADER="true", EOL="\\n", QUOTE="double";
    LOAD MyDataSource TO EDGE subject VALUES($0, $1) USING SEPARATOR=",", HEADER="true", EOL="\\n", QUOTE="double";
    LOAD MyDataSource TO EDGE predicate VALUES($0, $2) USING SEPARATOR=",", HEADER="true", EOL="\\n", QUOTE="double";
    LOAD MyDataSource TO EDGE object VALUES($0, $3) USING SEPARATOR=",", HEADER="true", EOL="\\n", QUOTE="double";
    }
    END
    ''')
  print(results)
  
# Build the schema, the graph and its loading jobs.
create()

# Point the existing connection at the new graph, create a secret for it and
# exchange the secret for a token; only the first element of what getToken
# returns is kept.
conn.graphname = "Wikidata"
secret = conn.createSecret()
authToken = conn.getToken(secret)[0]

# Rebind `conn` to a connection that carries the graph name and the token.
conn = tg.TigerGraphConnection(
    host=hostName,
    graphname="Wikidata",
    username=userName,
    password=password,
    apiToken=authToken,
)

## Download data

In [None]:
!wget https://www.dropbox.com/s/lnbhc8yuhit4wm5/wikidata5m_alias.tar.gz?dl=1 -O wikidata5m_alias.tar.gz
!wget https://www.dropbox.com/s/563omb11cxaqr83/wikidata5m_all_triplet.txt.gz?dl=1 -O wikidata5m_all_triplet.txt.gz
!gunzip ./wikidata5m_all_triplet.txt.gz
!tar -zxvf wikidata5m_alias.tar.gz

In [None]:
# Parse both files into lists of tab-separated fields, one list per line.
# export_data() later uses field 0 as the id and field 1 as the name.
# The encoding is given explicitly so non-ASCII names decode the same way on
# every platform (assumes the files are UTF-8 - TODO confirm).
with open('./wikidata5m_relation.txt', encoding='utf-8') as f:
  relations = [line.strip().split('\t') for line in f]

with open('./wikidata5m_entity.txt', encoding='utf-8') as f:
  entities = [line.strip().split('\t') for line in f]

In [None]:
def export_data(data, filename, per_page=None):
  """Write (id, name) records to CSV, either as one file or in pages.

  Args:
    data: iterable of sequences; element 0 is an id string whose first
      character is a prefix that gets dropped before the rest is parsed as an
      integer (e.g. 'Q42' -> 42), element 1 is the value to store.
    filename: output path without extension, relative to the working
      directory.
    per_page: when None, everything goes to ``./<filename>.csv``; otherwise
      rows are written in chunks of this size to ``./<filename>_<n>.csv``
      with n starting at 0.

  Every file written starts with the header ``id,value``, including the
  single-file case with empty ``data``. With paging and no rows, no file is
  written.

  Raises:
    IndexError: if a record has fewer than two elements.
    ValueError: if the id (without its prefix) is not an integer.
  """
  # Naming the columns explicitly guarantees the header even with zero rows.
  columns = ['id', 'value']
  rows = [
      {'id': int(item[0][1:]), 'value': item[1]}
      for item in data
  ]

  if per_page is None:
    pd.DataFrame(rows, columns=columns).to_csv(f'./{filename}.csv', index=False)
    return

  # Slice one page at a time instead of building a second list of all pages.
  for page_number, start in enumerate(range(0, len(rows), per_page)):
    page = rows[start:start + per_page]
    pd.DataFrame(page, columns=columns).to_csv(f'./{filename}_{page_number}.csv', index=False)

## Loading predicates

In [None]:
# Write every relation to ./relations.csv as a single, unpaged file.
export_data(data=relations, filename='relations')

In [None]:
# Upload the exported relations CSV through the 'load_predicate' loading job
# and show the server's response.
posts_file = './relations.csv'
results = conn.uploadFile(
    posts_file,
    fileTag='MyDataSource',
    jobName='load_predicate',
)
print(json.dumps(results, indent=2))

## Loading entities

In [None]:
# Entities are written as entities/entities_<n>.csv, 5000 rows per file.
os.makedirs('entities', exist_ok=True)
export_data(data=entities, filename='entities/entities', per_page=5000)

In [None]:
from os import listdir
import tqdm
import time
def upload_data(folder, job, wait=2, start_from=0):
    """Upload every ``.csv`` file in ``folder`` through a loading job.

    Args:
        folder: directory that holds the CSV files.
        job: name of the loading job; each file is passed to it under the
            ``MyDataSource`` file tag via the module-level ``conn``.
        wait: seconds to sleep after each upload.
        start_from: number of files (in sorted name order) to skip, for
            resuming an interrupted run.

    Returns None. The response of each upload is not inspected.
    """
    # os.listdir returns names in arbitrary order; sorting makes the position
    # that `start_from` refers to the same on every run.
    csv_files = sorted(name for name in listdir(folder) if name.endswith('.csv'))

    for position, file in enumerate(tqdm.tqdm(csv_files)):
        if position < start_from:
            continue
        conn.uploadFile(f'{folder}/{file}', fileTag='MyDataSource', jobName=job)
        # Pause between uploads (presumably to avoid overloading the server -
        # TODO confirm).
        time.sleep(wait)

# Send every entity CSV through the 'load_entities' job, using the default
# pause between files and no skipped files.
upload_data(folder='./entities', job='load_entities')

## Loading fact triplets

In [None]:

import tqdm
import os

def save_triplets(folder, per_page=5000, source='./wikidata5m_all_triplet.txt'):
    """Split the triplet dump into numbered CSV files for the loading job.

    Each line of ``source`` holds three tab-separated ids (subject, predicate,
    object). The first character of every id is dropped and the row is written
    as ``<line number>,<s>,<v>,<o>``, with line numbers starting at 1.

    Args:
        folder: existing directory that receives ``triplet<n>.csv`` files,
            with n starting at 0.
        per_page: maximum number of data rows per output file.
        source: path of the tab-separated triplet file to read.

    Every output file starts with the header ``statement,s,v,o`` and is
    opened for writing, so running the export again replaces earlier output
    instead of appending to it.

    Raises:
        ValueError: if a line does not contain exactly three fields.
    """
    header = 'statement,s,v,o\n'

    def open_page(page_index):
        # Start output file number `page_index` with its header line.
        page = open(f'{folder}/triplet{page_index}.csv', 'w')
        page.write(header)
        return page

    # Count the rows up front; the number is only the progress-bar total.
    with open(source) as f:
        num_rows = sum(1 for _ in f)

    current_file_index = 0
    current_count = 0
    triplets_file = open_page(current_file_index)
    try:
        with open(source) as f:
            for index, row in tqdm.tqdm(enumerate(f), total=num_rows):
                # The current file is full: move on to the next one.
                if current_count == per_page:
                    triplets_file.close()
                    current_file_index += 1
                    current_count = 0
                    triplets_file = open_page(current_file_index)

                s, v, o = row.strip().split('\t')
                triplets_file.write(f'{index + 1},{s[1:]},{v[1:]},{o[1:]}\n')
                current_count += 1
    finally:
        # Close the last (possibly partial) file even if a row fails to parse.
        triplets_file.close()

# Triplets are written as triplets/triplet<n>.csv, 10000 rows per file.
os.makedirs('triplets', exist_ok=True)
save_triplets(folder='triplets', per_page=10000)

100%|██████████| 21354359/21354359 [00:48<00:00, 438841.63it/s]


In [None]:
# Send every triplet CSV through the 'load_statements' job, pausing
# 10 seconds after each file and skipping none.
upload_data(folder='./triplets', job='load_statements', wait=10, start_from=0)