In [5]:
import json
from pathlib import Path

# read training file
training_file_path = Path('./train.json')
with open(training_file_path.absolute()) as file:
    training_data = json.load(file)

# read test file
test_file_path = Path('./test.json')
with open(test_file_path.absolute()) as file:
    test_data = json.load(file)

# read validation file
validation_file_path = Path('./dev.json')
with open(validation_file_path.absolute()) as file:
    validation_data = json.load(file)

In [6]:
print("Training data length: ", len(training_data))
print("Test data length: ", len(test_data))
print("Validation data length: ", len(validation_data))

Training data length:  7001
Test data length:  2007
Validation data length:  1000


In [7]:
# remove the 'answer' key for each training / test / validation data

def remove_query_tag(data):
    for i in range(len(data)):
        if 'answer' in data[i]:
            del data[i]['answer']
    return data

training_data = remove_query_tag(training_data)
test_data = remove_query_tag(test_data)
validation_data = remove_query_tag(validation_data)

print("Data sample: ", training_data[0])

Data sample:  {'query': '列举出鲁迅的一个别名可以吗？', 'cypher': "match (:ENTITY{name:'鲁迅'})<--(h)-[:Relationship{name:'别名'}]->(q) return distinct q.name limit 1"}


In [8]:
import time
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

# https://github.com/facebookresearch/fairseq/tree/nllb
models_dict = {
    'nllb-1.3B': 'facebook/nllb-200-1.3B',
    'nllb-3.3B': 'facebook/nllb-200-3.3B',
    'nllb-distilled-600M': 'facebook/nllb-200-distilled-600M',
    'nllb-distilled-1.3B': 'facebook/nllb-200-distilled-1.3B',
}


def load_model(model_name):
    print('\tLoading model: %s' % model_name)
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    model = AutoModelForSeq2SeqLM.from_pretrained(models_dict[model_name])
    tokenizer = AutoTokenizer.from_pretrained(models_dict[model_name])
    return (model, tokenizer, device)

model, tokenizer, device = load_model('nllb-distilled-1.3B')


  from .autonotebook import tqdm as notebook_tqdm


	Loading model: nllb-distilled-1.3B


In [9]:
import json

def translation(source, target, text):
    start_time = time.time()

    translator = pipeline('translation', model=model,
                          tokenizer=tokenizer, src_lang=source, tgt_lang=target, device=device)
    output = translator(text, max_length=800)

    end_time = time.time()

    output = output[0]['translation_text']
    result = {'inference_time': end_time - start_time,
              'source': source,
              'target': target,
              'result': output}
    return result


text = """
{
        "query": "获得省、厅级社会科学奖、科技进步奖10余项的高校有哪些主要院系？列举10类好吗？", 
        "cypher": "match (:ENTITY{name:'获得省、厅级社会科学奖、科技进步奖10余项。'})<-[:Relationship{name:'主要奖项'}]-(h),(h)-[:Relationship{name:'主要院系'}]->(p) return distinct p.name limit 10"
}
"""
result = translation("zho_Hans", 'eng_Latn', text)
print(json.loads(result['result']))

{'query': 'Which major departments of universities have won more than 10 provincial, departmental social science awards, and science and technology advancement awards? Is it a good list of 10 categories?', 'cypher': "match (: ENTITY{name:' received more than 10 provincial, departmental social science awards, and science and technology advancement awards . . .'}) <- [:Relationship{name:' major awards'}]-(h),(h)-[:Relationship{name:' major department'}]->(p) return distinct p.name limit 10"}


In [28]:
def translate(json_data: json):
    query_text = json_data['query']
    cypher_text = json_data['cypher']
    query_translate = translation("zho_Hans", 'eng_Latn', query_text)
    cypher_translate = translation("zho_Hans", 'eng_Latn', cypher_text)
    return {
        "query": query_translate['result'],
        "cypher": cypher_translate['result']
    }

result = translate({
    "query": "获得省、厅级社会科学奖、科技进步奖10余项的高校有哪些主要院系？列举10类好吗？",
    "cypher": "match (:ENTITY{name:'获得省、厅级社会科学奖、科技进步奖10余项。'})<-[:Relationship{name:'主要奖项'}]-(h),(h)-[:Relationship{name:'主要院系'}]->(p) return distinct p.name limit 10"
})
print(result)

{'query': 'Which of the major universities have won more than 10 provincial, hall-level social science awards, and technology advancement awards?', 'cypher': "match (:ENTITY{name:' received the provincial, departmental social science award, scientific and technological advancement award more than 10 times。'}) <- [:Relationship{name:' major award'}]-(h),(h) - [:Relationship{name:' major department'}]->(p) return distinct p.name limit 10"}


In [8]:
# process all data
from tqdm import tqdm

BATCH_LENGTH = 1
def process_all_data(data):
    new_data = list()
    for d in tqdm(data):
        new_data.append(translate(d))
    return new_data

In [9]:
# testing the code
test_code_data = process_all_data(training_data[0:100])
print(test_code_data)

100%|██████████| 100/100 [04:11<00:00,  2.52s/it]

[{'query': 'Can you name one of the aliases of Ruxin?', 'cypher': "match (:ENTITY{name:'Luxin'}) <--(h) -[:Relationship{name:'also known as '}]->(q) return distinct q.name limit 1"}, {'query': 'Do you know what the hardness difference is for the most commonly used 301-SH stainless steel tape?', 'cypher': "match ((p:ENTITY{name:'301SH stainless steel band'}) - [:Relationship{name:'hardness coefficient'}]-> (q) return q.name"}, {'query': 'How many people can you find in multiplayer online games?', 'cypher': "match (:ENTITY{name:'many people'}) <- [:Relationship{name:'number of players'}]- (mn) - [:Tag{name:'tag'}]->(n:ENTITY{name:'network game'}) return mn.name"}, {'query': 'Tell me what film is produced in mainland China?', 'cypher': "match (:ENTITY{name:'Mainland China'}) <- [:Relationship{name:'Place of manufacture'}]- (x) return distinct x.name limit 5"}, {'query': 'What are the names of the three-lined whale?', 'cypher': "match (:ENTITY{name:' three-line waveship'}) <- [:Relationshi




In [10]:
# works, now do all training data
# Note: this will take a long time (~ 5 hours on a Nvidia P100 GPU)
training_data_translated = process_all_data(training_data[100:])
print("training data length:", len(training_data_translated))

100%|██████████| 6901/6901 [5:11:09<00:00,  2.71s/it]   

training data length: 6901





In [11]:
test_data = process_all_data(test_data)
print("Test data length: ", len(test_data))

100%|██████████| 2007/2007 [1:36:07<00:00,  2.87s/it]  

Test data length:  2007





In [12]:
validation_data = process_all_data(validation_data)
print("Validation data length: ", len(validation_data))

100%|██████████| 1000/1000 [47:51<00:00,  2.87s/it] 

Validation data length:  1000





In [5]:
from pathlib import Path
import json
train_translated_path = Path('./train_translated.json')
test_translated_path = Path('./test_translated.json')
validation_translated_path = Path('./validation_translated.json')

In [None]:

json.dump(test_code_data + training_data_translated, open(train_translated_path, 'w'))
json.dump(test_data, open(test_translated_path, 'w'))
json.dump(validation_data, open(validation_translated_path, 'w'))

In [6]:
import json
training_data_translated = json.load(open(train_translated_path))
test_data = json.load(open(test_translated_path))
validation_data = json.load(open(validation_translated_path))

In [10]:
from neo4j import GraphDatabase
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "neo4j"))
driver.verify_connectivity()

def try_test_query(query: str):
    with driver.session() as session:
        command = "PROFILE " + query
        try:
            driver.execute_query(command)
            return True, ''
        except Exception as e:
            return False, e.args[0]

isSuccess, error = try_test_query("Match (n)")

print(isSuccess)
print(error)

False
Query cannot conclude with MATCH (must be a RETURN clause, an update clause, a unit subquery call, or a procedure call with no YIELD) (line 1, column 9 (offset: 8))
"PROFILE Match (n)"
         ^


In [11]:
import re
from tqdm import tqdm

def filter_data(data: list):
    error_data_index = dict()
    error_query = set()

    for i in tqdm(range(len(data))):
        d: dict = data[i]
        cypher_query = d.get('cypher')
        if not cypher_query:
            error_data_index[i] = d
            error_query.add(d.get("query"))
            continue
        
        # check for remaining untranslated chinese
        pattern = r'[^\x00-\x7F]'
        if re.search(pattern, cypher_query):
            re_translated_cypher = translate(d).get('cypher')
            if not re.search(pattern, re_translated_cypher):
                d['cypher'] = re_translated_cypher
            else:
                error_data_index[i] = d
                error_query.add(d.get("query"))
        
        # check if the cypher query is valid
        isSuccess, error = try_test_query(d['cypher'])
        if not isSuccess:
            error_query.add(d.get("query"))
            error_data_index[i] = d
            error_data_index[i]['error'] = error

    for d in data:
        if d.get("query") in error_query:
            data.remove(d)

    return error_data_index

In [12]:
def second_filter(data: list):
    new_list = list()

    for d in data:
        if d.get("error"):
            continue
        new_list.append(d)

    return new_list

In [10]:
error_data_train = filter_data(training_data_translated)
error_data_test = filter_data(test_data)
error_data_validation = filter_data(validation_data)

training_data_translated = second_filter(training_data_translated)
test_data = second_filter(test_data)
validation_data = second_filter(validation_data)

100%|██████████| 7001/7001 [29:58<00:00,  3.89it/s]  
 47%|████▋     | 951/2007 [04:26<02:40,  6.58it/s]Your input_length: 400 is bigger than 0.9 * max_length: 400. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
100%|██████████| 2007/2007 [08:56<00:00,  3.74it/s]
 84%|████████▍ | 842/1000 [04:23<00:26,  5.88it/s]Your input_length: 400 is bigger than 0.9 * max_length: 400. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
100%|██████████| 1000/1000 [05:18<00:00,  3.14it/s]


In [19]:
print("length of error data train: ", len(error_data_train))
print("length of error data test: ", len(error_data_test))
print("length of error data validation: ", len(error_data_validation))

length of error data train:  2913
length of error data test:  819
length of error data validation:  441


In [20]:
print("length of training data: ", len(training_data_translated))
print("length of test data: ", len(test_data))
print("length of validation data: ", len(validation_data))

length of training data:  4174
length of test data:  1213
length of validation data:  572


In [2]:
from pathlib import Path

training_data_validated_path = Path('./validated/train_translated_validated.json')
test_data_validated_path = Path('./validated/test_translated_validated.json')
validation_data_validated_path = Path('./validated/validation_translated_validated.json')

In [None]:
json.dump(training_data_translated, open(training_data_validated_path, 'w'))
json.dump(test_data, open(test_data_validated_path, 'w'))
json.dump(validation_data, open(validation_data_validated_path, 'w'))

In [13]:
from pathlib import Path

error_data_train_path = Path('./error_data_train.json')
error_data_test_path = Path('./error_data_test.json')
error_data_validation_path = Path('./error_data_validation.json')

In [None]:
json.dump(error_data_train, open(error_data_train_path, 'w'))
json.dump(error_data_test, open(error_data_test_path, 'w'))
json.dump(error_data_validation, open(error_data_validation_path, 'w'))

In [14]:
import json

error_data_train = json.load(open(error_data_train_path))
error_data_test = json.load(open(error_data_test_path))
error_data_validation = json.load(open(error_data_validation_path))

In [24]:
def extract_original_data_from_error(error_data: dict, data: list):
    new_data = list()
    for i in error_data.keys():
        new_data.append(data[int(i)])
    return new_data

error_data_train_original = extract_original_data_from_error(error_data_train, training_data)
error_data_test_original = extract_original_data_from_error(error_data_test, test_data)
error_data_validation_original = extract_original_data_from_error(error_data_validation, validation_data)

print("length of error data train original:", len(error_data_train_original))
print("length of error data test original:", len(error_data_test_original))
print("length of error data validation original:", len(error_data_validation_original))

print("data sample:", error_data_test_original[0])

length of error data train original: 2913
length of error data test original: 819
length of error data validation original: 441
data sample: {'query': '似乎有个日本军优叫乙夜，他是干哪行的？', 'cypher': "match (q:ENTITY{name:'乙夜[日本军优]'})-[:Relationship{name:'职业'}]-> (p) return  distinct p.name"}


In [17]:
import re
def extract_chinese_phrase(string: str):
    pattern = r'[\u4e00-\u9fff\u3000-\u303f\uff01-\uff0f\uff1a-\uff20\uff3b-\uff40\uff5b-\uff65]+'
    return re.findall(pattern, string)

print(extract_chinese_phrase("match (:ENTITY{name:'获得省、厅级社会科学奖、科技进步奖10余项。'})<-[:Relationship{name:'主要奖项'}]-(h),(h)-[:Relationship{name:'主要院系'}]->(p) return distinct p.name limit 10"))

['获得省、厅级社会科学奖、科技进步奖', '余项。', '主要奖项', '主要院系']


In [21]:
# v2 of the translation. Plus side is that this should preserve the cypher structure. Downside is that the translation is not as accurate as the previous one (due to lacking context)
def translate_v2(json_data: json):
    query_text = json_data['query']
    cypher_text = json_data['cypher']
    query_translate = translation("zho_Hans", 'eng_Latn', query_text)
    cypher_chinese_phrases = extract_chinese_phrase(cypher_text)
    for phrase in cypher_chinese_phrases:
        cypher_text = cypher_text.replace(phrase, translation("zho_Hans", 'eng_Latn', phrase).get('result'))
    return {
        "query": query_translate['result'],
        "cypher": cypher_text
    }

print(translate_v2({
    "query": "获得省、厅级社会科学奖、科技进步奖10余项的高校有哪些主要院系？列举10类好吗？",
    "cypher": "match (:ENTITY{name:'获得省、厅级社会科学奖、科技进步奖10余项。'})<-[:Relationship{name:'主要奖项'}]-(h),(h)-[:Relationship{name:'主要院系'}]->(p) return distinct p.name limit 10"
}))

{'query': 'Which of the major universities have won more than 10 provincial, hall-level social science awards, and technology advancement awards?', 'cypher': "match (:ENTITY{name:'He won the provincial, hall-level social science and technology awards.10The remainder of the item.'})<-[:Relationship{name:'Main awards'}]-(h),(h)-[:Relationship{name:'Main Faculty'}]->(p) return distinct p.name limit 10"}


In [23]:
from tqdm import tqdm

def process_all_data_v2(data):
    new_data = list()
    for d in tqdm(data):
        new_data.append(translate_v2(d))
    return new_data

In [25]:
error_data_train_original_translated = process_all_data_v2(error_data_train_original)
error_data_test_original_translated = process_all_data_v2(error_data_test_original)
error_data_validation_original_translated = process_all_data_v2(error_data_validation_original)

100%|██████████| 2913/2913 [1:18:17<00:00,  1.61s/it]
100%|██████████| 819/819 [21:02<00:00,  1.54s/it]
100%|██████████| 441/441 [12:07<00:00,  1.65s/it]


In [29]:
filter_data(error_data_train_original_translated)
filter_data(error_data_test_original_translated)
filter_data(error_data_validation_original_translated)

second_filter(error_data_train_original_translated)
second_filter(error_data_test_original_translated)
second_filter(error_data_validation_original_translated)

100%|██████████| 2913/2913 [18:01<00:00,  2.69it/s] 
100%|██████████| 819/819 [05:29<00:00,  2.49it/s]
 85%|████████▌ | 376/441 [01:54<00:24,  2.68it/s]Your input_length: 800 is bigger than 0.9 * max_length: 800. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 892 is bigger than 0.9 * max_length: 800. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
100%|██████████| 441/441 [03:10<00:00,  2.32it/s]


[{'query': "How many novels do you know about starting a girl's web?",
  'cypher': "match (:ENTITY{name:'The start of the Girl Network'})<-[:Relationship{name:'The first website'}]- (m) return m.name limit 3"},
 {'query': 'Which of the major universities have won more than 10 provincial, hall-level social science awards, and technology advancement awards?',
  'cypher': "match (:ENTITY{name:'He won the provincial, hall-level social science and technology awards.10The remainder of the item.'})<-[:Relationship{name:'Main awards'}]-(h),(h)-[:Relationship{name:'Main Faculty'}]->(p) return distinct p.name limit 10"},
 {'query': 'What are the three examples of a species having a tier 3 label relationship?',
  'cypher': "match (n)-[:Tag*3{name:'The label'}]->(p:ENTITY{name:'Biological species'}) RETURN distinct n.name limit 3"},
 {'query': 'What TV series have 30 episodes? Can I list 15?',
  'cypher': "match (:ENTITY{name:'30'})<-[:Relationship{name:'The set'}]- (x) return x.name limit 15"},
 

In [31]:
print(len(error_data_train_original_translated))
print(len(error_data_test_original_translated))
print(len(error_data_validation_original_translated))

2511
709
380


In [32]:
json.dump(error_data_train_original_translated, open('./validated/re_translated_training.json', 'w'))
json.dump(error_data_test_original_translated, open('./validated/re_translated_test.json', 'w'))
json.dump(error_data_validation_original_translated, open('./validated/re_translated_validate.json', 'w'))

In [5]:
import json

training_data = json.load(open(training_data_validated_path, 'r')) + json.load(open('./validated/re_translated_training.json', 'r'))
test_data = json.load(open(test_data_validated_path, 'r')) + json.load(open('./validated/re_translated_test.json', 'r'))
validation_data = json.load(open(validation_data_validated_path, 'r')) + json.load(open('./validated/re_translated_validate.json', 'r'))

In [12]:
import re

def remove_invalid_entry(data: list):
    new_data = list()
    for d in data:
        # check if contains chinese characters
        pattern = r'[^\x00-\x7F]'
        if re.search(pattern, d['cypher']):
            continue
        if re.search(pattern, d['query']):
            continue
        new_data.append(d)
    return new_data

training_data = remove_invalid_entry(training_data)
test_data = remove_invalid_entry(test_data)
validation_data = remove_invalid_entry(validation_data)


In [13]:
print("length of training data: ", len(training_data))
print("length of test data: ", len(test_data))
print("length of validation data: ", len(validation_data))

length of training data:  6562
length of test data:  1878
length of validation data:  936


In [14]:
json.dump(training_data, open('./validated/training_data.json', 'w'), indent=4)
json.dump(test_data, open('./validated/test_data.json', 'w'), indent=4)
json.dump(validation_data, open('./validated/validation_data.json', 'w'), indent=4)