In [2]:
from neo4j import GraphDatabase
import os

# Connection settings come from the environment so real credentials never have to be
# committed with the notebook; the fallbacks are the previously hard-coded local values.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "neo4j")

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
# Check the connection up front instead of on the first query.
driver.verify_connectivity()

In [3]:
def get_items(name: str):
    """Return the non-null `name` property of every node carrying the label `name`.

    Args:
        name: Node label to match. It is interpolated directly into the query text,
            so it must be a trusted identifier.
            NOTE(review): do not pass user-controlled input here without validating it.

    Returns:
        A list with one entry per matched node whose `n.name` is not None.
    """
    # driver.execute_query runs on its own, so no explicit session is opened here
    # (the previous version opened one and never used it).
    result = driver.execute_query(f"match (n:{name}) return n.name")
    fetched_names = (record.get("n.name") for record in result.records)
    return [item for item in fetched_names if item is not None]

# Pull the name pools for each node label used by the templates.
player_list = get_items(name="Player")
club_list = get_items(name="Club")
league_list = get_items(name="League")

In [4]:
# Report how many names were fetched for each entity type.
for caption, fetched in (
    ("Length of player list:", player_list),
    ("Length of club list:", club_list),
    ("Length of League list:", league_list),
):
    print(caption, len(fetched))

Length of player list: 68599
Length of club list: 264
Length of League list: 35


In [5]:
from pathlib import Path
import json

sentence_template_path = Path("./templates/sentence_template.json")
cypher_template_path = Path("./templates/cypher_template.json")

# Context managers close the file handles deterministically; the explicit encoding keeps
# reading consistent with how generated_sentences.json is written (utf8).
with sentence_template_path.open(encoding="utf8") as template_file:
    sentence_template = json.load(template_file)
with cypher_template_path.open(encoding="utf8") as template_file:
    cypher_template = json.load(template_file)

In [15]:
import random
import re

class Sampler:
    """Draw entity names for template tags without repeating a name.

    Names already handed out are remembered per sentence id and per
    (sentence hash, tag type) combination, so repeated calls with the same key
    always yield a name that has not been returned for that key before.
    """

    def __init__(self, player_list=None, clubs_list=None, league_list=None) -> None:
        """Build the name pools.

        Args:
            player_list: Iterable of player names (None means empty).
            clubs_list: Iterable of club names (None means empty).
            league_list: Iterable of league names (None means empty).
        """
        # sentence_id -> {hash(sentence_hash) ^ hash(tag_type) -> set of names already drawn}
        self.sampled_player_sentence = dict()
        # Names are stripped once here, so the pools and the "already drawn" sets hold
        # exactly the same strings; stripping only at draw time let names with
        # surrounding whitespace be drawn again.
        self.player_set: set = self._normalise(player_list)
        self.club_set: set = self._normalise(clubs_list)
        self.league_set: set = self._normalise(league_list)

    @staticmethod
    def _normalise(names) -> set:
        """Return a fresh set of whitespace-stripped names (empty for None)."""
        if names is None:
            return set()
        return {name.strip() for name in names}

    def sample(self, tag: str, sentence_id: int, sentence_hash: str):
        """Draw an unused name for `tag`.

        Args:
            tag: Placeholder such as "<player_x>"; the part before the last underscore
                selects the pool (case-insensitive): player, club or league.
            sentence_id: Id under which drawn names are recorded.
            sentence_hash: Identifies the sentence within `sentence_id`.

        Returns:
            A name from the selected pool not yet returned for this key.

        Raises:
            ValueError: If the tag is malformed or names an unknown type.
            IndexError: If every name in the pool has already been returned for this key.
        """
        extract_pattern = r'<(\w+)_(\w+)>'
        parsed_tag = re.match(extract_pattern, tag)
        if parsed_tag is None:
            # Previously surfaced as an AttributeError on None.
            raise ValueError(f"Malformed tag: {tag!r}")

        tag_type = parsed_tag.group(1).lower()

        pools = {
            "player": self.player_set,
            "club": self.club_set,
            "league": self.league_set,
        }
        if tag_type not in pools:
            raise ValueError("Invalid tag type")
        return self.__sample_from_list(pools[tag_type], sentence_id, sentence_hash, tag_type)

    def __sample_from_list(self, sample_set: set, sentence_id: int, sentence_hash: str, tag_type: str):
        """Pick a random unused name from `sample_set` and record it as used."""
        already_sampled_item_dict: dict = self.sampled_player_sentence.get(sentence_id, dict())
        dict_hash = hash(sentence_hash) ^ hash(tag_type)

        already_sampled_item: set = already_sampled_item_dict.get(dict_hash, set())

        valid_samples = sample_set.difference(already_sampled_item)
        if not valid_samples:
            # Same exception type random.choice would raise on an empty sequence,
            # but with a message that says what ran out.
            raise IndexError(
                f"No unused {tag_type} names left for sentence id {sentence_id!r}"
            )
        sampled_item = random.choice(tuple(valid_samples))

        already_sampled_item.add(sampled_item)
        already_sampled_item_dict[dict_hash] = already_sampled_item
        self.sampled_player_sentence[sentence_id] = already_sampled_item_dict

        return sampled_item


# Smoke test: draw names for a few tags across two example sentences, then show the
# sampler's bookkeeping.
sampler = Sampler(player_list, club_list, league_list)
sentence1 = "The player <player_x> plays for <club_x> in the <league_x> league"
sentence2 = "The player <player_y> in the <league_y> league"
demo_calls = (
    ("<player_x>", sentence1),
    ("<player_y>", sentence2),
    ("<player_z>", sentence1),
    ("<league_x>", sentence1),
    ("<league_y>", sentence2),
    ("<club_x>", sentence1),
)
for demo_tag, demo_sentence in demo_calls:
    print(sampler.sample(demo_tag, 1, str(hash(demo_sentence))))
print(sampler.sampled_player_sentence)

Bernard Dumot
Michel Albaladéjo
Salih Altın
Fußball-Regionalliga Südwest
Serie D
Brentford F.C.
{1: {3177319328087872474: {'Salih Altın', 'Bernard Dumot'}, -8233536537466523547: {'Michel Albaladéjo'}, 2821498810510939170: {'Fußball-Regionalliga Südwest'}, -8754017992745699427: {'Serie D'}, -5574422191687174199: {'Brentford F.C.'}}}


In [7]:
import re

def extract_tag(sentence: str):
    """Return every `<word_word>` placeholder found in `sentence`, in order of appearance."""
    # The capture group spans the whole pattern, so the full match is the tag itself.
    return [found.group(0) for found in re.finditer(r'(<\w+_\w+>)', sentence)]

# Quick check of the tag extractor on a two-tag question.
demo_question = "What is the list of players who have appeared in both <league_x> and <league_y>?"
print(extract_tag(demo_question))

['<league_x>', '<league_y>']


In [32]:
import random
from tqdm import tqdm

def generate_sentence(sentence_list: list, sentence_id: int, iter_count: int = 5000):
    """Generate tagged question/Cypher pairs for one template id.

    Each iteration picks a random sentence from `sentence_list`. For every distinct tag
    in it, a sampled name is appended after the tag (the tag itself is kept), both in
    the sentence and in the Cypher template stored under `str(sentence_id)`.

    Args:
        sentence_list: Candidate sentence templates containing `<type_suffix>` tags.
        sentence_id: Template id; selects the Cypher template and keys the sampler.
        iter_count: Number of pairs to generate.

    Returns:
        A list of dicts with the keys "query" and "cypher".

    Raises:
        IndexError: If `sentence_list` is empty, or if the sampler runs out of unused
            names. One sampler is shared by all iterations, so names already drawn for
            the same sentence template and tag type are not drawn again.
    """
    sampler = Sampler(player_list, club_list, league_list)
    sentence_cypher = list()
    cypher_query = cypher_template.get(str(sentence_id))
    for _ in tqdm(range(iter_count)):
        sentence: str = random.choice(sentence_list)
        # Key the sampler on the untouched template. Hashing `sentence` inside the tag
        # loop would hash the partially filled text, giving every tag a different key
        # and letting two tags of the same type receive the same name.
        template_hash = str(hash(sentence))
        # Drop repeated tags (keeping first-seen order): str.replace already rewrites
        # every occurrence, so a second pass would insert a second name after the tag.
        tags = list(dict.fromkeys(extract_tag(sentence)))

        # Strings are immutable, so rebinding leaves cypher_query itself untouched.
        sentence_query = cypher_query
        for tag in tags:
            sample_name = sampler.sample(tag, sentence_id, template_hash)
            tagged_value = tag + " " + sample_name
            sentence = sentence.replace(tag, tagged_value)
            sentence_query = sentence_query.replace(tag, tagged_value)

        sentence_cypher.append({
            "query": sentence,
            "cypher": sentence_query
        })
    return sentence_cypher

# Preview a handful of generated pairs for template 11.
demo_pairs = generate_sentence(sentence_template["11"], 11, 5)
print(demo_pairs)

100%|██████████| 5/5 [00:00<00:00, 13715.84it/s]

[{'query': 'What players have the unique experience of playing in both <league_x> Football League First Division and <league_y> Segunda Federación?', 'cypher': "MATCH (p:Player)-[:PLAYED_FOR]->(:Club)-[:IS_IN]->(l:League) WHERE l.name IN ['<league_x> Football League First Division', '<league_y> Segunda Federación'] RETURN p.name"}, {'query': 'Who has split their career between <league_x> Regionalliga and <league_y> EFL League Two?', 'cypher': "MATCH (p:Player)-[:PLAYED_FOR]->(:Club)-[:IS_IN]->(l:League) WHERE l.name IN ['<league_x> Regionalliga', '<league_y> EFL League Two'] RETURN p.name"}, {'query': "Can you name the players who've competed in both <league_x> Oberliga Rheinland-Pfalz/Saar and <league_y> National League?", 'cypher': "MATCH (p:Player)-[:PLAYED_FOR]->(:Club)-[:IS_IN]->(l:League) WHERE l.name IN ['<league_x> Oberliga Rheinland-Pfalz/Saar', '<league_y> National League'] RETURN p.name"}, {'query': 'Which individuals have football history in both <league_x> Ligue 2 and <lea




In [22]:
import multiprocessing
import itertools
import json

# Pool worker: produce the sentence/Cypher pairs for a single template id
def generate_sentences(template_id):
    """Generate 10 pairs for the sentence templates registered under `template_id`."""
    candidate_sentences = sentence_template[str(template_id)]
    numeric_id = int(template_id)
    return generate_sentence(candidate_sentences, numeric_id, iter_count=10)

def validate_query(query: str):
    """Report whether the database runs `query` under PROFILE without raising.

    Args:
        query: Cypher text to check.

    Returns:
        True if execution raised nothing, False if it raised any Exception.
        Note that the broad catch means connectivity failures are also reported as
        False, not only Cypher errors.
    """
    query = "PROFILE " + query
    # driver.execute_query runs on its own, so no explicit session is opened here
    # (the previous version opened one per call and never used it).
    try:
        driver.execute_query(query)
    except Exception:
        return False
    return True

# Create a multiprocessing pool
with multiprocessing.Pool(processes=6) as pool:
    # Each worker generates the pairs for one template id.
    results = pool.map(generate_sentences, sentence_template.keys())

# Flatten the per-template lists into a single list of {"query", "cypher"} records.
combined_result = list(itertools.chain.from_iterable(results))

# Context manager so the output file is flushed and closed deterministically.
# NOTE(review): the file is written before validation, so it still contains queries
# that the check below rejects — confirm that this is intended.
with open("generated_sentences.json", "w", encoding="utf8") as output_file:
    json.dump(combined_result, output_file, indent=4, ensure_ascii=False)

# validate result
# Build a new list instead of removing from the list being iterated: removing during
# iteration skips the element that follows each removal, leaving it unvalidated.
valid_results = []
for result in combined_result:
    cypher = result.get("cypher")
    if validate_query(cypher):
        valid_results.append(result)
    else:
        print(cypher)
combined_result = valid_results

100%|██████████| 10/10 [00:00<00:00, 690.42it/s]
100%|██████████| 10/10 [00:00<00:00, 744.90it/s]
100%|██████████| 10/10 [00:00<00:00, 992.88it/s]
100%|██████████| 10/10 [00:00<00:00, 1074.47it/s]
100%|██████████| 10/10 [00:00<00:00, 1126.29it/s]
100%|██████████| 10/10 [00:00<00:00, 29269.39it/s]
100%|██████████| 10/10 [00:00<00:00, 11187.79it/s]
100%|██████████| 10/10 [00:00<00:00, 21788.59it/s]
100%|██████████| 10/10 [00:00<00:00, 68.54it/s]
100%|██████████| 10/10 [00:00<00:00, 70.51it/s]
100%|██████████| 10/10 [00:00<00:00, 70.16it/s]
