In [9]:
import json
import os
import shutil
import string
from pathlib import Path
from urllib.parse import quote

import openai
import pandas as pd
import requests
import yaml
from google.cloud import language
from neo4j import GraphDatabase


In [10]:


# Load the notebook settings (API keys, Neo4j credentials) from config.yaml.
# A malformed file must stop the notebook right here: printing the YAML error
# and carrying on only resurfaces later as a confusing NameError on PARAM.
with open("config.yaml", "r", encoding="utf-8") as stream:
    PARAM = yaml.safe_load(stream)

if not isinstance(PARAM, dict):
    raise ValueError("config.yaml must contain a top-level mapping of settings")

os.environ['OPENAI_API_KEY'] = PARAM["OPENAI_API_KEY"]
openai.api_key = os.environ["OPENAI_API_KEY"]

# Google Cloud reads the service-account JSON from the path stored in
# GOOGLE_APPLICATION_CREDENTIALS. Put the JSON file in the project root and,
# if its name differs from the default below, set an optional
# "GOOGLE_APPLICATION_CREDENTIALS" key in config.yaml.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = PARAM.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    './nlp-extraction-395102-7408a199af50.json',
)

In [11]:
# Read example.jsonl: one JSON object per line.
# The encoding is pinned because the texts contain non-ASCII characters and the
# platform default is not UTF-8 everywhere; blank lines (e.g. a trailing
# newline at the end of the file) are skipped instead of crashing json.loads.
with open('example.jsonl', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [12]:
# Preview the "text" field of the second record loaded from example.jsonl.
data[1]["text"]

'NVIDIA today announced a multi-year collaboration with Microsoft to build one of the most powerful AI supercomputers in the world, powered by Microsoft Azure’s advanced supercomputing infrastructure combined with NVIDIA GPUs, networking and full stack of AI software to help enterprises train, deploy and scale AI, including large, state-of-the-art models.'

In [13]:

def extract_seller_buyer_from_text(text: str, model_name: str = "gpt-3.5-turbo"):
    """Ask an OpenAI chat model which organizations sell to which in ``text``.

    The request offers the model a single function, ``extract_relations``,
    whose only argument ``seller-buyer`` is an array of ``"seller,buyer"``
    strings. The model is free to call it or to answer in plain text
    (``function_call="auto"``).

    Args:
        text: Passage to analyse.
        model_name: Chat model name passed to ``openai.ChatCompletion.create``.

    Returns:
        The function-call arguments (a JSON string) when the model called
        ``extract_relations``; otherwise the plain message content.
    """
    system_prompt = "You are a helpful relationship extractor."
    instruction = (
        "Who sells to whom. Answer strictly in this pairwise binary format: "
        "seller,buyer. If there are multiple pairs, list each pair in a new line."
    )
    # Put the question on its own line after the passage. The original wrote
    # "\W" here, an invalid escape that sent a literal backslash to the model.
    prompt = f"{text}\n{instruction}"

    completion = openai.ChatCompletion.create(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        functions=[
            {
                "name": "extract_relations",
                "description": "Extract the seller,buyer from the text.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "seller-buyer": {
                            "type": "array",
                            "items": {"type": "string"},
                            "description": instruction,
                        }
                    },
                    "required": ["seller-buyer"],
                },
            }
        ],
        function_call="auto",
        temperature=0,
    )

    message = completion.choices[0].message
    # A response may omit `function_call` or carry it as None; both mean the
    # model answered in plain text, so fall back to the message content.
    function_call = getattr(message, "function_call", None)
    if function_call is not None:
        return function_call.arguments
    return message.content




In [14]:
#extract_seller_buyer_from_text("Microsoftの最新コンソールには、AMDのテクノロジーが採用されています。AMDのエンジニアはXbox One XおよびXbox One Sの設計者と緊密に連携し、強力なマルチコアAMD CPUおよびグラフィックス・テクノロジーを搭載する、カスタマイズされたプロセッサーを構築しました。")

In [15]:

def analyze_text_entities(text: str) -> language.AnalyzeEntitiesResponse:
    """Run Google Cloud Natural Language entity analysis on ``text``.

    A new ``LanguageServiceClient`` is created on every call and the text is
    submitted as a plain-text document.

    Args:
        text: The text to analyse.

    Returns:
        The response of ``LanguageServiceClient.analyze_entities``.
    """
    nl_client = language.LanguageServiceClient()
    plain_text_doc = language.Document(
        content=text,
        type_=language.Document.Type.PLAIN_TEXT,
    )
    entities_response = nl_client.analyze_entities(document=plain_text_doc)
    return entities_response

def show_text_entities(response: language.AnalyzeEntitiesResponse):
    """Print the entities of ``response`` as a presto-style text table.

    Each entity becomes one row holding its name, type name, salience
    (formatted as a percentage) and the "mid" / "wikipedia_url" metadata
    values, which default to an empty string when absent.
    """
    header = ("name", "type", "salience", "mid", "wikipedia_url")

    rows = []
    for ent in response.entities:
        meta = ent.metadata
        rows.append(
            (
                ent.name,
                ent.type_.name,
                ent.salience,
                meta.get("mid", ""),
                meta.get("wikipedia_url", ""),
            )
        )

    table = pd.DataFrame(columns=header, data=rows)
    rendered = table.to_markdown(index=False, tablefmt="presto", floatfmt=".0%")
    print(rendered)


In [16]:
_WIKIDATA_SPARQL_URL = 'https://query.wikidata.org/sparql'
_WIKIDATA_HEADERS = {'User-agent': 'your bot 0.1'}
_WIKIDATA_TIMEOUT = 30  # seconds; without a timeout a stalled request hangs the notebook


def _run_wikidata_query(query):
    """Send ``query`` to the Wikidata SPARQL endpoint and return the result bindings.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status
            (e.g. rate limiting), instead of failing later while parsing JSON.
    """
    response = requests.get(
        _WIKIDATA_SPARQL_URL,
        params={'format': 'json', 'query': query},
        headers=_WIKIDATA_HEADERS,
        timeout=_WIKIDATA_TIMEOUT,
    )
    response.raise_for_status()
    return response.json()["results"]["bindings"]


def get_wikidata_id(wiki_url):
    """Resolve a Wikipedia article URL to the id of the Wikidata item it is about.

    Args:
        wiki_url: Full Wikipedia article URL, e.g.
            ``"https://en.wikipedia.org/wiki/Nvidia"``.

    Returns:
        ``{"wikidata_id": "<Q-id>"}`` taken from the first result row.

    Raises:
        IndexError: if no Wikidata item is linked to ``wiki_url`` (same
            exception type the bare ``[0]`` lookup raised before, now with a
            readable message).
    """
    # The query service stores article IRIs percent-encoded, so a URL that
    # carries raw non-ASCII characters (e.g. a ja.wikipedia.org title) would
    # never match. Only non-ASCII characters are escaped here; ASCII
    # punctuation, including existing "%" escapes, is left untouched.
    encoded_url = quote(wiki_url, safe=string.punctuation)
    query = f'''
    prefix schema: <http://schema.org/>
    SELECT * WHERE {{
        <{encoded_url}> schema:about ?item .
    }}
    '''
    bindings = _run_wikidata_query(query)
    if not bindings:
        raise IndexError(f"no Wikidata item is linked to {wiki_url!r}")
    return {"wikidata_id": bindings[0]["item"]["value"].split("/")[-1]}


def get_wikidata_description(wikidata_id):
    """Fetch the English description of a Wikidata item.

    Args:
        wikidata_id: Wikidata item id such as ``"Q182477"``.

    Returns:
        ``{"description": <str>}``, or ``{"description": None}`` when the query
        returns no description (mirrors how ``get_father_item`` reports a
        missing description).
    """
    query = f'''
    SELECT ?description WHERE
    {{
        SERVICE wikibase:label {{
            bd:serviceParam wikibase:language "en" .
            wd:{wikidata_id} schema:description ?description .
        }}
    }}
    '''
    bindings = _run_wikidata_query(query)
    description = bindings[0].get("description", {}).get("value") if bindings else None
    return {"description": description}


def get_father_item(wikidata_id):
    """List the items that ``wikidata_id`` is linked to through property P31.

    Args:
        wikidata_id: Wikidata item id such as ``"Q14772"``.

    Returns:
        A list with one dict per result row, holding ``wikidata_id``, ``name``
        (English label), ``wikipedia_url`` (English Wikipedia article, or None)
        and ``description`` (or None). Empty when the query returns no rows.
    """
    query = f'''
    SELECT ?item ?itemLabel ?article ?description
    WHERE
    {{
    wd:{wikidata_id} wdt:P31 ?item.
    
    
    SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "en" .
        ?item rdfs:label ?itemLabel .
        ?item schema:description ?description .
    }}
    
    OPTIONAL {{
        ?article schema:about ?item .
        ?article schema:inLanguage "en" .
        ?article schema:isPartOf <https://en.wikipedia.org/> .
    }}
    }} 
    '''
    return [
        {
            "wikidata_id": row["item"]["value"].split("/")[-1],
            "name": row["itemLabel"]["value"],
            "wikipedia_url": row["article"]["value"] if "article" in row else None,
            "description": row["description"]["value"] if "description" in row else None,
        }
        for row in _run_wikidata_query(query)
    ]

In [17]:
# Sanity check: list the items that Q14772 points to through wdt:P31.
get_father_item("Q14772")

[{'wikidata_id': 'Q4830453',
  'name': 'business',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/Business',
  'description': 'organization undertaking commercial, industrial, or professional activity'},
 {'wikidata_id': 'Q891723',
  'name': 'public company',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/Public_company',
  'description': 'company that offers its securities for sale to the general public'},
 {'wikidata_id': 'Q4182287',
  'name': 'web search engine',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/Search_engine',
  'description': 'software system that is designed to search for information on the World Wide Web'},
 {'wikidata_id': 'Q6881511',
  'name': 'enterprise',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/Business_enterprise',
  'description': 'organizational unit producing goods or services, which benefits from a certain degree of autonomy in decision-making, especially for the allocation of its current resources'}]

In [18]:
# Graph accumulators filled by the loop below.
#   nodes         : wikidata_id -> organization record
#   edges         : {(seller_wikidata_id, buyer_wikidata_id)}
#   category_node : category name -> category record
#   category_edge : {(organization_wikidata_id, category_wikidata_id)}
nodes = {}
edges = set()

category_node = {}
category_edge = set()


def _load_seller_buyers(raw_answer):
    """Return the "seller,buyer" strings contained in the extractor's answer.

    The extractor returns either the JSON arguments of the function call
    (``{"seller-buyer": [...]}``) or, when the model answered in plain text,
    one "seller,buyer" pair per line as the prompt requests. Both are accepted.
    """
    try:
        parsed = json.loads(raw_answer)
    except (TypeError, ValueError):
        parsed = None

    if isinstance(parsed, dict):
        pairs = parsed.get('seller-buyer', [])
    else:
        pairs = raw_answer or ""
    if isinstance(pairs, str):
        pairs = pairs.splitlines()
    return [pair for pair in pairs if isinstance(pair, str) and pair.strip()]


for record in data:
    # Per-record scratch state. `selected_nodes` and `seller_buyers` keep their
    # names because later cells display them.
    temp_nodes = {}
    selected_nodes = set()

    txt = record["text"]

    seller_buyers = _load_seller_buyers(extract_seller_buyer_from_text(txt))

    # Keep only organizations that the NER step could link to Wikipedia.
    ner = analyze_text_entities(txt)
    for entity in ner.entities:
        if entity.type_.name != "ORGANIZATION":
            continue
        wikipedia_url = entity.metadata.get("wikipedia_url", "")
        if wikipedia_url == "":
            continue
        try:
            wikidata_id = get_wikidata_id(wikipedia_url)["wikidata_id"]
        except IndexError:
            # No Wikidata item behind this URL: the entity cannot become a
            # node, but that must not abort the remaining records.
            print ("no wikidata item for", wikipedia_url)
            continue
        temp_nodes[entity.name] = {
            "name": entity.name,
            "type": entity.type_.name,
            "wikipedia_url": wikipedia_url,
            "wikidata_id": wikidata_id,
        }

    print ("seller_buyers", seller_buyers)

    for seller_buyer in seller_buyers:
        # Tolerate "seller, buyer" spacing and skip answers without a comma.
        parts = [part.strip() for part in seller_buyer.split(",")]
        if len(parts) < 2:
            continue
        seller, buyer = parts[0], parts[1]

        # An edge is kept only when both ends are linked organizations.
        if seller in temp_nodes and buyer in temp_nodes:
            edges.add((temp_nodes[seller]['wikidata_id'], temp_nodes[buyer]['wikidata_id']))
            selected_nodes.update((seller, buyer))

    print ("selected_nodes", selected_nodes)
    for node in selected_nodes:
        wikidata_id = temp_nodes[node]['wikidata_id']

        # Already stored from an earlier record or spelling: the description
        # and category lookups would only repeat the same network calls.
        if wikidata_id in nodes:
            continue

        # Descriptions and categories are fetched only for organizations that
        # actually take part in an edge.
        description = get_wikidata_description(wikidata_id)["description"]
        fathers = get_father_item(wikidata_id)

        nodes[wikidata_id] = {**temp_nodes[node], "description": description}

        for father in fathers:
            # Categories without an English Wikipedia article are ignored.
            if father["wikipedia_url"] is not None:
                category_node[father["name"]] = {"wikipedia_url": father["wikipedia_url"], "wikidata_id": father["wikidata_id"], "description": father["description"]}
                category_edge.add((wikidata_id, father["wikidata_id"]))



seller_buyers ['US chipmaker,Nvidia', 'Nvidia,Baidu', 'Nvidia,ByteDance', 'Nvidia,Tencent', 'Nvidia,Alibaba']
selected_nodes {'Baidu', 'ByteDance', 'Alibaba', 'Nvidia', 'Tencent'}
seller_buyers ['NVIDIA,Microsoft']
selected_nodes {'NVIDIA', 'Microsoft'}
seller_buyers ['AMD,Microsoft']
selected_nodes {'Microsoft', 'AMD'}


In [19]:
# NOTE(review): repeats the get_father_item("Q14772") check from an earlier cell;
# Q14772 is the id stored for Baidu in `nodes`.
get_father_item("Q14772")

[{'wikidata_id': 'Q4830453',
  'name': 'business',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/Business',
  'description': 'organization undertaking commercial, industrial, or professional activity'},
 {'wikidata_id': 'Q891723',
  'name': 'public company',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/Public_company',
  'description': 'company that offers its securities for sale to the general public'},
 {'wikidata_id': 'Q4182287',
  'name': 'web search engine',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/Search_engine',
  'description': 'software system that is designed to search for information on the World Wide Web'},
 {'wikidata_id': 'Q6881511',
  'name': 'enterprise',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/Business_enterprise',
  'description': 'organizational unit producing goods or services, which benefits from a certain degree of autonomy in decision-making, especially for the allocation of its current resources'}]

In [20]:
# Inspect the organization records collected by the extraction loop, keyed by Wikidata id.
nodes

{'Q14772': {'name': 'Baidu',
  'type': 'ORGANIZATION',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/Baidu',
  'wikidata_id': 'Q14772',
  'description': 'Chinese web services company'},
 'Q55606242': {'name': 'ByteDance',
  'type': 'ORGANIZATION',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/ByteDance',
  'wikidata_id': 'Q55606242',
  'description': 'Internet technology company in China'},
 'Q1359568': {'name': 'Alibaba',
  'type': 'ORGANIZATION',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/Alibaba_Group',
  'wikidata_id': 'Q1359568',
  'description': 'Chinese multinational technology company'},
 'Q182477': {'name': 'Nvidia',
  'type': 'ORGANIZATION',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/Nvidia',
  'wikidata_id': 'Q182477',
  'description': 'American multinational technology company'},
 'Q860580': {'name': 'Tencent',
  'type': 'ORGANIZATION',
  'wikipedia_url': 'https://en.wikipedia.org/wiki/Tencent',
  'wikidata_id': 'Q860580',
  'description': 'Chinese mul

In [21]:
# Inspect the (seller Wikidata id, buyer Wikidata id) pairs collected by the extraction loop.
edges

{('Q128896', 'Q2283'),
 ('Q182477', 'Q1359568'),
 ('Q182477', 'Q14772'),
 ('Q182477', 'Q2283'),
 ('Q182477', 'Q55606242'),
 ('Q182477', 'Q860580')}

In [22]:
# `selected_nodes` is re-created for every record inside the extraction loop,
# so this shows the names matched in the last record only.
selected_nodes

{'AMD', 'Microsoft'}

In [23]:
# `seller_buyers` is reassigned for every record inside the extraction loop,
# so this shows the pairs extracted from the last record only.
seller_buyers

['AMD,Microsoft']

In [24]:


# Write the seller -> buyer edges to neo4j/edge_sells_to.tsv (tab-separated,
# header "from" / "to").
# Rows are sorted so the file is identical across runs (set iteration order is
# not), and `with` closes the file even when the write fails.
os.makedirs("neo4j", exist_ok=True)

content = "from\tto\n" + "".join(
    f"{seller_id}\t{buyer_id}\n" for seller_id, buyer_id in sorted(edges)
)

with open("neo4j/edge_sells_to.tsv", 'w', encoding="utf-8") as output_file:
    output_file.write(content)



In [25]:
# Write the organization nodes to neo4j/node_organization.tsv (tab-separated).
# A missing value is written as an empty field rather than the text "None",
# and tabs/newlines inside a value are replaced by spaces so that one record
# always stays on one line with five columns.
os.makedirs("neo4j", exist_ok=True)

rows = ["wikidata_id\tname\ttype\twikipedia_url\tdescription"]
for organization in nodes.values():
    fields = (
        organization['wikidata_id'],
        organization['name'],
        organization['type'],
        organization['wikipedia_url'],
        organization['description'],
    )
    rows.append("\t".join(
        "" if field is None else str(field).replace("\t", " ").replace("\n", " ")
        for field in fields
    ))
content = "\n".join(rows) + "\n"

# `with` closes the file even when the write fails.
with open("neo4j/node_organization.tsv", 'w', encoding="utf-8") as output_file:
    output_file.write(content)

[{'wikidata_id': 'Q341',
  'name': 'free software',
  'wiki_url': 'https://en.wikipedia.org/wiki/Free_software',
  'description': 'software distributed under terms that allow users to freely run, study, change and distribute it and modified versions'},
 {'wikidata_id': 'Q218616',
  'name': 'proprietary software',
  'wiki_url': 'https://en.wikipedia.org/wiki/Proprietary_software',
  'description': 'computer software released under a license restricting use, study or redistribution'},
 {'wikidata_id': 'Q595971',
  'name': 'graph database',
  'wiki_url': 'https://en.wikipedia.org/wiki/Graph_database',
  'description': 'database which utilizes structure of discrete mathematical graphs to store and search data'},
 {'wikidata_id': 'Q99510066',
  'name': 'labeled property graph',
  'wiki_url': None,
  'description': 'area of research'}]

In [26]:
# Inspect the (organization Wikidata id, category Wikidata id) pairs collected by the extraction loop.
category_edge

{('Q128896', 'Q4830453'),
 ('Q128896', 'Q6881511'),
 ('Q128896', 'Q891723'),
 ('Q1359568', 'Q35127'),
 ('Q1359568', 'Q43229'),
 ('Q1359568', 'Q4830453'),
 ('Q1359568', 'Q6881511'),
 ('Q1359568', 'Q891723'),
 ('Q14772', 'Q4182287'),
 ('Q14772', 'Q4830453'),
 ('Q14772', 'Q6881511'),
 ('Q14772', 'Q891723'),
 ('Q182477', 'Q4830453'),
 ('Q182477', 'Q6881511'),
 ('Q182477', 'Q891723'),
 ('Q2283', 'Q1058914'),
 ('Q2283', 'Q18388277'),
 ('Q2283', 'Q4830453'),
 ('Q2283', 'Q891723'),
 ('Q55606242', 'Q1058914'),
 ('Q55606242', 'Q1589009'),
 ('Q55606242', 'Q18127'),
 ('Q55606242', 'Q4830453'),
 ('Q55606242', 'Q783794'),
 ('Q860580', 'Q219577'),
 ('Q860580', 'Q778575'),
 ('Q860580', 'Q891723')}

In [27]:
# Write the organization -> category edges to neo4j/edge_organization_is_a.tsv
# (tab-separated, header "from" / "to").
# Rows are sorted so the file is identical across runs (set iteration order is
# not), and `with` closes the file even when the write fails.
os.makedirs("neo4j", exist_ok=True)

content = "from\tto\n" + "".join(
    f"{organization_id}\t{category_id}\n"
    for organization_id, category_id in sorted(category_edge)
)

with open("neo4j/edge_organization_is_a.tsv", 'w', encoding="utf-8") as output_file:
    output_file.write(content)

In [28]:
# Inspect the category records collected by the extraction loop, keyed by category name.
category_node

{'business': {'wikipedia_url': 'https://en.wikipedia.org/wiki/Business',
  'wikidata_id': 'Q4830453',
  'description': 'organization undertaking commercial, industrial, or professional activity'},
 'public company': {'wikipedia_url': 'https://en.wikipedia.org/wiki/Public_company',
  'wikidata_id': 'Q891723',
  'description': 'company that offers its securities for sale to the general public'},
 'web search engine': {'wikipedia_url': 'https://en.wikipedia.org/wiki/Search_engine',
  'wikidata_id': 'Q4182287',
  'description': 'software system that is designed to search for information on the World Wide Web'},
 'enterprise': {'wikipedia_url': 'https://en.wikipedia.org/wiki/Business_enterprise',
  'wikidata_id': 'Q6881511',
  'description': 'organizational unit producing goods or services, which benefits from a certain degree of autonomy in decision-making, especially for the allocation of its current resources'},
 'company': {'wikipedia_url': 'https://en.wikipedia.org/wiki/Company',
  'wi

In [29]:
# Write the category nodes to neo4j/node_category.tsv (tab-separated).
# The category name (the dict key) is title-cased for the "name" column.
# A missing description is written as an empty field rather than the text
# "None", and tabs/newlines inside a value are replaced by spaces so that one
# record always stays on one line with four columns.
os.makedirs("neo4j", exist_ok=True)

rows = ["wikidata_id\tname\twikipedia_url\tdescription"]
for category_name, category in category_node.items():
    fields = (
        category['wikidata_id'],
        category_name.title(),
        category['wikipedia_url'],
        category['description'],
    )
    rows.append("\t".join(
        "" if field is None else str(field).replace("\t", " ").replace("\n", " ")
        for field in fields
    ))
content = "\n".join(rows) + "\n"

# `with` closes the file even when the write fails.
with open("neo4j/node_category.tsv", 'w', encoding="utf-8") as output_file:
    output_file.write(content)

In [30]:
# Bolt endpoint of the Neo4j instance. It defaults to a local database and can
# be overridden with an optional "neo4j_url" key in config.yaml.
url = PARAM.get("neo4j_url", "bolt://localhost:7687")

# Credentials come from config.yaml so they never appear in the notebook.
driver = GraphDatabase.driver(url, auth=(PARAM["neo4j_username"], PARAM["neo4j_password"]))


In [31]:
# Folder holding the TSV files written by the export cells above.
src = "neo4j"

home = str(Path.home())

# Import folder of the target DBMS. The default is where Neo4j Desktop keeps it
# on macOS; on other setups set an optional "neo4j_import_dir" key in
# config.yaml to the folder's full path.
dst = PARAM.get("neo4j_import_dir") or os.path.join(
    home, "Library", "Application Support", "Neo4j Desktop", "Application",
    "relate-data", "dbmss", PARAM["neo4j_project_str"], "import",
)

# Fail early with a readable message; otherwise the problem only shows up as a
# FileNotFoundError on the first copy.
if not os.path.isdir(dst):
    raise FileNotFoundError(f"Neo4j import folder not found: {dst}")

# Empty the import folder. Its entries are listed once up front, so nothing is
# deleted underneath a directory walk that is still in progress.
with os.scandir(dst) as listing:
    stale_entries = list(listing)
for entry in stale_entries:
    if entry.is_dir(follow_symlinks=False):
        shutil.rmtree(entry.path)
    else:
        os.unlink(entry.path)

# Copy every file found under `src` straight into the import folder
# (sub-folders of `src` are flattened).
for src_dir, dirs, files in os.walk(src):
    for file_ in files:
        shutil.copy(os.path.join(src_dir, file_), os.path.join(dst, file_))

In [32]:


# Remove every node and relationship so the import below starts from an empty graph.
with driver.session() as session:
    # consume() waits for the statement to finish and raises if it failed.
    # NOTE(review): an unconsumed auto-commit result appears to be discarded when
    # the session closes, which would hide a server-side error - confirm against
    # the driver version in use.
    session.run("MATCH (n) DETACH DELETE n").consume()



In [33]:
# Run the Cypher statements stored in neo4j_command.txt, one statement per
# non-blank line.
# The file is read inside `with` so its handle is closed, and all statements
# share one session instead of opening a new one per line.
with open("neo4j_command.txt", 'r', encoding="utf-8") as command_file:
    commands = [line.strip() for line in command_file if line.strip() != ""]

with driver.session() as session:
    for command in commands:
        # consume() waits for the statement to finish and raises if it failed,
        # so a broken import statement stops the notebook instead of passing
        # silently.
        session.run(command).consume()

In [35]:
# Sanity check: resolve the English Wikipedia URL for Nvidia to its Wikidata id.
get_wikidata_id("https://en.wikipedia.org/wiki/Nvidia")

{'wikidata_id': 'Q182477'}

In [38]:
# Try the entity analysis on a Japanese sentence and inspect the raw response.
analyze_text_entities("今般、第一号として、SMBC信用保証は株式会社沖縄海邦銀行（代表取締役頭取:新城一史）と保証業務提携契約を締結いたしました。")

entities {
  name: "SMBC信用保証"
  type_: OTHER
  salience: 0.264144033
  mentions {
    text {
      content: "SMBC信用保証"
      begin_offset: -1
    }
    type_: COMMON
  }
}
entities {
  name: "株式会社沖縄海邦銀行"
  type_: ORGANIZATION
  metadata {
    key: "wikipedia_url"
    value: "https://ja.wikipedia.org/wiki/沖縄海邦銀行"
  }
  metadata {
    key: "mid"
    value: "/g/1228rsjp"
  }
  salience: 0.204967543
  mentions {
    text {
      content: "株式会社沖縄海邦銀行"
      begin_offset: -1
    }
    type_: PROPER
  }
}
entities {
  name: "保証業務提携契約"
  type_: OTHER
  salience: 0.200605452
  mentions {
    text {
      content: "保証業務提携契約"
      begin_offset: -1
    }
    type_: COMMON
  }
}
entities {
  name: "代表取締役"
  type_: PERSON
  salience: 0.186825573
  mentions {
    text {
      content: "代表取締役"
      begin_offset: -1
    }
    type_: COMMON
  }
}
entities {
  name: "頭取"
  type_: PERSON
  salience: 0.143457398
  mentions {
    text {
      content: "頭取"
      begin_offset: -1
    }
    type_: COMMON
  