In [1]:
# User-Agent string for Wikidata requests; it is the default value of the
# `user_agent_header` parameter of vocab_lookup and run_sparql below.
wikidata_user_agent_header = "DiegoBot /0.0"

In [2]:
def get_nested_value(o: dict, path: list) -> any:
    """Follow ``path`` through nested containers and return the value found.

    Args:
        o: Outermost container (a dict, list, or any subscriptable object).
        path: Keys and/or indexes to apply one after another, outermost first.
            An empty path returns ``o`` itself.

    Returns:
        The value reached at the end of ``path``, or ``None`` when any step
        cannot be resolved (missing key, index out of range, or an
        intermediate value that does not support that kind of subscript).
    """
    current = o
    for key in path:
        try:
            current = current[key]
        except (KeyError, IndexError, TypeError):
            # Only a failed lookup means "not found". Anything else
            # (KeyboardInterrupt, bugs in a custom __getitem__, ...) must
            # propagate instead of being silently turned into None.
            return None
    return current

import requests

from typing import Optional

def vocab_lookup(search: str, entity_type: str = "item",
                 url: str = "https://www.wikidata.org/w/api.php",
                 user_agent_header: str = wikidata_user_agent_header,
                 srqiprofile: Optional[str] = None,
                ) -> Optional[str]:
    """Search a MediaWiki/Wikibase API and return the id of the best match.

    Args:
        search: Free-text search string.
        entity_type: ``"item"`` (searches namespace 0) or ``"property"``
            (searches namespace 120).
        url: Endpoint of the MediaWiki action API to query.
        user_agent_header: Value sent as the ``User-Agent`` request header.
            When ``None`` this function does not set the header.
        srqiprofile: Value for the API's ``srqiprofile`` parameter. When
            ``None``, ``"classic_noboostlinks"`` is used for items and
            ``"classic"`` for properties.

    Returns:
        On a 200 response with at least one hit, the title of the first hit
        with any ``Prefix:`` part removed (e.g. ``"Q4180017"``). Otherwise a
        human-readable message string: one for "nothing found", another for a
        non-200 status. Failures are reported as text rather than raised.

    Raises:
        ValueError: If ``entity_type`` is neither ``"item"`` nor ``"property"``.
    """
    headers = {
        'Accept': 'application/json'
    }
    # Use the caller-supplied parameter. The previous code read the module
    # global here, so any header passed by the caller was silently ignored.
    if user_agent_header is not None:
        headers['User-Agent'] = user_agent_header

    if entity_type == "item":
        srnamespace = 0
        srqiprofile = "classic_noboostlinks" if srqiprofile is None else srqiprofile
    elif entity_type == "property":
        srnamespace = 120
        srqiprofile = "classic" if srqiprofile is None else srqiprofile
    else:
        raise ValueError("entity_type must be either 'property' or 'item'")

    params = {
        "action": "query",
        "list": "search",
        "srsearch": search,
        "srnamespace": srnamespace,
        "srlimit": 1,  # only the top-ranked hit is used
        "srqiprofile": srqiprofile,
        "srwhat": 'text',
        "format": "json"
    }

    response = requests.get(url, headers=headers, params=params)

    if response.status_code == 200:
        title = get_nested_value(response.json(), ['query', 'search', 0, 'title'])
        if title is None:
            return f"I couldn't find any {entity_type} for '{search}'. Please rephrase your request and try again"
        # if there is a prefix (e.g. "Property:P54"), strip it off
        return title.split(':')[-1]
    else:
        return "Sorry, I got an error. Please try again."

In [3]:
# Smoke test: look up "Malin 1" with the default entity_type ("item").
print(vocab_lookup("Malin 1"))

Q4180017


In [4]:
import requests
from typing import List, Dict, Any
import json

def run_sparql(query: str, url='https://query.wikidata.org/sparql',
               user_agent_header: str = wikidata_user_agent_header) -> str:
    """Run a SPARQL query over HTTP GET and return the result bindings as JSON text.

    Args:
        query: SPARQL query text, sent as the ``query`` request parameter.
        url: SPARQL endpoint to call.
        user_agent_header: Value sent as the ``User-Agent`` request header.
            When ``None`` this function does not set the header.

    Returns:
        A string in every case. On a 200 response it is the JSON serialization
        of ``results.bindings`` from the response body (``"null"`` if that
        path is absent). On any other status it is a human-readable failure
        message, so the caller always receives text rather than an exception
        for a bad status.
    """
    headers = {
        'Accept': 'application/json'
    }
    # Use the caller-supplied parameter. The previous code read the module
    # global here, so any header passed by the caller was silently ignored.
    if user_agent_header is not None:
        headers['User-Agent'] = user_agent_header

    response = requests.get(url, headers=headers, params={'query': query, 'format': 'json'})

    if response.status_code != 200:
        return "That query failed. Perhaps you could try a different one?"
    results = get_nested_value(response.json(),['results', 'bindings'])
    return json.dumps(results)

In [5]:
# Sanity check: count the wdt:P40 values of wd:Q1339. Q1339 is the id the agent
# run further down resolves for "J.S. Bach"; P40 is presumably the "child"
# property — verify on Wikidata.
run_sparql("SELECT (COUNT(?children) as ?count) WHERE { wd:Q1339 wdt:P40 ?children . }")


'[{"count": {"datatype": "http://www.w3.org/2001/XMLSchema#integer", "type": "literal", "value": "20"}}]'

In [6]:


from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser
from langchain.prompts import StringPromptTemplate
from langchain import OpenAI, LLMChain
from typing import List, Union
from langchain.schema import AgentAction, AgentFinish
import re



In [7]:


# Define which tools the agent can use to answer user queries
tools = [
    Tool(name=tool_name, func=tool_func, description=tool_description)
    # Each spec is (name shown to the agent, callable to invoke, description).
    for tool_name, tool_func, tool_description in (
        (
            "ItemLookup",
            lambda x: vocab_lookup(x, entity_type="item"),
            "useful for when you need to know the q-number for an item",
        ),
        (
            "PropertyLookup",
            lambda x: vocab_lookup(x, entity_type="property"),
            "useful for when you need to know the p-number for a property",
        ),
        (
            "SparqlQueryRunner",
            run_sparql,
            "useful for getting results from a wikibase",
        ),
    )
]



In [8]:


# Base prompt text for the agent. It is rendered with str.format in
# CustomPromptTemplate.format, which fills {tools}, {tool_names} and
# {agent_scratchpad}; {input} is the user's question. Because str.format is
# used, any literal brace added to this text would have to be doubled.
template = """
Answer the following questions by running a sparql query against a wikibase where the p and q items are 
completely unknown to you. You will need to discover the p and q items before you can generate the sparql.
Do not assume you know the p and q items for any concepts. Always use tools to find all p and q items.
After you generate the sparql, you should run it. The results will be returned in json. 
Summarize the json results in natural language.

You may assume the following prefixes:
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>

When generating sparql:
* Try to avoid "count" and "filter" queries if possible
* Never enclose the sparql in back-quotes

You have access to the following tools:

{tools}

Use the following format:

Question: the input question for which you must provide a natural language answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Question: {input}
{agent_scratchpad}"""



In [9]:
# Set up a prompt template
class CustomPromptTemplate(StringPromptTemplate):
    """Prompt template that injects the tool list and prior agent steps into ``template``."""

    # Raw prompt text containing str.format placeholders
    template: str
    # Tools whose names and descriptions are rendered into the prompt
    tools: List[Tool]

    def format(self, **kwargs) -> str:
        """Render the prompt for the next LLM call.

        ``kwargs`` must contain ``intermediate_steps`` (an iterable of
        (action, observation) pairs); it is removed and replaced by the
        generated ``agent_scratchpad``, ``tools`` and ``tool_names`` values
        before the remaining kwargs are applied to ``self.template``.
        """
        steps = kwargs.pop("intermediate_steps")

        # Replay every earlier step as "<action log> / Observation / Thought: ".
        scratchpad = "".join(
            action.log + f"\nObservation: {observation}\nThought: "
            for action, observation in steps
        )

        tool_lines = [f"{tool.name}: {tool.description}" for tool in self.tools]
        names = [tool.name for tool in self.tools]

        kwargs.update(
            agent_scratchpad=scratchpad,
            tools="\n".join(tool_lines),
            tool_names=", ".join(names),
        )
        return self.template.format(**kwargs)

In [10]:


# Bind the base template and the tool list into the prompt object that the
# LLM chain below is built from.
prompt = CustomPromptTemplate(
    template=template,
    tools=tools,
    # `agent_scratchpad`, `tools`, and `tool_names` are omitted because
    # CustomPromptTemplate.format generates them on every call.
    # `intermediate_steps` is listed because format() pops it from its kwargs.
    input_variables=["input", "intermediate_steps"]
)



In [11]:
class CustomOutputParser(AgentOutputParser):
    """Turn raw LLM text into either the agent's next action or its final answer."""

    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        """Parse one LLM completion.

        Returns an ``AgentFinish`` when the text contains "Final Answer:",
        otherwise an ``AgentAction`` built from the "Action:" and
        "Action Input:" fields. Raises ``ValueError`` when neither form
        is present.
        """
        final_marker = "Final Answer:"
        if final_marker in llm_output:
            # Everything after the last marker is the answer; the return
            # values dictionary carries it under the single `output` key.
            answer = llm_output.split(final_marker)[-1].strip()
            return AgentFinish(return_values={"output": answer}, log=llm_output)

        # DOTALL lets the action input span several lines (e.g. a SPARQL query).
        found = re.search(r"Action: (.*?)[\n]*Action Input:[\s]*(.*)", llm_output, re.DOTALL)
        if found is None:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")

        tool_name, raw_input = found.group(1, 2)
        return AgentAction(
            tool=tool_name.strip(),
            # Trim surrounding spaces first, then any wrapping double quotes.
            tool_input=raw_input.strip(" ").strip('"'),
            log=llm_output,
        )


In [12]:
# Parser instance that is handed to the agent below.
output_parser = CustomOutputParser()

In [14]:
from langchain.chat_models import ChatOpenAI
# Chat model used by the LLM chain below (model "gpt-4", temperature 0).
llm = ChatOpenAI(model_name="gpt-4", temperature=0)

In [15]:


# LLM chain pairing the chat model with the custom prompt; the agent below
# calls this chain to produce each step.
llm_chain = LLMChain(llm=llm, prompt=prompt)



In [16]:


# Names of the registered tools; passed to the agent as its allowed tools.
tool_names = [tool.name for tool in tools]
agent = LLMSingleActionAgent(
    llm_chain=llm_chain, 
    output_parser=output_parser,
    # Stop sequence for the LLM call — presumably cuts generation before the
    # model invents its own "Observation:" line, since the real observation is
    # appended by CustomPromptTemplate.format; confirm against LangChain docs.
    stop=["\nObservation:"], 
    allowed_tools=tool_names
)



In [17]:

# Executor that runs the agent with the tools defined above; verbose=True is
# what produces the step-by-step traces recorded in the outputs below.
agent_executor = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, verbose=True)



In [18]:


# Example 1: the recorded trace shows one item lookup, one property lookup,
# and a single SPARQL query before the final answer.
agent_executor.run("How many children did J.S. Bach have?")





[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find the q-number for J.S. Bach.
Action: ItemLookup
Action Input: J.S. Bach[0m

Observation:[36;1m[1;3mQ1339[0m
[32;1m[1;3mI need to find the p-number for "number of children."
Action: PropertyLookup
Action Input: number of children[0m

Observation:[33;1m[1;3mP1971[0m
[32;1m[1;3mI can now run a sparql query to find the number of children J.S. Bach had.
Action: SparqlQueryRunner
Action Input: SELECT ?children WHERE { wd:Q1339 wdt:P1971 ?children }[0m

Observation:[38;5;200m[1;3m[{"children": {"datatype": "http://www.w3.org/2001/XMLSchema#decimal", "type": "literal", "value": "20"}}][0m
[32;1m[1;3mI now know the final answer.
Final Answer: J.S. Bach had 20 children.[0m

[1m> Finished chain.[0m


'J.S. Bach had 20 children.'

In [19]:
# Example 2: an identifier lookup, again resolved with one item lookup, one
# property lookup, and one SPARQL query in the recorded trace.
agent_executor.run("What is the Basketball-Reference.com NBA player ID of Hakeem Olajuwon?")



[1m> Entering new AgentExecutor chain...[0m


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 3f23368fa7bcf33e9a80b753f09a2377 in your message.).


[32;1m[1;3mThought: I need to find the Q number for Hakeem Olajuwon.
Action: ItemLookup
Action Input: Hakeem Olajuwon[0m

Observation:[36;1m[1;3mQ273256[0m
[32;1m[1;3mI need to find the P number for Basketball-Reference.com NBA player ID.
Action: PropertyLookup
Action Input: Basketball-Reference.com NBA player ID[0m

Observation:[33;1m[1;3mP2685[0m
[32;1m[1;3mI have the Q number for Hakeem Olajuwon and the P number for Basketball-Reference.com NBA player ID. Now I can create and run a SPARQL query to find the ID.
Action: SparqlQueryRunner
Action Input: 
SELECT ?id WHERE {
  wd:Q273256 wdt:P2685 ?id .
}[0m

Observation:[38;5;200m[1;3m[{"id": {"type": "literal", "value": "o/olajuha01"}}][0m
[32;1m[1;3mI now know the final answer.
Final Answer: The Basketball-Reference.com NBA player ID of Hakeem Olajuwon is "o/olajuha01".[0m

[1m> Finished chain.[0m


'The Basketball-Reference.com NBA player ID of Hakeem Olajuwon is "o/olajuha01".'

In [20]:
# Example 3. NOTE: "Romelo" is a misspelling of "Romelu"; the recorded run
# shows the first ItemLookup failing and the agent retrying with the
# corrected name.
agent_executor.run("In what football teams did Romelo Lukaku play?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find the q-number for Romelo Lukaku.
Action: ItemLookup
Action Input: Romelo Lukaku[0m

Observation:[36;1m[1;3mI couldn't find any item for 'Romelo Lukaku'. Please rephrase your request and try again[0m
[32;1m[1;3mI made a typo in the name. It should be Romelu Lukaku.
Action: ItemLookup
Action Input: Romelu Lukaku[0m

Observation:[36;1m[1;3mQ313316[0m
[32;1m[1;3mI need to find the p-number for "member of sports team."
Action: PropertyLookup
Action Input: member of sports team[0m

Observation:[33;1m[1;3mP54[0m
[32;1m[1;3mI have the q-number for Romelu Lukaku (Q313316) and the p-number for "member of sports team" (P54). Now I can create a SPARQL query to find the football teams he played for.
Action: SparqlQueryRunner
Action Input: 
SELECT ?team ?teamLabel WHERE {
  wd:Q313316 wdt:P54 ?team .
  ?team wdt:P31 wd:Q476028 .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}

'Romelu Lukaku played for the following football teams: Inter Milan, Everton F.C., Chelsea F.C., Manchester United F.C., West Bromwich Albion F.C., and R.S.C. Anderlecht.'

In [21]:
# Example 4. NOTE: "Johnathan" is misspelled in the question; in the recorded
# run the agent looks up "Jonathan Banks" instead. That run's query returned
# no rows — presumably because it puts the actor as the subject of wdt:P161,
# whereas "cast member" is likely stated on the film; verify before trusting
# the recorded answer.
agent_executor.run("In what movies did Johnathan Banks play?")



[1m> Entering new AgentExecutor chain...[0m


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID c9ed47aa9c3caa05a96a97854063bc7b in your message.).


[32;1m[1;3mThought: I need to find the q-number for Jonathan Banks.
Action: ItemLookup
Action Input: Jonathan Banks[0m

Observation:[36;1m[1;3mQ198638[0m
[32;1m[1;3mI need to find the p-number for "movies" or "films".
Action: PropertyLookup
Action Input: film[0m

Observation:[33;1m[1;3mP2529[0m
[32;1m[1;3mI need to find the p-number for "acted in" or "cast member".
Action: PropertyLookup
Action Input: cast member[0m

Observation:[33;1m[1;3mP161[0m
[32;1m[1;3mI have the q-number for Jonathan Banks and the p-numbers for "film" and "cast member". Now I can run a sparql query to find the movies he acted in.
Action: SparqlQueryRunner
Action Input: 
SELECT ?movie ?movieLabel WHERE {
  wd:Q198638 wdt:P161 ?movie .
  ?movie wdt:P31 wd:Q11424 .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}[0m

Observation:[38;5;200m[1;3m[][0m
[32;1m[1;3mI now know the final answer.
Final Answer: Jonathan Banks did not act in any movies according to the data ava

'Jonathan Banks did not act in any movies according to the data available in Wikidata.'