In [1]:
# LangChain supports many other chat models. Here, we're using Ollama
from langchain_community.chat_models import ChatOllama

from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate

from langchain_core.output_parsers import StrOutputParser

In [2]:
# # See the ChatOllama class for the latest supported parameters
# llm = ChatOllama(model="llama3")

# # Example
# prompt = ChatPromptTemplate.from_template("Tell me briefly what you know about {topic}")
# chain = prompt | llm | StrOutputParser()
# print(chain.invoke({"topic": "Venice of time 1741"}))

In [3]:
import re
import sys
from io import StringIO
import pandas as pd
from prompts import prompt_plan, prompt_code, prompt_debug, prompt_info

def read_questions(questions_path='data/matches.csv'):
    """Load the questions table from a CSV file.

    Args:
        questions_path: Location of the CSV, handed to ``pd.read_csv`` as-is.
            Defaults to ``'data/matches.csv'``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    return pd.read_csv(questions_path)

def get_llm(model='llama3', repeat_penalty=1.1, temperature=0.8, top_k=40, top_p=0.9):
    """Create a ``ChatOllama`` chat model.

    Every argument is forwarded to ``ChatOllama`` unchanged, under the keyword
    of the same name.

    Returns:
        The constructed ``ChatOllama`` instance.
    """
    sampling_options = dict(
        repeat_penalty=repeat_penalty,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )
    return ChatOllama(model=model, **sampling_options)

def _build_chain(template, input_variables, top_k):
    """Assemble a ``PromptTemplate | llm | StrOutputParser()`` chain.

    Shared by the ``get_*`` factories, which differ only in the template text
    and the input variable names they supply.

    Args:
        template: Prompt template string passed to ``PromptTemplate``.
        input_variables: Names of the variables declared for the template.
        top_k: Forwarded to ``get_llm``; all other model settings keep
            ``get_llm``'s defaults.

    Returns:
        The composed chain (prompt, then model, then string output parser).
    """
    llm = get_llm(top_k=top_k)
    prompt = PromptTemplate(template=template, input_variables=input_variables)
    return prompt | llm | StrOutputParser()

def get_planner(top_k=5):
    """Build the planning chain from ``prompt_plan``.

    Declared input variables: ``question``, ``entities_matches``.

    Args:
        top_k: Forwarded to ``get_llm``.

    Returns:
        A ``prompt | llm | StrOutputParser()`` chain.
    """
    return _build_chain(prompt_plan, ["question", "entities_matches"], top_k)

def get_debugger(top_k=5):
    """Build the debugging chain from ``prompt_debug``.

    Declared input variables: ``question``, ``entities_matches``, ``plan``,
    ``code``, ``error_message``.

    Args:
        top_k: Forwarded to ``get_llm``.

    Returns:
        A ``prompt | llm | StrOutputParser()`` chain.
    """
    return _build_chain(
        prompt_debug,
        ["question", "entities_matches", "plan", "code", "error_message"],
        top_k,
    )

def get_info(top_k=5):
    """Build the info chain from ``prompt_info``.

    Declared input variables: ``question``, ``code``, ``output``.

    Args:
        top_k: Forwarded to ``get_llm``.

    Returns:
        A ``prompt | llm | StrOutputParser()`` chain.
    """
    return _build_chain(prompt_info, ["question", "code", "output"], top_k)

def get_coder(top_k=5):
    """Build the code-writing chain from ``prompt_code``.

    Declared input variables: ``answer_type``, ``question``,
    ``entities_matches``, ``plan``.

    Args:
        top_k: Forwarded to ``get_llm``.

    Returns:
        A ``prompt | llm | StrOutputParser()`` chain.
    """
    return _build_chain(
        prompt_code,
        ['answer_type', "question", "entities_matches", "plan"],
        top_k,
    )

def extract_python_code(text):
    """Collect the contents of every triple-backtick fenced block in ``text``.

    A ``python`` language tag directly after the opening fence (any letter
    case, optionally preceded by spaces) is dropped. The tag is only treated as
    a tag when whitespace follows it, so an untagged block whose code merely
    starts with the letters "python" (e.g. ``python_version = 3``) is kept
    intact.

    Args:
        text: Text that may contain fenced code blocks.

    Returns:
        The stripped blocks joined by a blank line, or ``''`` when ``text``
        contains no closed, non-empty fenced block.
    """
    fence = re.compile(r'```[ \t]*(?:python(?=\s))?(.*?)```', re.DOTALL | re.IGNORECASE)

    # Strip each block on its own so the separator between blocks is always
    # exactly one blank line, and drop blocks that contain nothing.
    code_blocks = [block.strip() for block in fence.findall(text)]
    return '\n\n'.join(block for block in code_blocks if block).strip()

def execute_code(code):
    """Run a string of Python source and return what it printed.

    The source is executed with ``exec`` in a namespace made from a copy of
    the module globals overlaid with a copy of this function's locals at that
    point. Names bound at the top level of the executed code therefore land in
    that copy, not in the real module globals.

    Args:
        code: Python source text to execute.

    Returns:
        Everything written to ``sys.stdout`` while the code ran, with leading
        and trailing whitespace stripped.

    Raises:
        Any exception raised by the executed code; nothing is caught here.
        ``sys.stdout`` is restored before the exception propagates.
    """
    global_namespace = globals().copy()
    # At this point locals() holds `code` and `global_namespace`, so both names
    # are also visible to the executed code.
    local_namespace = locals().copy()
    combined_namespace = {**global_namespace, **local_namespace}
    
    # Swap sys.stdout for an in-memory buffer so print() output can be captured
    stdout_orig = sys.stdout
    sys.stdout = StringIO()

    try:
        # NOTE(review): exec runs `code` with the full privileges of this
        # process; in this notebook the source comes from model output.
        # The text of the next line is matched verbatim by the notebook cell
        # that trims tracebacks, so keep the two in sync if it is edited.
        exec(code, combined_namespace)

        # Read back what the executed code printed
        output = sys.stdout.getvalue()
        return output.strip()
    finally:
        # Always put the real stdout back, even when the executed code raised
        sys.stdout = stdout_orig
        
def extract_content(text):
    """Return the text inside the first ``[[ ... ]]`` pair found in ``text``.

    The match is non-greedy and does not span line breaks.

    Args:
        text: String to search.

    Returns:
        The enclosed text as a string, or ``None`` when no pair is found.
    """
    found = re.search(r'\[\[(.*?)\]\]', text)
    return found.group(1) if found else None

In [19]:
# Load 'out/out_code_1.csv' and keep only the rows whose `error_message` is missing
questions = (
    read_questions(questions_path='out/out_code_1.csv')
    .loc[lambda frame: frame['error_message'].isna()]
)

# Display the second remaining row
question_sample = questions.iloc[1]
question_sample

question             Which square has the highest density of tenant...
category                                                       spatial
answer_format                                                   entity
n_matches                                                            1
column_mappings      [('square', 'Landmark_Type', 3), ('tenants', '...
in_columns                                  [True, True, False, False]
phrase_matches       [{'square': {'dataset': 'data/landmarks.csv', ...
references           [{'location': {'dataset': 'data/buildings_1740...
n_matches_predict                                                    2
code                 import pandas as pd\nfrom math import radians,...
plan                 Here is the step-by-step plan to answer the qu...
output                                        campiello della malvasia
error_message                                                      NaN
Name: 2, dtype: object

In [4]:
# Hand-written fixture for the info chain: a question, the code that was
# produced for it, and the text that code printed.
question = """Which square has the larges number of commercial buildings within 100 meters in the dataset in 1740?"""

# Source text only; nothing in this cell executes it.
code = """import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians

# Load the datasets
buildings_1740 = pd.read_csv('data/buildings_1740.csv')
landmarks = pd.read_csv('data/landmarks.csv')

# Step 1: Filter the buildings dataset to only include commercial buildings
commercial_buildings = buildings_1740[buildings_1740['building_functions'].str.contains("calle, magazzeno")]

# Step 2: Calculate the distance from each commercial building to the nearest square
def haversine(lat1, lon1, lat2, lon2):
    R = 6371.0
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat/2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon/2)**2
    c = 2 * np.arctan2(sqrt(a), sqrt(1-a))
    distance = R * c
    return distance

distances = []
for index, row in commercial_buildings.iterrows():
    min_distance = float('inf')
    nearest_square_name = None
    for index_landmark, landmark_row in landmarks.iterrows():
        if landmark_row['landmark_type'] == 'square':
            distance = haversine(row['latitude'], row['longitude'], landmark_row['latitude'], landmark_row['longitude'])
            if distance < min_distance:
                min_distance = distance
                nearest_square_name = landmark_row['landmark_name']
    distances.append([row['location'], min_distance, nearest_square_name])

# Step 3: Filter the data to only include commercial buildings within 100 meters from a square
commercial_buildings_within_100m = [row for row in distances if row[1] <= 0.1]

# Step 4: Group and count the number of commercial buildings for each square
square_counts = {}
for row in commercial_buildings_within_100m:
    if row[2] not in square_counts:
        square_counts[row[2]] = 1
    else:
        square_counts[row[2]] += 1

# Step 5: Identify the square with the largest number of commercial buildings
square_with_most_commercial_buildings = max(square_counts, key=square_counts.get)

print(f"The answer is: [[{square_with_most_commercial_buildings}]]")"""

# The answer is wrapped in [[ ]], the delimiter that extract_content looks for.
output = """The answer is: [[campo san giacomo]]"""

In [9]:
# Build the info chain with top_k=1 and run it on the sample question,
# its code (wrapped in a python fence) and its output
infoer = get_info(top_k=1)

info = infoer.invoke(dict(
    question=question,
    code=f"```python\n{code}\n```",
    output=output,
))

print(info)

Here's the modified code to print the number of rows of the final dataset:

```Python
import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians

# Load the datasets
buildings_1740 = pd.read_csv('data/buildings_1740.csv')
landmarks = pd.read_csv('data/landmarks.csv')

# Step 1: Filter the buildings dataset to only include commercial buildings
commercial_buildings = buildings_1740[buildings_1740['building_functions'].str.contains("calle, magazzeno")]

# Step 2: Calculate the distance from each commercial building to the nearest square
def haversine(lat1, lon1, lat2, lon2):
    R = 6371.0
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat/2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon/2)**2
    c = 2 * np.arctan2(sqrt(a), sqrt(1-a))
    distance = R * c
    return distance

distances = []
for index, row in commercial_buildings.iterrows():
    min_distance = float('inf')
    nearest_square_name = None
    for index_land

In [6]:
# Pull the fenced source out of the model's reply and show it
info_code = extract_python_code(text=info)
print(info_code)

import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians

# Load the datasets
buildings_1740 = pd.read_csv('data/buildings_1740.csv')
landmarks = pd.read_csv('data/landmarks.csv')

# Step 1: Filter the buildings dataset to only include commercial buildings
commercial_buildings = buildings_1740[buildings_1740['building_functions'].str.contains("calle, magazzeno")]

# Step 2: Calculate the distance from each commercial building to the nearest square
def haversine(lat1, lon1, lat2, lon2):
    R = 6371.0
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat/2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon/2)**2
    c = 2 * np.arctan2(sqrt(a), sqrt(1-a))
    distance = R * c
    return distance

distances = []
for index, row in commercial_buildings.iterrows():
    min_distance = float('inf')
    nearest_square_name = None
    for index_landmark, landmark_row in landmarks.iterrows():
        if landmark_row['landmark_type'] =

In [7]:
import traceback
try:
    output = execute_code(info_code)
    print(output)
    print("-"*20)
    print(extract_content(output))
except Exception:
    error_message = traceback.format_exc()
    # Show only the part of the traceback after the exec() line inside
    # execute_code. With maxsplit=1 and [-1], a traceback that does not contain
    # that line is printed in full instead of raising IndexError here and
    # hiding the real error.
    print(error_message.split('exec(code, combined_namespace)', 1)[-1])

The answer is: [[campo san giacomo]]
The number of rows used is: [[1]]
--------------------
campo san giacomo


In [25]:
# Show the text inside the first [[ ... ]] pair of whatever `output` currently holds
extract_content(output)

'1340699'

In [12]:
# Scratch: rebinds `output` to a throwaway value; cells run after this one see 'haha'
output = 'haha'

In [13]:
# Snapshot `output` into `code_o` once. An existing snapshot is kept, so
# re-running this cell after `output` has changed does not overwrite it, and
# the cell no longer fails with NameError on a fresh kernel.
if 'code_o' not in globals():
    code_o = output
print(code_o)

The answer is: [[campo san giacomo]]
The number of rows used is: [[1]]
