In [1]:
# ABM imports
import copy
import json
import os
import re
import uuid

import geopandas as gpd
import mesa
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

## MODELS

### LLM

In [2]:
# language model imports

import torch
from transformers import pipeline
import networkx as nx
import openai

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Local instruction-tuned model used by generate_text() when use_openai=False.
# Fall back to the CPU when no CUDA device is present so the cell still runs.
language_model = pipeline(
    model="declare-lab/flan-alpaca-xl",
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

# The OpenAI key is read from a local file so it never appears in the notebook.
key_path = "openai_api_key"
#openai.organization = "org-D3T7qkglEsZGgYNCoTz3Uocx"
with open(key_path, "r") as f:
    # readline() keeps the trailing newline; strip it so the key is sent verbatim.
    api_key = f.readline().strip()
os.environ["OPENAI_API_KEY"] = api_key

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.26s/it]


In [4]:
openai.api_key = api_key

In [37]:
def generate_text(prompt_background, prompt_text, use_openai = False, model_version = "gpt-3.5-turbo", max_tokens=4096):
    """Generate a completion from either the local pipeline or the OpenAI chat API.

    Args:
        prompt_background: Context text; sent as the ``system`` message for OpenAI,
            or prepended to ``prompt_text`` for the local model.
        prompt_text: The actual request; sent as the ``user`` message for OpenAI.
        use_openai: When true, call ``openai.ChatCompletion``; otherwise use the
            module-level ``language_model`` pipeline.
        model_version: OpenAI model name (only used when ``use_openai`` is true).
        max_tokens: ``max_tokens`` passed to the OpenAI call.

    Returns:
        The generated text as a string.
    """
    if not use_openai:
        # The local model has no system/user roles, so both parts are merged.
        merged_prompt = prompt_background + prompt_text
        generations = language_model(
            merged_prompt,
            do_sample=True,
            min_length=10,
            # NOTE(review): len() counts characters, not tokens.
            max_length=len(merged_prompt) + 128,
        )
        return generations[0]["generated_text"]

    print("using openai "+model_version)
    chat_messages = [
        {"role": "system", "content": prompt_background},
        {"role": "user", "content": prompt_text},
    ]
    completion = openai.ChatCompletion.create(
        model=model_version,
        max_tokens=max_tokens,
        messages=chat_messages,
    )
    return completion.choices[0].message['content']

In [6]:
generate_text("You are a helpful assistant", "Hello, how are you", use_openai = True)

using openai gpt-3.5-turbo


"Hello! I'm just a virtual assistant, so I don't have feelings, but I'm here and ready to help you. How can I assist you today?"

### RCM SDM

In [7]:
# Point R_HOME at the R installation inside the active conda environment.
# It is set before rpy2 is imported below (presumably so rpy2 picks it up — verify).
# NOTE(review): the "\\Lib\\R" layout looks Windows-specific — confirm on other platforms.
os.environ["R_HOME"] = f"{os.environ['CONDA_PREFIX']}\\Lib\\R"
from rpy2 import robjects
from rpy2.robjects.packages import importr, data
# R packages used by the notebook, loaded through rpy2.
utils = importr('utils')
base = importr('base')
devtools = importr('devtools')
readsdr = importr('readsdr')
# Parse the Resilient Community Model (XMILE file) into an R model object.
mdl = readsdr.read_xmile('ResilientCommunityModel.xmile')
# Element at Python index 1 of the constants returned by readsdr.
constants_rdf = readsdr.read_xmile_constants('ResilientCommunityModel.xmile')[1]
# Constants table loaded from CSV (presumably an export of the same constants — verify).
constants_df = pd.read_csv("constants.csv")

In [241]:
def rdf2df(rdf):
    """Convert an R data-frame-like object into a pandas DataFrame.

    Each name in ``rdf.names`` becomes a column whose values are
    ``np.asarray(rdf.rx2(name))``.
    """
    columns = {}
    for column_name in rdf.names:
        columns[column_name] = np.asarray(rdf.rx2(column_name))
    return pd.DataFrame.from_dict(columns)

### San Diego Graph

In [8]:
# Load the zip-code polygons and reproject them to CRS '3310'.
sd_zips = gpd.read_file(
    "Zip Codes/geo_export_a38603a0-51c6-4173-b442-f6f81034382f.shp"
).to_crs('3310')
# Normalise zip codes: cast to int first, then to str, so they can be used as dict keys.
sd_zips["zip"] = sd_zips["zip"].astype(int).astype(str)

In [9]:
# Build the adjacency dictionary: zip code -> list of zip codes whose geometry touches it.
zip_graph_dict = {}

for index, row in sd_zips.iterrows():
    touching = sd_zips[sd_zips.geometry.touches(row['geometry'])].zip.tolist()
    # Exclude the zip itself. The previous `neighbors = neighbors.remove(...)` rebound
    # the list to None whenever the removal succeeded (list.remove returns None), and a
    # bare `except` hid the ValueError raised when the zip was not in the list.
    neighbors = [zip_code for zip_code in touching if zip_code != row.zip]

    zip_graph_dict[row.zip] = neighbors

In [10]:
class Node:
    """One zip-code area in the adjacency graph.

    Geometry and population are read from the module-level ``sd_zips`` table.

    Attributes:
        zip_code: Zip code identifying the node (a key of ``sd_zips['zip']``).
        neighbors: Adjacent ``Node`` objects, filled through ``add_neighbor``.
        virus: The virus agent attached to this node, or ``None``.
        agency: List of agencies associated with the node (starts empty).
        geom: Geometry of the zip code.
        infected: Current number of infected people (starts at 0).
        active_activities: Mapping of activities in effect (starts empty).
        population: Value of the ``totpop`` column for this zip, as ``int``.
    """

    def __init__(self, zip_code, geom=None):
        """Create the node for ``zip_code``.

        Args:
            zip_code: Zip code to look up in ``sd_zips``.
            geom: Optional geometry override. When omitted, the geometry stored
                in ``sd_zips`` is used (previously this argument was ignored).

        Raises:
            IndexError: If ``zip_code`` is not present in ``sd_zips``.
        """
        # Filter the table once and reuse the row for both geometry and population.
        zip_row = sd_zips[sd_zips['zip'] == zip_code].iloc[0]

        self.zip_code = zip_code
        self.neighbors = []
        self.virus = None
        self.agency = []
        self.geom = zip_row['geometry'] if geom is None else geom
        self.infected = 0
        self.active_activities = {}
        self.population = int(zip_row['totpop'])

    def __str__(self):
        # __str__ must return a str even if the zip code was supplied as a number.
        return str(self.zip_code)

    def add_neighbor(self, neighbor_node):
        """Append ``neighbor_node`` to this node's neighbor list."""
        self.neighbors.append(neighbor_node)

In [11]:
# One Node per zip code, keyed by the zip string.
graph = {zip_code: Node(zip_code) for zip_code in sd_zips.zip}

# Wire up adjacency using the precomputed neighbor lists.
for z in sd_zips.zip:
    for n in zip_graph_dict[z]:
        graph[z].add_neighbor(graph[n])

## Data

### RCM Actions

In [60]:
# Activity catalogue; only rows mapped to an RCM model parameter are usable.
actions = pd.read_csv('Copy of RCM pregame state 1.1 (MAP TO CONSTANTS - Activities and values .csv')
has_model_param = actions["RCM model param"].notna()
action_notna = actions[has_model_param]
# Text list of the San Diego activities, injected into the LLM prompt.
sd_activities = action_notna["Specific Activity (SD)"]
action_budget_str = "activity: " + "\n ".join(sd_activities)

In [243]:
action_notna

Unnamed: 0,Response Item,System Component,Activity - source white house 2021 plan,Specific Activity (US),System variable,Value (units),Specific Activity (SD),System variable.2,Value 0:1,Specific Activity (Team X),System variable.1,Unnamed: 11,RCM model param
0,Detection,Public health surveillance systems,"Monitoring for cases of the virus, both sympto...",CDC will deploy teams to affected areas to con...,Syndromic Surveillance,0.291019,San Diego County will use syndromic surveillan...,,,,,,S Surveillance transformer rate\nS Surveillanc...
2,,Contact tracing systems,Contact tracing to identify people who have be...,CDC will provide funding and technical assista...,CT converter,0.368799,SD enacts manual CICT,Contact Tracing and Case Identification,,,,,contact tracing\neffectiveness \nS Contact tr...
4,,Laboratory testing systems,Sequencing the virus to track its evolution.,CDC will support the development of new labora...,Testing multiplier? Timing?,,PCR testing,Central testing,0.466545,,,,testing\ncapacity\nmild test fraction
7,Community Mitigation Measures,Public health messaging systems,Public health messaging to inform the public a...,CDC will develop and distribute public health ...,Protective behavior converter?,,San Diego County will partner with local media...,Protective behavior rate,,,,,misinformation capacity
8,,Social media platforms,Social media campaigns to promote public healt...,CDC will use social media to promote public he...,,,San Diego County will work with local communit...,Protective behavior rate,,,,,misinformation capacity
12,,Hospitals,Provision of care to infected individuals.,CDC will provide guidance to hospitals on how ...,HC capacity converter,,San Diego County will work with local hospital...,Healthcare Capacity,,,,,testing\ncapacity
13,Healthcare System Preparedness and Response Ac...,Hospital surge capacity,"Increase in the number of beds, staff, and equ...",CDC will provide funding to hospitals to incre...,HC capacity converter,,San Diego County will work with local hospital...,Healthcare capacity,,,,,testing\ncapacity
14,,Triage,System for triaging patients to ensure that th...,CDC will provide guidance to hospitals on how ...,HC capacity converter,,San Diego County will work with local hospital...,Healthcare capacity,,,,,testing\ncapacity


In [17]:
# Lift the row limit so every constant is shown, then cap the display at 5 rows.
# Note: the second call sets 5 rather than restoring the previous value, so it
# changes how every later cell renders DataFrames.
pd.set_option('display.max_rows', None)
display(constants_df)
pd.set_option('display.max_rows', 5)

Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
0,0,STARTTIME,NA_character_,NA_character_,0.0,Day,CRITICAL: Day of the start of the game)
1,1,STOPTIME,NA_character_,NA_character_,28.0,Day,CRITICAL: Day of the end of the game)
2,2,misinformation capacity,NA_character_,NA_character_,0.75,Dmnl,CRITICAL: The proportion of people testing due...
3,3,infectivity,NA_character_,NA_character_,0.4,Dmnl,CRITICAL: This is the overall infectivity of t...
4,4,baseline contact\nrate,NA_character_,NA_character_,0.5,Per Day,CRITICAL: This is a baseline number of contact...
5,5,symptomatic\nrelative contact rate,NA_character_,NA_character_,0.05,dmnl,CRITICAL: This is the rate at which symptomati...
6,6,mildly symptomatic\nrelative contact rate,NA_character_,NA_character_,0.5,Dmnl,CRITICAL: This is the rate at which mildly sym...
7,7,tested\nrelative contact rate,NA_character_,NA_character_,0.005,Dmnl,MODERATE: This is the rate at which tested peo...
8,8,quarantine\neffectiveness,NA_character_,NA_character_,0.1,Dmnl,CRITICAL: This is the rate at which quarantine...
9,9,days symptomatic\nand contagious,Severity,Asymptomatic,7.0,Days,CRITICAL: This is the expected number of days ...


In [29]:
# Read the zip-to-zip distance table as raw text.
# The context manager closes the file handle, which the bare open() never did.
with open('zip_distance.csv', "r") as f:
    dst_csv = f.read()

In [14]:
# Organisation description inserted verbatim into the LLM system prompt
# (see the prompt_background construction). It is runtime prompt text, not documentation.
hhsa_description = """
The County of San Diego Health and Human Services Agency (HHSA) provides vital health, housing, and social services to more than 3.3 million residents across 18 cities, 18 federally recognized tribal reservations, 16 major naval and military installations, and the unincorporated areas of the County.  
About one in every three county residents is a direct recipient of HHSA services each year, emphasizing the critical role the Agency plays as a robust service network contributing to a region that is Building Better Health, Living Safely, and Thriving. This vision is played out in a collective effort called Live Well San Diego.
HHSA’s Guiding Principles 
The County of San Diego’s General Management System  and the Agency’s Vision, Mission, and Values guide what HHSA does best every day to advance opportunities for all San Diegans to live well. These guiding principles are outlined in the HHSA Strategic Plan.
"""

In [215]:
# Strategy category -> lowercase substrings used to recognise that category in the
# activity names returned by the LLM (matching is done with `kword in strat`).
# The key order matters: budget_transform pairs these keys positionally with the
# rows of action_notna.
keywords = {"surveillance": ["syndromic", "surveillance"], 
 "cict": ["cict", "contact identification", "contact tracing"], 
 "testing": ["pcr", "testing"], 
 "misinformation capacity 1": ["local media outlets"], 
 "misinformation capacity 2": ["social media"], 
 "capacity 1": ["resource", "infected"],
 "capacity 2": ["surge capacity"],
 "capacity 3": ["triage"]}

In [245]:
# Dummy budget -> model-parameter transformation.
# Result shape: {strategy category: {constants_df row index: parameter change per budget unit}}.
# The i-th row of action_notna is paired with the i-th key of `keywords`, so the two
# must stay in the same order (and there must be at least as many keys as rows).
budget_transform = copy.deepcopy(keywords)
a_keys = list(budget_transform.keys())
for i, a in enumerate(action_notna['RCM model param']):
    budget_transform[a_keys[i]] = {}
    # Each cell lists one model parameter per line; trailing whitespace is stripped
    # before the name is looked up in constants_df.
    for model_param in a.split('\n'):
        row = constants_df[constants_df['name'] == model_param.rstrip()]
        display(row)
        # .item() requires exactly one matching constant; it raises otherwise.
        budget_transform[a_keys[i]][row.index.item()] = row['value'].item()/1000 # dummy change per budget unit: baseline value / 1000
        # assigning 1000 units to a single strategy would therefore double that parameter in the model
budget_transform

Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
42,42,S Surveillance transformer rate,NA_character_,NA_character_,0.2,units per person,MODERATE: The rate at which utiles are convert...


Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
57,57,S Surveillance increase rate,NA_character_,NA_character_,0.01,Per Day,CRITICAL: The rate at which S Surveillance res...


Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
58,58,S Surveillance decline rate,NA_character_,NA_character_,0.01,Person per day,CRITICAL: The rate at which S Surveillance res...


Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
94,94,S Surveillance exponent,NA_character_,NA_character_,0.25,Dmnl,NA_character_


Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
19,19,contact tracing\neffectiveness,NA_character_,NA_character_,0.0,Dmnl,CRITICAL: The proportion of those contact trac...


Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
43,43,S Contact tracing transformer rate,NA_character_,NA_character_,0.025,units per person,MODERATE: The rate at which utiles are convert...


Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
55,55,S Contact tracing increase rate,NA_character_,NA_character_,0.01,Per Day,CRITICAL: The rate at which S Contact resource...


Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
56,56,S Contact tracing decline rate,NA_character_,NA_character_,0.005,units per year,CRITICAL: The rate at which S Contact resource...


Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
21,21,testing\ncapacity,NA_character_,NA_character_,100.0,Person/Day,CRITICAL: Overall testing capacity per day


Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
18,18,mild test fraction,NA_character_,NA_character_,0.25,Dmnl,CRITICAL: The proportion of those with mild sy...


Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
2,2,misinformation capacity,NA_character_,NA_character_,0.75,Dmnl,CRITICAL: The proportion of people testing due...


Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
2,2,misinformation capacity,NA_character_,NA_character_,0.75,Dmnl,CRITICAL: The proportion of people testing due...


Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
21,21,testing\ncapacity,NA_character_,NA_character_,100.0,Person/Day,CRITICAL: Overall testing capacity per day


Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
21,21,testing\ncapacity,NA_character_,NA_character_,100.0,Person/Day,CRITICAL: Overall testing capacity per day


Unnamed: 0.1,Unnamed: 0,name,dimensions,subscript,value,units,doc
21,21,testing\ncapacity,NA_character_,NA_character_,100.0,Person/Day,CRITICAL: Overall testing capacity per day


{'surveillance': {42: 0.0002, 57: 1e-05, 58: 1e-05, 94: 0.00025},
 'cict': {19: 0.0, 43: 2.5e-05, 55: 1e-05, 56: 5e-06},
 'testing': {21: 0.1, 18: 0.00025},
 'misinformation capacity 1': {2: 0.00075},
 'misinformation capacity 2': {2: 0.00075},
 'capacity 1': {21: 0.1},
 'capacity 2': {21: 0.1},
 'capacity 3': {21: 0.1}}

### ABM <p style="color:red">(This needs rework for GPT)</p>

In [12]:
class Agency(mesa.Agent):
    """LLM-driven agency agent that chooses a mitigation budget allocation each step.

    On every step the agent describes the current outbreak to a chat model, asks
    for a mitigation plan, has a second model call convert the free-text plan to
    JSON, and stores the parsed result on ``self.model.llm_out``.

    Attributes:
        name: Display name of the agency, used in the system prompt.
        description: Free-text description of the agency.
        zip_codes: Zip codes assigned to the agency (stored only; unused here).
        starting_budget: Budget given at construction, quoted in the 10% rule.
        budget: Current budget quoted in the prompt (this class never reduces it).
    """

    # System prompt for the second LLM pass that turns the free-text plan into JSON.
    # The example uses two distinct zip codes: a repeated key would teach the model
    # to emit duplicate keys, which json.loads silently collapses into one entry.
    prompt_dict = """
            summarize the strategies and their alocated budget, 
            transform it to json format with zipcode string as key, activities as sub-key, budget value as values
            An example would be:

            {
              "92122": {
                "Activity 1": 1000,
                "Activity 2": 100
              },
              "92037": {
                "Activity 1": 1000
              }
            }

            your response should be a plain json
            """


    def __init__(self, unique_id, model, name, description, zip_codes, budget):
        """Register the agent with ``model`` and store its profile and budget."""
        super().__init__(unique_id, model)
        self.name = name
        self.description = description
        self.zip_codes = zip_codes
        self.starting_budget = budget
        self.budget = budget
        
    def step(self):
        """Ask the LLM for this period's plan and publish it as ``model.llm_out``.

        Raises:
            json.JSONDecodeError: If the second LLM pass does not return valid JSON.
        """
        prompt_background = """Your are {}
        The description of your organization is as follows:""".format(self.name)
        prompt_background += hhsa_description
        prompt_background += """Your goal is to mitigate covid transmission in San Diego county with a provided set of mitigation strategies. 
        You only have a finite amount of resources to perform mitigation activities.
        These resources are represented using "units".
        All of your mitigation strategies will be deployed for a month. Starting from month 0.
        Adjust your strategies accordingly to maximize cost and benefit of your budget versu mitigation results."""

                
        prompt_text = ""

        prompt_text += """It is currently month {}. """.format(self.model.schedule.time)
        prompt_text += """There currently are {} """.format(self.eval_status())
        # all actions with no budget guideline
        prompt_text += """ You can perform the following activities to mitigate covid effect: {} """.format(action_budget_str)
        prompt_text += """ You currently have a total resource of {} units. """.format(self.budget)
        prompt_text += """ DO NOT allocate more than 10% of the starting total budget ({} units) to all activities combined. """.format(self.starting_budget )
        prompt_text += """ What activites and budget would you deploy for the chosen methods? On which zipcode? """


        prompt_text += """
        Respond in concise language and explaining your approach.
        DO look up information with your tool.
        DO cite your sources.
        DO NOT cite any source that you did not look up.
        """ 
        
        gpt_out = generate_text(prompt_background, prompt_text, use_openai = True, model_version = "gpt-4-0125-preview")
        print(gpt_out)
        # Use the class attribute explicitly: a bare `prompt_dict` is not visible
        # inside a method and would only resolve if a global of that name existed.
        json_out = generate_text(self.prompt_dict, gpt_out, use_openai=True)

        self.model.llm_out = self._parse_allocation(json_out)

    @staticmethod
    def _parse_allocation(json_out):
        """Strip markdown fences/extra whitespace from ``json_out`` and parse it as JSON."""
        # text refine with regex
        json_out = re.sub(r'```json', '', json_out)
        json_out = re.sub(r'```', '', json_out)
        json_out = re.sub(r'\s\s+', ' ', json_out)
        # Lower-casing makes the activity names comparable with the lowercase keywords.
        return json.loads(json_out.lower())

    def eval_status(self):
        """Summarise infected zip codes as text for the prompt.

        Returns:
            One ``"<n> covid cases in zipcode <zip>, "`` fragment per node with
            ``infected > 0``, concatenated; a short "no cases" phrase when no
            node is infected so the prompt sentence is never left dangling.
        """
        # check zip status
        reports = [
            "{} covid cases in zipcode {}, ".format(node.infected, node.zip_code)
            for node in self.model.graph.values()
            if node.infected > 0
        ]
        if not reports:
            return "no reported covid cases "
        return "".join(reports)

In [227]:
class Virus(mesa.Agent):
    """Virus agent that feeds the agency's plan into the RCM system-dynamics model.

    Attributes:
        trans_rate: Base transmission rate (dummy value, see link in ``__init__``).
        zip_code: Zip code where the virus is placed.
        new_virus_count: Counter initialised to 0 (not updated in this class).
    """

    def __init__(self, unique_id, model, zip_code):
        """Register the agent and attach it to the graph node for ``zip_code``."""
        super().__init__(unique_id, model)
        
        # grab base transmission rate 
        # dummy stat from https://virologyj.biomedcentral.com/articles/10.1186/s12985-021-01609-w
        self.trans_rate = 0.0112
        self.zip_code = zip_code
        self.model.graph[zip_code].virus = self
        self.new_virus_count = 0
        
        
    def step(self):
        """Translate ``model.llm_out`` into parameter changes and run the simulation."""
        model_out = self.model.llm_out

        # Map each free-text activity to a strategy category via keyword substrings.
        dict_out = {}
        total_usage = 0  # NOTE(review): accumulated but not consumed yet — presumably for budget deduction
        for zipcode in model_out.keys():
            dict_out[zipcode] = {}
            for strat in model_out[zipcode]:
                    for k in keywords:
                        for kword in keywords[k]:
                            if (kword in strat):
                                total_usage += model_out[zipcode][strat]
                                dict_out[zipcode][k] = model_out[zipcode][strat]
                                break
        
        
        # Work on a copy so the baseline model object is left untouched.
        mdl_curr = copy.deepcopy(mdl)
        rcm_input = mdl_curr[mdl_curr.names.index('deSolve_components')]
        for zipcode in dict_out:
            for strat in budget_transform:
                # The plan need not fund every strategy; skipping unfunded ones
                # avoids the KeyError the direct lookup raised.
                if strat not in dict_out[zipcode]:
                    continue
                for param_idx in budget_transform[strat]:
                    val = dict_out[zipcode][strat]*budget_transform[strat][param_idx]
                    rcm_input[1][param_idx] += val
                    
        #how to interpret this?            
        out_rdf = readsdr.sd_simulate(rcm_input)
                                
        

        

In [15]:
class NaiveModel(mesa.Model):
    """Minimal ABM with one LLM-driven agency and one virus on the zip-code graph.

    Attributes:
        running: Set to ``True`` at construction.
        schedule: ``BaseScheduler`` holding the agency and the virus, in that order.
        graph: Mapping of zip code -> ``Node``.
        llm_out: Latest parsed allocation published by the agency (starts empty).
    """

    # Zip code where the virus agent is placed and the first patients are injected.
    INIT_ZIP = "92037"

    def __init__(self, graph):
        """Build the model on ``graph`` and add the hardcoded agents."""
        # Let mesa initialise its own model state before agents are created.
        super().__init__()

        self.running = True
        self.schedule = mesa.time.BaseScheduler(self)
        
        self.graph = graph
        # currently using dummy curve
        #self.curves = curves
        self.llm_out = {}
        
        # add hardcoded agents
        # The agency id used to be a Claude conversation id, but `claude_api` is not
        # defined in this notebook (NameError). A random UUID keeps the id unique.
        conversation_id = str(uuid.uuid4())
        sdhhsa = Agency(unique_id=conversation_id,
                        model=self, 
                        name="San Diego Human Health and Services Agency", 
                        description=hhsa_description,
                        zip_codes=None, 
                        budget=20000)
        self.schedule.add(sdhhsa)
        
        virus = Virus(unique_id="BASE", 
                      model=self, 
                      zip_code=self.INIT_ZIP)
        self.schedule.add(virus)
        
        
    def step(self):
        '''
        step function of the model that would essentially call the step function of all agents
        '''
        # inject a random 10-49 initial patients into the starting zipcode
        # (np.arange(10, 50) excludes 50)
        if self.schedule.time==0:
            # currently hard coded to INIT_ZIP as init zip
            self.graph[self.INIT_ZIP].infected = np.random.choice(np.arange(10,50))
            
            
        self.schedule.step()

## Test Ground

In [30]:
name = "San Diego Human Health and Services Agency"

# System prompt = persona line + organisation description + task framing.
background_parts = [
    """Your are {}
The description of your organization is as follows:""".format(name),
    hhsa_description,
    """Your goal is to mitigate covid transmission in San Diego county with a provided set of mitigation strategies. 
You only have a finite amount of resources to perform mitigation activities.
These resources are represented using "units".
All of your mitigation strategies will be deployed for a month. Starting from month 0.
Adjust your strategies accordingly to maximize cost and benefit of your budget versu mitigation results.""",
]
prompt_background = "".join(background_parts)

In [49]:
# User prompt assembled from fixed test values: month 0, 20 patients in 91905,
# a 10000-unit budget. The model-driven values live in Agency.step.
prompt_segments = [
    """It is currently month {}. """.format(0),
    """There currently are {} """.format("20 patients in 91905"),
    # all actions with no budget guideline
    """ You can perform the following activities to mitigate covid effect: {} """.format(action_budget_str),
    """ You currently have a total resource of {} units. """.format(10000),
    """ DO NOT allocate more than 10% of the starting total budget ({} units) to all activities combined. """.format(10000),
    """ What activites and budget would you deploy for the chosen methods? On which zipcode? """,
    """
Respond in concise language and explaining your approach.
DO look up information with your tool.
DO cite your sources.
DO NOT cite any source that you did not look up.
""",
]
prompt_text = "".join(prompt_segments)

#prompt_text += """The distances bewteen San Diego zipcode are provided as the following csv file, make use of this information:"""
#prompt_text += dst_csv

In [228]:
%%time
gpt_out = generate_text(prompt_background, prompt_text, use_openai = True, model_version = "gpt-4-0125-preview")

using openai gpt-4-0125-preview
CPU times: total: 125 ms
Wall time: 31 s


In [229]:
print(gpt_out)

Given the goal to effectively mitigate COVID-19 transmission in San Diego County, with a specific emphasis on the 91905 ZIP code where there are currently 20 patients, and a budget constraint not to allocate more than 10% of the 10,000 units to all activities combined, a strategic, multi-faceted approach is essential. This means we have 1,000 units (10% of 10,000) to spend on our mitigation strategies for this month.

The chosen strategies focus on early detection, public awareness, and strengthening healthcare response to manage current cases and prevent further spread. The allocations are designed to balance cost and effectiveness within the given budget constraints.

### Chosen Activities and Budget Allocation:

1. **San Diego County will use syndromic surveillance to detect the virus (150 units):**

2. **SD enacts manual CICT (Contact Identification and Contact Tracing) (200 units):**
   A robust manual CICT effort can help contain the outbreak by tracing and managing contacts of c

In [230]:
# System prompt for the second LLM pass that converts the free-text plan into JSON.
# The example uses two distinct zip codes: a repeated key would teach the model to
# emit duplicate keys, which json.loads silently collapses into a single entry.
prompt_dict = """
        summarize the strategies and their alocated budget, 
        transform it to json format with zipcode string as key, activities as sub-key, budget value as values
        An example would be:

        {
          "92122": {
            "Activity 1": 1000,
            "Activity 2": 100
          },
          "92037": {
            "Activity 1": 1000
          }
        }

        your response should be a plain json
        """

In [231]:
%%time
json_out = generate_text(prompt_dict, gpt_out, use_openai=True)


using openai gpt-3.5-turbo
CPU times: total: 0 ns
Wall time: 2.14 s


In [232]:
# Remove markdown code fences, then collapse whitespace runs before parsing.
fence_free = json_out.replace('```json', '').replace('```', '')
json_out = re.sub(r'\s\s+', ' ', fence_free)
# Lower-case everything so activity names can be matched against the lowercase keywords.
model_out = json.loads(json_out.lower())

In [233]:
model_out

{'91905': {'syndromic surveillance': 150,
  'cict (contact identification and contact tracing)': 200,
  'pcr testing': 150,
  'promote public health messaging via local media outlets': 100,
  'promote public health messaging via local community organizations on social media': 100,
  'ensure local hospitals have necessary resources for infected individuals': 100,
  "increase local hospitals' surge capacity": 100,
  'develop a triage system with local hospitals': 100}}

In [234]:
# Map each free-text activity returned by the LLM to a strategy category.
# An activity counts once per category, on the first keyword that occurs in its name.
dict_out = {}
total_usage = 0
for zipcode, allocations in model_out.items():
    matched = {}
    for strat, units in allocations.items():
        for k, kword_list in keywords.items():
            for kword in kword_list:
                if kword in strat:
                    total_usage += units
                    matched[k] = units
                    break
    dict_out[zipcode] = matched

In [235]:
dict_out

{'91905': {'surveillance': 150,
  'cict': 200,
  'testing': 150,
  'misinformation capacity 1': 100,
  'misinformation capacity 2': 100,
  'capacity 1': 100,
  'capacity 2': 100,
  'capacity 3': 100}}

In [249]:
budget_transform

{'surveillance': {42: 0.0002, 57: 1e-05, 58: 1e-05, 94: 0.00025},
 'cict': {19: 0.0, 43: 2.5e-05, 55: 1e-05, 56: 5e-06},
 'testing': {21: 0.1, 18: 0.00025},
 'misinformation capacity 1': {2: 0.00075},
 'misinformation capacity 2': {2: 0.00075},
 'capacity 1': {21: 0.1},
 'capacity 2': {21: 0.1},
 'capacity 3': {21: 0.1}}

In [236]:
0.023

1000

In [246]:
# Apply the budget allocation to a copy of the model so the baseline stays untouched.
mdl_curr = copy.deepcopy(mdl)
rcm_input = mdl_curr[mdl_curr.names.index('deSolve_components')]
default_input = mdl[mdl.names.index('deSolve_components')]
for zipcode in dict_out:
    for strat in budget_transform:
        # The plan need not fund every strategy; skipping unfunded ones avoids
        # the KeyError the direct lookup raised.
        if strat not in dict_out[zipcode]:
            continue
        for param_idx in budget_transform[strat]:
            # units allocated * parameter change per unit
            val = dict_out[zipcode][strat]*budget_transform[strat][param_idx]
            rcm_input[1][param_idx] += val

In [250]:
%%time
out_rdf = readsdr.sd_simulate(rcm_input)

CPU times: total: 1min 1s
Wall time: 1min 1s


In [251]:
rdf2df(out_rdf)

Unnamed: 0,time,recent_testing_positive,infected_not_contagious_Asymptomatic_1,infected_not_contagious_Asymptomatic_2,infected_not_contagious_Asymptomatic_3,infected_not_contagious_Asymptomatic_4,infected_not_contagious_Asymptomatic_5,infected_not_contagious_Asymptomatic_6,infected_not_contagious_Asymptomatic_7,infected_not_contagious_Asymptomatic_8,...,E_Social_exponent,E_Financial_exponent,U_Other_exponent,U_Human_exponent,U_Social_exponent,U_Financial_exponent,S_Exposure_Notification_exponent,S_Surveillance_exponent,S_Contact_tracing_exponent,S_Environmental_Testing_exponent
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0009006156,0.0009736385,0.001052582,0.001137927,0.001230191,0.001329936,0.001437769,0.001554345,0.001680373,0.001816619
1,0.1,0.01268036,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.0009648758,0.001043109,0.001127685,0.001219119,0.001317967,0.001424829,0.001540356,0.001665249,0.00180027,0.001946237
2,0.2,0.03775308,0.04779964,0.04889982,0.04889982,0.04889982,0.04889982,0.04889982,0.04889982,0.04889982,...,0.001033721,0.001117536,0.001208147,0.001306105,0.001412005,0.001526492,0.001650262,0.001784067,0.001928721,0.002085104
3,0.3,0.08733899,0.04256324,0.04569548,0.04569548,0.04569548,0.04569548,0.04569548,0.04569548,0.04569548,...,0.001107478,0.001197274,0.00129435,0.001399297,0.001512754,0.00163541,0.00176801,0.001911363,0.002066338,0.002233879
4,0.4,0.1854587,0.03686961,0.04112594,0.04112594,0.04112594,0.04112594,0.04112594,0.04112594,0.04112594,...,0.001186498,0.001282701,0.001386704,0.001499139,0.001620691,0.001752098,0.00189416,0.002047741,0.002213774,0.002393269
5,0.5,0.3798655,0.03332638,0.03701334,0.03701334,0.03701334,0.03701334,0.03701334,0.03701334,0.03701334,...,0.001271157,0.001374223,0.001485647,0.001606105,0.00173633,0.001877113,0.002029311,0.00219385,0.00237173,0.002564032
6,0.6,0.7656072,0.02997937,0.03331201,0.03331201,0.03331201,0.03331201,0.03331201,0.03331201,0.03331201,...,0.001361855,0.001472276,0.00159165,0.001720703,0.001860219,0.002011048,0.002174105,0.002350384,0.002540956,0.002746979
7,0.7,1.530628,0.02698287,0.02998081,0.02998081,0.02998081,0.02998081,0.02998081,0.02998081,0.02998081,...,0.001459026,0.001577325,0.001705216,0.001843477,0.001992948,0.002154539,0.002329231,0.002518087,0.002722257,0.00294298
8,0.8,3.048743,0.02428444,0.02698273,0.02698273,0.02698273,0.02698273,0.02698273,0.02698273,0.02698273,...,0.001563129,0.001689869,0.001826886,0.001975012,0.002135148,0.002308268,0.002495425,0.002697756,0.002916493,0.003152966
9,0.9,6.063161,0.02185601,0.02428445,0.02428445,0.02428445,0.02428445,0.02428445,0.02428445,0.02428445,...,0.00167466,0.001810444,0.001957236,0.002115931,0.002287493,0.002472966,0.002673476,0.002890245,0.003124589,3.122502e-18


In [66]:
import tiktoken
# Sanity check: the cl100k_base encoding round-trips a simple string.
enc = tiktoken.get_encoding("cl100k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

# To get the tokeniser corresponding to a specific model in the OpenAI API:
# (this rebinds `enc`, so later cells count tokens with the gpt-4 encoding)
enc = tiktoken.encoding_for_model("gpt-4")

In [67]:
len(enc.encode(prompt_background + prompt_text))

453

In [68]:
import langchain
from langchain_experimental.agents import create_csv_agent
from langchain.llms import OpenAI

In [80]:
agent = create_csv_agent(OpenAI(temperature=0), 'zip_distance_10.csv', verbose=True)

In [91]:
pd.read_csv('zip_distance_10.csv')

Unnamed: 0.1,Unnamed: 0,91901,91902,91905,91906,91910,91911,92028,92029,92036,92058
0,91901,0.000000,16720.376410,18036.456181,0.000000,23268.508817,26235.432517,53731.013711,30844.846324,0.000000,67861.384703
1,91902,16720.376410,0.000000,55009.549076,31867.014375,0.000000,1987.196891,68952.910576,39687.486186,34868.601522,71099.697281
...,...,...,...,...,...,...,...,...,...,...,...
8,92036,0.000000,34868.601522,0.000000,12037.989711,42129.762198,45131.169947,48993.466207,31222.520185,0.000000,67881.422954
9,92058,67861.384703,71099.697281,107139.990747,95802.835280,71313.162199,75210.295810,11883.278166,25877.486044,67881.422954,0.000000


In [89]:
agent.run("You are provided with a subset of distance data of San Diego zipcode. Each cell contains the shortest distance (in feet) between zipcode pair in column/row." + "What is the distance between 92101 and 92103")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find the row and column corresponding to the zipcodes 92101 and 92103 and then access the value in that cell.
Action: python_repl_ast
Action Input: df.loc[92101, 92103][0m[36;1m[1;3mKeyError: 92103[0m[32;1m[1;3m92103 is not a column name, I need to use the index instead.
Action: python_repl_ast
Action Input: df.loc[92101, '92103'][0m[36;1m[1;3mKeyError: '92103'[0m[32;1m[1;3m92103 is not a row index either, I need to use the column name instead.
Action: python_repl_ast
Action Input: df.loc['92101', '92103'][0m[36;1m[1;3mKeyError: '92103'[0m[32;1m[1;3m92103 is not a valid column name, I need to use the index instead.
Action: python_repl_ast
Action Input: df.loc['92101', 92103][0m[36;1m[1;3mKeyError: 92103[0m[32;1m[1;3m92103 is not a valid row index either, I need to use the column name instead.
Action: python_repl_ast
Action Input: df.loc['92101', '92103'][0m[36;1m[1;3mKeyError: '92

'Agent stopped due to iteration limit or time limit.'

In [87]:
agent.run("You are provided with a subset of distance data of San Diego zipcode. Each cell contains the shortest distance (in feet) between zipcode pair in column/row." + "Your are San Diego Human Health and Services Agency." + prompt_text)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find the total number of infected individuals in zipcode 91905 and determine the budget needed for each activity.
Action: python_repl_ast
Action Input: df.loc[df['Unnamed: 0'] == 91905][0m[36;1m[1;3m   Unnamed: 0         91901         91902  91905  91906        91910  \
2       91905  18036.456181  55009.549076    0.0    0.0  58594.46784   

          91911         92028        92029  92036          92058  
2  61129.755374  89982.151872  70632.67476    0.0  107139.990747  [0m[32;1m[1;3m I need to sum the values in the 91905 row to find the total number of infected individuals.
Action: python_repl_ast
Action Input: df.loc[df['Unnamed: 0'] == 91905].sum(axis=1)[0m[36;1m[1;3m2    552430.045851
dtype: float64[0m[32;1m[1;3m I now know the total number of infected individuals in zipcode 91905.
Action: python_repl_ast
Action Input: 552430.045851[0m[36;1m[1;3m552430.045851[0m[32;1m[1;3m I need to 

'The budget needed for each activity in zipcode 91905 is $131,292,857.14.'

In [73]:
agent.run("How many columns are there in this table?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to count the number of columns in the dataframe.
Action: python_repl_ast
Action Input: len(df.columns)[0m[36;1m[1;3m114[0m[32;1m[1;3m114 columns is a lot, but that is the answer to the question.
Final Answer: There are 114 columns in this table.[0m

[1m> Finished chain.[0m


'There are 114 columns in this table.'

In [74]:
# Read the zip adjacency table as raw text.
# The context manager closes the file handle, which the bare open() never did.
with open('zip_adjacency.csv', "r") as f:
    adj_csv = f.read()

In [75]:
len(enc.encode(adj_csv))

26217