In [3]:
import pandas as pd
import numpy as np
import catllm as cat
import os
from dotenv import load_dotenv, find_dotenv

In [None]:
# Folder the notebook should run from. Replace the placeholder with a real path.
WORKING_DIR = 'YOUR_PATH_HERE'  # Change to your desired directory

# Only switch directories when the configured path actually exists, so a fresh
# "Restart & Run All" does not stop with FileNotFoundError on the placeholder.
if os.path.isdir(WORKING_DIR):
    os.chdir(WORKING_DIR)
else:
    print(f"'{WORKING_DIR}' is not an existing directory; staying in the current one.")

# Record and show the directory the rest of the notebook will run in.
current_directory = os.getcwd()
print(current_directory)

In [None]:
# Read the local .env file so its variables become visible through os.environ.
_ = load_dotenv(find_dotenv())

# Anthropic key used by the Claude-backed cells; None when the variable is unset.
api_key = os.environ.get("ANTHROPIC_API_KEY")

Below we test the function to see how well the model can extract people's academic departments.

Although these answers are technically correct, the model sometimes pulls from older data, which implies the search question needs to be more specific.

In [7]:
# People whose UC Berkeley department we want the model to look up.
list_names = [
    "Chris Soria",
    "Matthew Stenberg",
    "Sara Quigley",
]

# Run the web-research helper with Claude, one search per name in list_names.
test1 = cat.build_web_research_dataset(
    search_input=list_names,
    search_question="Academic Department they belong to at UC Berkeley?",
    answer_format="just the department name",
    user_model="claude-sonnet-4-20250514",
    api_key=api_key,
    time_delay=10,  # presumably a pause between requests, in seconds -- confirm in catllm docs
)

# Preview the first rows of the result.
test1.head()

1. Answer
2. URL


Building dataset:  33%|███▎      | 1/3 [00:11<00:22, 11.04s/it]

{
  "answer": "Sociology",
  "url": "https://christophersoria.tumblr.com/"
}


Building dataset:  67%|██████▋   | 2/3 [00:29<00:15, 15.21s/it]

Based on the search results, I can see that Matthew Stenberg is associated with the Political Science Department at UC Berkeley. The official UC Berkeley Political Science Department page confirms this information.

```json
{
  "answer": "Political Science Department",
  "url": "https://polisci.berkeley.edu/people/person/matthew-stenberg"
}
```


Building dataset: 100%|██████████| 3/3 [00:46<00:00, 15.46s/it]

Based on the search results, I found information about Sara Quigley at UC Berkeley. According to the official UC Berkeley Graduate Division website and other sources, Sara Quigley works in the Graduate Division as Assistant Dean for Institutional Research & Data Analytics.

```json
{
  "answer": "Graduate Division",
  "url": "https://grad.berkeley.edu/staff/sara-quigley/"
}
```





Unnamed: 0,survey_response,link1,json,answer,url
0,Chris Soria,"{\n ""answer"": ""Sociology"",\n ""url"": ""https:/...","{""answer"": ""Sociology"", ""url"": ""https://christ...",Sociology,https://christophersoria.tumblr.com/
1,Matthew Stenberg,"Based on the search results, I can see that Ma...","{""answer"": ""Political Science Department"", ""ur...",Political Science Department,https://polisci.berkeley.edu/people/person/mat...
2,Sara Quigley,"Based on the search results, I found informati...","{""answer"": ""Graduate Division"", ""url"": ""https:...",Graduate Division,https://grad.berkeley.edu/staff/sara-quigley/


What happens if we direct it to a specific website, but the person doesn't appear on that website?
It pulls the data from somewhere else.

In [21]:
# Professors whose PhD-granting school we want the model to look up.
list_names = [
    "Francoise Sorgen",
    "Ayesha Mahmud",
    "Dennis Feehan",
]

# The question names LinkedIn as the source; the extra instruction restricts
# answers to PhD degrees and asks for 'Not found' otherwise.
test3 = cat.build_web_research_dataset(
    search_input=list_names,
    search_question="Where these UC Berkeley professors got their PhD according to Linkedin?",
    answer_format="just the school name",
    additional_instructions="Do not give me information for any degree that's not a PhD degree. If you can't find that they have a PhD, just say 'Not found'", # extra instruction to improve accuracy
    user_model="claude-sonnet-4-20250514",
    api_key=api_key,
    time_delay=15,
    safety=True,
)

# Show the full (three-row) result.
test3

1. Answer
2. URL


Building dataset:  33%|███▎      | 1/3 [00:12<00:24, 12.04s/it]

Based on my search results, I found information about Francoise Sorgen-Goldschmidt, who was a French professor at UC Berkeley. According to her LinkedIn profile, her education is listed as Sorbonne University. However, the search results do not specifically indicate that she has a PhD degree - they only mention her education at Sorbonne University without specifying the degree type.

```json
{
    "answer": "Not found",
    "url": "https://www.linkedin.com/in/francoise-sorgen-goldschmidt-35908229/"
}
```


Building dataset:  67%|██████▋   | 2/3 [00:32<00:16, 16.84s/it]

{
"answer": "Princeton University",
"url": "https://publichealth.berkeley.edu/people/ayesha-mahmud"
}


Building dataset: 100%|██████████| 3/3 [00:53<00:00, 17.71s/it]

Based on the search results, I found information about Dennis Feehan's PhD. According to his CV and multiple official sources, he completed his PhD at Princeton University.

```json
{
    "answer": "Princeton University",
    "url": "https://dennisfeehan.org/cv/cv.html"
}
```





Unnamed: 0,survey_response,link1,json,answer,url
0,Francoise Sorgen,"Based on my search results, I found informatio...","{""answer"": ""Not found"", ""url"": ""https://www.li...",Not found,https://www.linkedin.com/in/francoise-sorgen-g...
1,Ayesha Mahmud,"{\n""answer"": ""Princeton University"",\n""url"": ""...","{""answer"": ""Princeton University"", ""url"": ""htt...",Princeton University,https://publichealth.berkeley.edu/people/ayesh...
2,Dennis Feehan,"Based on the search results, I found informati...","{""answer"": ""Princeton University"", ""url"": ""htt...",Princeton University,https://dennisfeehan.org/cv/cv.html


What happens if I give it a city name that's confusing?
It fails to find the city's weather.

In [20]:
# Plain city names; defined here but not passed to the call below.
list_names = [
    "Monterey, CA",
    "San Diego, CA",
    "Paris, Texas",
]
# Same cities, with the ambiguous "Paris, Texas" spelled out more explicitly.
list_names_specific = [
    "Monterey, CA",
    "San Diego, CA",
    "Paris, Texas in United States",
]

# Ask Claude for each city's 2024 record high, using the more specific names.
test2 = cat.build_web_research_dataset(
    search_input=list_names_specific,
    search_question="Hottest temperature in 2024 from extremeweatherwatch?",
    answer_format="just the temperature in Fahrenheit",
    user_model="claude-sonnet-4-20250514",
    api_key=api_key,
    time_delay=15,
)

# Show the full (three-row) result.
test2

1. Answer
2. URL


Building dataset:  33%|███▎      | 1/3 [00:05<00:10,  5.10s/it]

{
  "answer": "90°F",
  "url": "https://www.extremeweatherwatch.com/cities/monterey/year-2024"
}


Building dataset:  67%|██████▋   | 2/3 [01:13<00:42, 42.49s/it]

{
  "answer": "94°F",
  "url": "https://www.extremeweatherwatch.com/cities/san-diego/year-2024"
}


Building dataset: 100%|██████████| 3/3 [01:47<00:00, 35.88s/it]

Based on my search results, I was unable to find specific information about Paris, Texas's hottest temperature in 2024 from extremeweatherwatch.com. The search results show data for Paris, France in 2024 and Paris, Texas for 2023, but not the specific 2024 data for Paris, Texas that was requested.

```json
{
    "answer": "Information not found",
    "url": "No source available"
}
```





Unnamed: 0,survey_response,link1,json,answer,url
0,"Monterey, CA","{\n ""answer"": ""90°F"",\n ""url"": ""https://www....","{""answer"": ""90\u00b0F"", ""url"": ""https://www.ex...",90°F,https://www.extremeweatherwatch.com/cities/mon...
1,"San Diego, CA","{\n ""answer"": ""94°F"",\n ""url"": ""https://www....","{""answer"": ""94\u00b0F"", ""url"": ""https://www.ex...",94°F,https://www.extremeweatherwatch.com/cities/san...
2,"Paris, Texas in United States","Based on my search results, I was unable to fi...","{""answer"": ""Information not found"", ""url"": ""No...",Information not found,No source available


Google actually finds Paris, Texas!

In [None]:
# Plain city names; defined here but not passed to the call below.
list_names = [
    "Monterey, CA",
    "San Diego, CA",
    "Paris, Texas",
]
# Same cities, with the ambiguous "Paris, Texas" spelled out more explicitly.
list_names_specific = [
    "Monterey, CA",
    "San Diego, CA",
    "Paris, Texas in United States",
]

# Same temperature question as the Claude run, this time routed to Gemini.
# Note: this rebinds test2, replacing any earlier result held under that name.
test2 = cat.build_web_research_dataset(
    search_input=list_names_specific,
    search_question="Hottest temperature in 2024 from extremeweatherwatch?",
    answer_format="just the temperature in Fahrenheit",
    model_source="Google",
    user_model="gemini-2.5-flash", #older models don't seem to work with google search tools
    api_key=os.getenv("GOOGLE_API_KEY"),
    time_delay=15,
)

# Show the full (three-row) result.
test2

1. Answer
2. URL


Building dataset: 100%|██████████| 3/3 [00:42<00:00, 14.07s/it]


Unnamed: 0,survey_response,link1,json,answer,url
0,"Monterey, CA","```json\n{\n""answer"": ""90 °F"",\n""url"": ""https:...","{""answer"": ""90 \u00b0F"", ""url"": ""https://verte...",90 °F,https://vertexaisearch.cloud.google.com/ground...
1,"San Diego, CA","```json\n{\n""answer"": ""94 °F"",\n""url"": ""https:...","{""answer"": ""94 \u00b0F"", ""url"": ""https://verte...",94 °F,https://vertexaisearch.cloud.google.com/ground...
2,"Paris, Texas in United States","```json\n{\n""answer"": ""107 °F"",\n""url"": ""https...","{""answer"": ""107 \u00b0F"", ""url"": ""https://vert...",107 °F,https://vertexaisearch.cloud.google.com/ground...


Google correctly finds me

In [None]:
# People whose UC Berkeley department we want the model to look up.
list_names = [
    "Chris Soria",
    "Matthew Stenberg",
    "Sara Quigley",
]

# Same department question as the Claude run, this time routed to Gemini.
# Note: this rebinds test1, replacing any earlier result held under that name.
test1 = cat.build_web_research_dataset(
    search_input=list_names,
    search_question="Academic Department they belong to at UC Berkeley?",
    answer_format="just the department name",
    model_source="Google",
    user_model="gemini-2.5-flash", #older models don't seem to work with google search tools
    api_key=os.getenv("GOOGLE_API_KEY"),
    time_delay=10,
)

# Preview the first rows of the result.
test1.head()

1. Answer
2. URL


Building dataset: 100%|██████████| 3/3 [00:32<00:00, 10.82s/it]


Unnamed: 0,survey_response,link1,json,answer,url
0,Chris Soria,"```json\n{\n ""answer"": ""Demography"",\n ""url"": ...","{""answer"": ""Demography"", ""url"": ""https://verte...",Demography,https://vertexaisearch.cloud.google.com/ground...
1,Matthew Stenberg,"```json\n{\n""answer"": ""Political Science"",\n""u...","{""answer"": ""Political Science"", ""url"": ""https:...",Political Science,https://politicalscience.berkeley.edu/people/1...
2,Sara Quigley,"```json\n{\n""answer"": ""Information not found"",...","{""answer"": ""Information not found"", ""url"": ""No...",Information not found,No source available


Google does as well as Anthropic here.

In [None]:
# Professors whose PhD-granting school we want the model to look up.
list_names = [
    "Francoise Sorgen",
    "Ayesha Mahmud",
    "Dennis Feehan",
]

# Same PhD question as the Claude run, this time routed to Gemini.
# Note: this rebinds test3, replacing any earlier result held under that name.
test3 = cat.build_web_research_dataset(
    search_input=list_names,
    search_question="Where these UC Berkeley professors got their PhD according to Linkedin?",
    answer_format="just the school name",
    additional_instructions="Do not give me information for any degree that's not a PhD degree. If you can't find that they have a PhD, just say 'Not found'", # extra instruction to improve accuracy
    model_source="Google",
    user_model="gemini-2.5-flash", #older models don't seem to work with google search tools
    api_key=os.getenv("GOOGLE_API_KEY"),
    time_delay=15,
    safety=True,
)

# Show the full (three-row) result.
test3

1. Answer
2. URL


Building dataset: 100%|██████████| 3/3 [00:50<00:00, 16.92s/it]


Unnamed: 0,survey_response,link1,json,answer,url
0,Francoise Sorgen,"```json\n{\n""answer"": ""Information not found"",...","{""answer"": ""Information not found"", ""url"": ""No...",Information not found,No source available
1,Ayesha Mahmud,"```json\n{\n""answer"": ""Princeton University"",\...","{""answer"": ""Princeton University"", ""url"": ""htt...",Princeton University,https://vertexaisearch.cloud.google.com/ground...
2,Dennis Feehan,"```json\n{\n""answer"": ""Princeton University"",\...","{""answer"": ""Princeton University"", ""url"": ""htt...",Princeton University,https://vertexaisearch.cloud.google.com/ground...
