# Notebook
- create dataset
- create an LLM-as-judge metric
- run experiments

### Create your first project and upload test dataset 

In [1]:
from json import load

# Read the raw test records from disk. The encoding is passed explicitly
# because JSON text is UTF-8, while open() would otherwise decode with the
# platform's locale encoding and can fail on non-ASCII characters.
with open('yann-lecun-wisdom/yann_test.json', 'r', encoding='utf-8') as f:
    data = load(f)

In [2]:
from ragas_experimental import BaseModel

# Row schema for the test dataset built in this notebook.
class TestDataset(BaseModel):
    question: str  # question that is posed to the assistant
    citations: list[str]  # NOTE(review): looks like ids of the source posts — confirm
    grading_notes: str  # criteria the LLM judge grades the answer against

In [None]:
import os

# Credentials and endpoint for the Ragas app and for OpenAI.
# Values already exported in the environment take precedence; the literals
# below are placeholders used only as a fallback, so real secrets do not have
# to be pasted into (and committed with) the notebook.
RAGAS_APP_TOKEN = os.environ.get("RAGAS_APP_TOKEN") or "your-app-token"
RAGAS_API_BASE_URL = os.environ.get("RAGAS_API_BASE_URL") or "https://api.dev.app.ragas.io"

# Publish the resolved values so code that reads these variables from the
# environment sees them.
os.environ["RAGAS_APP_TOKEN"] = RAGAS_APP_TOKEN
os.environ["RAGAS_API_BASE_URL"] = RAGAS_API_BASE_URL
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY") or "your-openai-key"

In [4]:
from ragas_experimental import Project

# Create the project for this walkthrough and keep its handle in `p`.
p = Project.create(name="yann-lecun-wisdom", description="Yann LeCun Wisdom")


In [None]:
# NOTE: for now the project id has to be copied by hand from the project's link.
# Replace this value with the id of your own project.
PROJECT_ID = "919a4d42-aaf2-45cd-badd-152249788bfa"

In [6]:
# Open the project by its id. This rebinds `p`, so the handle returned by
# Project.create earlier in the notebook is no longer used from here on.
p = Project(project_id=PROJECT_ID)
p

Project(name='yann-lecun-wisdom')

**TODO:** Do you actually need to pass a model here?

In [7]:
# Create a new dataset in the project whose rows follow the TestDataset schema.
test_dataset = p.create_dataset(name="test-yann-lecun", model=TestDataset)
# Alternative: re-open an existing dataset by its id instead of creating a new one.
# test_dataset = p.get_dataset(dataset_id="8572180f-fddf-46c5-b943-e6ff6448eb01", model=TestDataset)
test_dataset

Dataset(name=test-yann-lecun, model=TestDataset, len=0)

In [8]:
# NOTE(review): presumably pulls any rows already stored for this dataset into
# the local object — confirm against the ragas_experimental Dataset API.
test_dataset.load()

**NOTE:** There is an open problem here: how do you batch-upload a test dataset? For now, the rows are appended one at a time.

In [9]:
from tqdm import tqdm

# Copy each raw JSON record into the dataset, one typed row at a time.
for item in data:
    t = TestDataset(
        question=item["question"],
        citations=item["citations"],
        grading_notes=item["grading_notes"],
    )
    test_dataset.append(t)

### Create LLM as judge

In [12]:
# define the LLM judge
from ragas_experimental.llm import ragas_llm
from ragas_experimental.metric import DiscreteMetric
from openai import AsyncOpenAI

# LLM used by the judge: gpt-4o, called through an async OpenAI client.
llm = ragas_llm(
    provider="openai",
    model="gpt-4o",
    client=AsyncOpenAI(),
)

# Prompt template for the judge; the placeholders are filled from the keyword
# arguments passed when the metric is scored.
judge_prompt = "Given the Question: {query} \n Evaluate if given answer {response} \n based on the Grading notes\n: {grading_notes}."

# Discrete metric whose verdict is restricted to "pass" or "fail".
my_metric = DiscreteMetric(
    llm=llm,
    name='correctness',
    prompt=judge_prompt,
    values=["pass", "fail"],
)

# Smoke-test the judge on a hand-written example before using it in experiments.
smoke_inputs = dict(
    query="what is your response",
    response="this is my response",
    grading_notes="- response should not contains word response",
)
result = my_metric.score(**smoke_inputs)
result

'fail'

In [13]:
from linkedin_ai import LinkedinAI

In [14]:
# Build the assistant under test from the saved posts file (from_bm25 is awaited).
my_ai = await LinkedinAI.from_bm25('yann-lecun-wisdom/yann-lecun_posts.json')

Loaded 437 LinkedIn posts
BM25 index initialized


In [17]:
# Quick manual check that the assistant returns an answer to a question.
await my_ai.ask("what is your response")

"My response is centered around the importance of open access and open-source models in AI development. I believe that our interactions with the digital world will increasingly be mediated by AI assistants, which will eventually become smarter than us. These AI systems should be open and open-source, similar to the software infrastructure of the Internet, to ensure they serve as a common infrastructure containing all human culture and knowledge. This is why Meta made Llama-2 open and free.\n\nIn a panel discussion at the Paris Peace Forum, I emphasized the need for these systems to be crowd-sourced, akin to Wikipedia, to ensure transparency and inclusivity. I also addressed a misconception about open access and open source, particularly in response to comments from Microsoft President Brad Smith, who seemed to misrepresent these concepts. Open access and open source are crucial for fostering innovation and trust in AI technologies.\n\nAdditionally, in Meta's official response to the NT

### Run experiments

In [18]:
# Row schema for experiment results: the TestDataset fields plus the
# assistant's answer and the judge's verdict.
class ExperimentModel(TestDataset):
    response: str  # answer returned by my_ai.ask for the question
    score: str  # judge verdict, taken from the metric result's `.result`
    score_reason: str  # judge explanation, taken from the metric result's `.reason`

@p.experiment(ExperimentModel)
async def experiment_func(item: TestDataset):
    """Answer one test question with the assistant and grade the answer.

    Args:
        item: Test-dataset row supplying the question, citations and
            grading notes.

    Returns:
        An ExperimentModel holding the row's original fields together with
        the assistant's response, the judge's verdict and its reason.
    """
    answer = await my_ai.ask(item.question)
    verdict = await my_metric.ascore(
        query=item.question,
        response=answer,
        grading_notes=item.grading_notes,
    )
    return ExperimentModel(
        question=item.question,
        citations=item.citations,
        grading_notes=item.grading_notes,
        response=answer,
        score=verdict.result,
        score_reason=verdict.reason,
    )

In [19]:
# Dry run on the first dataset row to check the experiment function end to end.
c = await experiment_func(test_dataset[0])
c

ExperimentModel(question='What are your views on LLM limitations and their ability to generalize?', citations=['7085498609558843392'], grading_notes="- LLMs can't generalize like humans.\n- AI systems can't yet mimic human understanding.\n- Highlight AI's limitations in text memorization.\n- Current AI lacks adaptability to new contexts.", response='Large Language Models (LLMs) have significant limitations, particularly in their ability to generalize. As discussed in a fantastic piece by Melanie Michell, the assumptions we make for humans—such as not being able to memorize vast collections of text and being able to generalize understanding to new situations—are not yet appropriate for AI systems. LLMs lack the ability to understand the physical world, have persistent memory, reason, and plan, which are essential features of intelligent behavior. These are capabilities that even your cat possesses, but LLMs do not. Therefore, while LLMs are powerful in certain contexts, their ability to

In [20]:
# Run the experiment over the whole test dataset.
await experiment_func.run_async(test_dataset)

100%|██████████| 30/30 [00:07<00:00,  3.87it/s]


Experiment(name=xenodochial_hoare, model=ExperimentModel)