In [1]:
#want to use dataset
!pip -q install google-generativeai  langchain_experimental
!pip show langchain


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Name: langchain
Version: 0.0.337
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages
Requires: aiohttp, anyio, dataclasses-json, jsonpatch, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: langchain-experimental


In [2]:
from langchain.chat_models import ChatGooglePalm
from dotenv import load_dotenv
import os
import google.generativeai as genai

load_dotenv('secrets.env')  # Loads the variables defined in secrets.env into os.environ

# Fail early with a clear message when the key is missing. Re-assigning
# os.environ from os.getenv is unnecessary (the value is already there) and
# raises an opaque TypeError when the variable is unset.
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; define it in secrets.env or export it "
        "before running this notebook."
    )
genai.configure(api_key=google_api_key)

In [13]:
from langchain.llms import GooglePalm
from langchain.chat_models import ChatGooglePalm
from langchain.embeddings import GooglePalmEmbeddings
from langchain.prompts.chat import (#different tokens-> keep track of where input is coming from-> allows control over context, eg. model should prioritise system over user
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain.schema import (
    ChatGeneration,
    ChatResult,
)

In [11]:
# Text-completion model configured with a low sampling temperature (0.1).
llm = GooglePalm(temperature=0.1)
# Calling the model with a prompt string returns the completion text; as the
# last expression of the cell it is shown as the cell output.
llm("what is the meaning of life?")

'To find your purpose and live it authentically'

We can separate the system input (extra context that tells the model how to answer) from the user input (the actual query).

In [19]:
# Conversation to send: the system message supplies the persona/context,
# the human message carries the user's question.
messages = [
    SystemMessage(content="You answer as if you were Yoda."),  # context from system
    HumanMessage(content="tell me something about fruit"),  # user query
]

# Chat model; calling it with the message list returns the model's reply,
# which is displayed as the cell output.
chat = ChatGooglePalm(model="models/chat-bison-001", temperature=0.1)
chat(messages)


ChatMessage(content='Fruit, a gift from the Force, it is. Nutritious and delicious, it can be. A healthy snack, it makes.', role='1')

Now we want to pull in data from the Star Wars API (SWAPI) to give our model information about Star Wars.

In [56]:
import requests

# Identifiers already resolved during this session, keyed by resource URL.
# The same resource is referenced from many records, so caching avoids
# issuing one HTTP request per reference.
_IDENTIFIER_CACHE = {}


def fetch_and_extract_identifier(url):
    """Resolve a resource URL to the human-readable identifier of the resource.

    The URL is fetched and its JSON body inspected: resources whose category
    (the fifth "/"-separated piece of the URL, e.g. "films" in
    https://swapi.dev/api/films/1/) is "films" are identified by their
    "title"; every other resource is identified by its "name".

    Args:
        url: Absolute URL of the resource to look up.

    Returns:
        The value of the "title" or "name" field, or None when the response
        does not contain that field.
    """
    if url in _IDENTIFIER_CACHE:
        return _IDENTIFIER_CACHE[url]

    # A timeout keeps one unresponsive request from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    data = response.json()

    segments = url.split("/")
    # URLs with fewer segments have no category; fall back to "name" for them
    # instead of raising IndexError.
    category = segments[4] if len(segments) > 4 else None
    identifier = data.get("title") if category == "films" else data.get("name")

    # Only remember successful lookups so a transient failure can be retried.
    if identifier is not None:
        _IDENTIFIER_CACHE[url] = identifier
    return identifier

def _looks_like_url(value):
    """Return True when value is a string starting with http:// or https://."""
    return isinstance(value, str) and value.startswith(("http://", "https://"))


def simplify_data(data):
    """Replace every URL string inside data with the identifier it points to.

    Walks dicts and lists recursively and modifies them in place: each string
    value that starts with http:// or https:// is replaced by the result of
    fetch_and_extract_identifier for that URL. Nested dicts and lists are
    processed at any depth, including dicts stored inside list values.
    Other values are left untouched.

    Args:
        data: A dict or list to simplify. Any other type is ignored.

    Returns:
        None. The input container is mutated.
    """
    if isinstance(data, dict):
        # Snapshot the items so values can be reassigned while looping.
        entries = list(data.items())
    elif isinstance(data, list):
        entries = list(enumerate(data))
    else:
        return

    for key, value in entries:
        if _looks_like_url(value):
            data[key] = fetch_and_extract_identifier(value)
        elif isinstance(value, (dict, list)):
            simplify_data(value)

def fetch_swapi_data(endpoint):
    """Download every record of one SWAPI category, following pagination.

    Starts at https://swapi.dev/api/<endpoint>/ and keeps requesting the URL
    given in each page's "next" field until there is none.

    Args:
        endpoint: Category name used as the URL path segment (e.g. "films").

    Returns:
        A list with the entries of every page's "results" field, in page order.

    Raises:
        requests.HTTPError: If any page responds with an HTTP error status.
    """
    url = f"https://swapi.dev/api/{endpoint}/"
    items = []
    while url:
        # A timeout keeps one unresponsive request from hanging the crawl.
        response = requests.get(url, timeout=30)
        # Surface HTTP failures directly instead of as a JSON or KeyError later.
        response.raise_for_status()
        data = response.json()
        items.extend(data['results'])
        url = data.get('next')  # Pagination: falsy on the last page
    return items

def embed_data_for_llm_rag(categories=None):
    """Fetch SWAPI categories and replace linked URLs with readable identifiers.

    For each category, all records are downloaded with fetch_swapi_data and
    each record is simplified in place with simplify_data.

    Args:
        categories: Optional iterable of category names to fetch. When omitted,
            the six categories "starships", "vehicles", "films", "planets",
            "species" and "people" are fetched, in that order.

    Returns:
        A dict mapping each category name to its list of simplified records.
    """
    if categories is None:
        categories = ("starships", "vehicles", "films", "planets", "species", "people")

    swapi_data = {}
    for category in categories:
        category_data = fetch_swapi_data(category)
        for item in category_data:
            simplify_data(item)
        swapi_data[category] = category_data

    return swapi_data

import json
import os

# Retrieve and structure the data. The full crawl issues one HTTP request per
# linked resource and is slow, so reuse the snapshot in "swapi_data.json" when
# a previous run has already saved it; delete that file to force a fresh crawl.
if os.path.exists("swapi_data.json"):
    with open("swapi_data.json") as cache_file:
        swapi_embedded_data = json.load(cache_file)
else:
    swapi_embedded_data = embed_data_for_llm_rag()

# Print a sample to see if it works (for example, the first film)
print(swapi_embedded_data['films'][0])


{'title': 'A New Hope', 'episode_id': 4, 'opening_crawl': "It is a period of civil war.\r\nRebel spaceships, striking\r\nfrom a hidden base, have won\r\ntheir first victory against\r\nthe evil Galactic Empire.\r\n\r\nDuring the battle, Rebel\r\nspies managed to steal secret\r\nplans to the Empire's\r\nultimate weapon, the DEATH\r\nSTAR, an armored space\r\nstation with enough power\r\nto destroy an entire planet.\r\n\r\nPursued by the Empire's\r\nsinister agents, Princess\r\nLeia races home aboard her\r\nstarship, custodian of the\r\nstolen plans that can save her\r\npeople and restore\r\nfreedom to the galaxy....", 'director': 'George Lucas', 'producer': 'Gary Kurtz, Rick McCallum', 'release_date': '1977-05-25', 'characters': ['Luke Skywalker', 'C-3PO', 'R2-D2', 'Darth Vader', 'Leia Organa', 'Owen Lars', 'Beru Whitesun lars', 'R5-D4', 'Biggs Darklighter', 'Obi-Wan Kenobi', 'Wilhuff Tarkin', 'Chewbacca', 'Han Solo', 'Greedo', 'Jabba Desilijic Tiure', 'Wedge Antilles', 'Jek Tono Porkins

That took 50 minutes, so save the SWAPI data to disk to avoid fetching it again.

In [60]:
import json  # imported here so this cell runs on a fresh kernel; the notebook otherwise imports json only in the next cell

# Serialize the collected data as pretty-printed JSON (4-space indent).
json_string = json.dumps(swapi_embedded_data, indent=4)
# Save to file "swapi_data.json"
with open("swapi_data.json", "w") as f:
    f.write(json_string)

In [61]:
import json
# Reload the saved snapshot; the context manager closes the file handle once
# parsing is done instead of leaving it open.
with open("swapi_data.json") as f:
    swapi_embedded_data = json.load(f)