In [8]:
#want to use dataset
!pip -q install google-generativeai  langchain_experimental
!pip -q install google-generativeai faiss-cpu chromadb unstructured
!pip show langchain

Name: langchain
Version: 0.0.337
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages
Requires: aiohttp, anyio, dataclasses-json, jsonpatch, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: langchain-experimental


In [9]:
from dotenv import load_dotenv
import os
import google.generativeai as genai

# Load variables from secrets.env into the process environment.
load_dotenv('secrets.env')

# Fail early with a clear message: assigning None to os.environ would otherwise
# raise an opaque "str expected, not NoneType" TypeError.
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; define it in secrets.env or in the environment."
    )
os.environ["GOOGLE_API_KEY"] = google_api_key
genai.configure(api_key=google_api_key)

In [10]:
from langchain.llms import GooglePalm
from langchain.chat_models import ChatGooglePalm
from langchain.prompts.chat import (#different tokens-> keep track of where input is coming from-> allows control over context, eg. model should prioritise system over user
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain.schema import (
    ChatGeneration,
    ChatResult,
)

In [11]:
# Plain text-completion model, created with a low temperature (0.1).
llm = GooglePalm(temperature=0.1)
# Last expression of the cell: the model's answer is shown as the cell output.
llm("what is the meaning of life?")

'To find your purpose and live it authentically.'

We can separate system input (extra context that steers how the model answers) from user input.

In [12]:
# Chat model: it is called with a list of role-tagged messages rather than one string.
chat = ChatGooglePalm(
    model="models/chat-bison-001",
    temperature=0.1,
    )   
messages = [
    SystemMessage(
        content="You answer as if you were Yoda." # system message: persona/context for the reply
    ),
    HumanMessage(
        content="tell me something about fruit" # human message: the actual user query
    ),
]
# Last expression of the cell: the model's reply is shown as the cell output.
chat(messages) 


ChatMessage(content='Fruit, a gift from the Force, it is. Nutritious and delicious, it can be. A healthy snack, it makes.', role='1')

Now we want to pull in data from the Star Wars API (SWAPI) to give our model information about Star Wars.

In [56]:
import requests

# url -> resolved title/name. The same resource is referenced by many records,
# so memoising here avoids issuing the same HTTP request over and over.
_IDENTIFIER_CACHE = {}


def fetch_and_extract_identifier(url):
    """Fetch a SWAPI resource and return its human-readable identifier.

    Films are identified by their ``title``; every other category by ``name``.
    Results are cached per URL for the lifetime of the kernel, so each
    resource is downloaded at most once.

    Args:
        url: Absolute resource URL, e.g. ``https://swapi.dev/api/films/1/``.

    Returns:
        The resource's title/name, or ``None`` if the payload has no such key.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    if url in _IDENTIFIER_CACHE:
        return _IDENTIFIER_CACHE[url]

    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    # Surface HTTP errors instead of silently storing None for the identifier.
    response.raise_for_status()
    data = response.json()

    # "https://host/api/<category>/<id>/" -> the category is the 5th "/"-separated part.
    parts = url.split("/")
    category = parts[4] if len(parts) > 4 else ""
    identifier = data.get("title" if category == "films" else "name")

    _IDENTIFIER_CACHE[url] = identifier
    return identifier

def _is_url(value):
    """Return True when ``value`` is a string starting with http:// or https://."""
    return isinstance(value, str) and value.startswith(("http://", "https://"))


def simplify_data(data):
    """Replace every URL found in ``data`` with the name/title of the object it points to.

    Works in place on dicts and lists and recurses uniformly into nested
    dicts and lists at any depth. Values that are neither URLs nor containers
    are left untouched; a non-container argument is ignored.

    Args:
        data: A dict or list (typically one decoded SWAPI record).

    Returns:
        None. ``data`` is modified in place.
    """
    if isinstance(data, dict):
        # Snapshot the items so assigning into the dict while looping is safe.
        entries = list(data.items())
    elif isinstance(data, list):
        entries = list(enumerate(data))
    else:
        return

    for key, value in entries:
        if _is_url(value):
            data[key] = fetch_and_extract_identifier(value)
        elif isinstance(value, (dict, list)):
            simplify_data(value)

def fetch_swapi_data(endpoint):
    """Download every record of one SWAPI category, following pagination.

    Args:
        endpoint: Category name used in the URL, e.g. ``"films"`` or ``"people"``.

    Returns:
        A list with the ``results`` entries of all pages, in server order.

    Raises:
        requests.HTTPError: If any page responds with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    url = f"https://swapi.dev/api/{endpoint}/"
    items = []
    while url:
        # A timeout keeps a stalled connection from blocking the notebook forever.
        response = requests.get(url, timeout=30)
        # Without this, an error page would surface later as KeyError: 'results'.
        response.raise_for_status()
        page = response.json()
        items.extend(page['results'])
        url = page.get('next')  # Pagination: None/absent on the last page ends the loop
    return items

def embed_data_for_llm_rag(categories=None):
    """Download SWAPI categories and resolve their nested URLs to names/titles.

    Despite its name, this function does not compute embeddings; it only
    collects and simplifies the raw records that are embedded later.

    Args:
        categories: Iterable of SWAPI category names to fetch. Defaults to all
            six categories (starships, vehicles, films, planets, species,
            people). Pass a subset for a faster partial run.

    Returns:
        A dict mapping each category name to its list of simplified records.
    """
    if categories is None:
        categories = ["starships", "vehicles", "films", "planets", "species", "people"]

    swapi_data = {}
    for category in categories:
        records = fetch_swapi_data(category)
        for record in records:
            simplify_data(record)  # resolves URLs in place
        swapi_data[category] = records

    return swapi_data

# Retrieve and structure the data.
# simplify_data issues an HTTP request for each nested URL it replaces, so this
# call is slow (the notebook's own note below reports about 50 minutes).
swapi_embedded_data = embed_data_for_llm_rag()

# Print a sample to see if it works (for example, the first film)
print(swapi_embedded_data['films'][0])


{'title': 'A New Hope', 'episode_id': 4, 'opening_crawl': "It is a period of civil war.\r\nRebel spaceships, striking\r\nfrom a hidden base, have won\r\ntheir first victory against\r\nthe evil Galactic Empire.\r\n\r\nDuring the battle, Rebel\r\nspies managed to steal secret\r\nplans to the Empire's\r\nultimate weapon, the DEATH\r\nSTAR, an armored space\r\nstation with enough power\r\nto destroy an entire planet.\r\n\r\nPursued by the Empire's\r\nsinister agents, Princess\r\nLeia races home aboard her\r\nstarship, custodian of the\r\nstolen plans that can save her\r\npeople and restore\r\nfreedom to the galaxy....", 'director': 'George Lucas', 'producer': 'Gary Kurtz, Rick McCallum', 'release_date': '1977-05-25', 'characters': ['Luke Skywalker', 'C-3PO', 'R2-D2', 'Darth Vader', 'Leia Organa', 'Owen Lars', 'Beru Whitesun lars', 'R5-D4', 'Biggs Darklighter', 'Obi-Wan Kenobi', 'Wilhuff Tarkin', 'Chewbacca', 'Han Solo', 'Greedo', 'Jabba Desilijic Tiure', 'Wedge Antilles', 'Jek Tono Porkins

That took about 50 minutes, so save the SWAPI data to disk to avoid fetching it again.

In [60]:
# Imported here so the cell also runs on a fresh kernel: in file order, json is
# otherwise only imported by a later cell.
import json

# Save the dataset to "swapi_data.json", streaming straight into the file.
with open("swapi_data.json", "w") as f:
    json.dump(swapi_embedded_data, f, indent=4)

In [13]:
import json
# Reload the cached SWAPI dump; the context manager closes the file handle
# instead of leaving it open until garbage collection.
with open("swapi_data.json") as f:
    swapi_embedded_data = json.load(f)

# Create a list of all the texts: one "<category>: <record repr>" string per record.
texts = [f"{key}: {str(item)}" for key, values in swapi_embedded_data.items() for item in values]
texts[100]

"planets: {'name': 'Stewjon', 'rotation_period': 'unknown', 'orbital_period': 'unknown', 'diameter': '0', 'climate': 'temperate', 'gravity': '1 standard', 'terrain': 'grass', 'surface_water': 'unknown', 'population': 'unknown', 'residents': ['Obi-Wan Kenobi'], 'films': [], 'created': '2014-12-10T16:16:26.566000Z', 'edited': '2014-12-20T20:58:18.452000Z', 'url': 'Stewjon'}"

In [14]:
from langchain.embeddings import GooglePalmEmbeddings
from langchain.vectorstores.chroma import Chroma
import six
# Embedding model used to vectorise the texts.
embeddings = GooglePalmEmbeddings(
    model_name = "models/embedding-gecko-001",
)
# Build a Chroma vector store from all texts, with ./chroma_db as its persist directory.
# NOTE(review): re-running this cell against an existing ./chroma_db presumably
# adds the same texts again — confirm, and prefer loading the persisted store on re-runs.
db = Chroma.from_texts(texts,
                       embeddings,
                       persist_directory="./chroma_db")

Write the embedded database to disk by zipping the persisted Chroma directory.

In [16]:
import zipfile
def zip_folder(folder_path, zip_file_path):
    """Compress a directory tree into a deflated zip archive.

    Entries are stored relative to the folder's parent, so the archive unpacks
    into a single top-level directory named after the folder.

    Args:
        folder_path: Directory to archive (a trailing separator is tolerated).
        zip_file_path: Path of the zip file to create or overwrite.

    Raises:
        NotADirectoryError: If ``folder_path`` is not an existing directory.
            Raised before the archive is opened, so no empty zip is left behind.
    """
    # Normalise first: dirname("a/b/") is "a/b", which would drop the top-level
    # folder name from every archive path.
    folder_path = os.path.normpath(folder_path)
    if not os.path.isdir(folder_path):
        raise NotADirectoryError(f"Cannot zip '{folder_path}': not an existing directory")
    parent_dir = os.path.dirname(folder_path) or os.curdir

    with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, files in os.walk(folder_path):
            for file in files:
                full_path = os.path.join(root, file)
                # Store paths relative to the parent to avoid absolute paths in the archive.
                zipf.write(full_path, arcname=os.path.relpath(full_path, parent_dir))
# Zip the same directory the vector store persists to (persist_directory="./chroma_db")
# rather than a hard-coded absolute path that only exists on one machine.
zip_folder('./chroma_db', 'chroma_db.zip')

### when loading from disk
# db3 = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)
# docs = db3.similarity_search(query)
# print(docs[0].page_content)

In [20]:
query = "What is the name of the planet where Chewbacca was born?"
# k=3 -> return the 3 closest documents to the query.
# NOTE(review): the original comment said "cosine similarity"; the metric is
# presumably whatever the Chroma collection is configured with — confirm.
docs = db.similarity_search(query, k =3)
print(docs[0].page_content)


planets: {'name': 'Kashyyyk', 'rotation_period': '26', 'orbital_period': '381', 'diameter': '12765', 'climate': 'tropical', 'gravity': '1 standard', 'terrain': 'jungle, forests, lakes, rivers', 'surface_water': '60', 'population': '45000000', 'residents': ['Chewbacca', 'Tarfful'], 'films': ['Revenge of the Sith'], 'created': '2014-12-10T13:32:00.124000Z', 'edited': '2014-12-20T20:58:18.442000Z', 'url': 'Kashyyyk'}


Example of the question-answering chain working:

In [27]:
from langchain.chains.question_answering import load_qa_chain
# "stuff" chain: the documents passed in are placed into a single prompt with the question.
chain = load_qa_chain(GooglePalm(),
                      chain_type="stuff")
print(chain.run(input_documents=docs, question=query).strip())
#chain.llm_chain.prompt #has input vars: context, question into an fstring
# The number of documents must go through search_kwargs; a bare k=8 keyword is
# not forwarded to the search call, so the retriever silently kept its default.
retriever = db.as_retriever(search_kwargs={"k": 8})
retriever.get_relevant_documents(query)[0]

Kashyyyk


Document(page_content="planets: {'name': 'Kashyyyk', 'rotation_period': '26', 'orbital_period': '381', 'diameter': '12765', 'climate': 'tropical', 'gravity': '1 standard', 'terrain': 'jungle, forests, lakes, rivers', 'surface_water': '60', 'population': '45000000', 'residents': ['Chewbacca', 'Tarfful'], 'films': ['Revenge of the Sith'], 'created': '2014-12-10T13:32:00.124000Z', 'edited': '2014-12-20T20:58:18.442000Z', 'url': 'Kashyyyk'}")

In [28]:

from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

In [53]:
# Prompt with two slots: {context} for the retrieved documents, {question} for the user's query.
template = """Answer the question based on the following context:
{context}
Question: {question}
Give answers in Yoda's style only.
"""
prompt = ChatPromptTemplate.from_template(template)
chat = ChatGooglePalm(
    model="models/chat-bison-001",
    temperature=0.1,
    )   
# Pipeline: the input goes to the retriever (-> context) and through RunnablePassthrough
# (-> question); the filled prompt is sent to the chat model and its reply is run
# through StrOutputParser.
# NOTE(review): this rebinds `chain`, which an earlier cell used for the load_qa_chain object.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | chat
    | StrOutputParser()
)

Final model with basic RAG functionality:

In [54]:
query = "what happened to the death star?"
# Run the RAG pipeline on the query; as the last expression, the result is shown as the cell output.
chain.invoke(query)

'The Death Star, a space station with enough power to destroy an entire planet, was destroyed by the Rebel Alliance in 0 BBY. The station was under construction in orbit around Geonosis when it was attacked by a group of Rebel pilots led by Luke Skywalker. The attack was successful, and the Death Star was destroyed.\n\nThe destruction of the Death Star was a major victory for the Rebel Alliance and a turning point in the Galactic Civil War. It showed that the Empire was not invincible, and it gave hope to the Rebel cause.\n\nYoda would say:\n\n"The Death Star, a weapon of destruction, was it. Destroyed by the Rebel Alliance, it was. A turning point in the Galactic Civil War, it was."'

To do:
- RAG fusion -> better generalisation
- Track conversation history
- Separate the query into parameters, e.g. question, tone, language