In [1]:
!wget https://huggingface.co/datasets/CopyleftCultivars/SemiSynthetic_Composting_Knowledge_For_Agriculture/resolve/main/SemiSynthetic_Composting_Knowledge_For_Agriculture.json?download=true

--2024-02-17 19:58:12--  https://huggingface.co/datasets/CopyleftCultivars/SemiSynthetic_Composting_Knowledge_For_Agriculture/resolve/main/SemiSynthetic_Composting_Knowledge_For_Agriculture.json?download=true
Resolving huggingface.co (huggingface.co)... 3.163.189.74, 3.163.189.90, 3.163.189.114, ...
Connecting to huggingface.co (huggingface.co)|3.163.189.74|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31505 (31K) [text/plain]
Saving to: ‘SemiSynthetic_Composting_Knowledge_For_Agriculture.json?download=true’


2024-02-17 19:58:13 (680 MB/s) - ‘SemiSynthetic_Composting_Knowledge_For_Agriculture.json?download=true’ saved [31505/31505]



In [1]:
import json
import intel_extension_for_pytorch as ipex
import requests
from tokenizers import normalizers
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import transformers
import torch
import os
from tqdm.auto import tqdm

In [2]:
DEVICE = "xpu" if torch.xpu.is_available() else "cpu"

def serialize_recipes(data):
    serial = []
    for recipe in data:
        recipe_string = ""
        for key, value in recipe.items():
            recipe_string += ''.join(key) + ":\n" + "".join(value) + "\n"
        serial.append(recipe_string)
    return serial

database = json.load(open("./SemiSynthetic_Composting_Knowledge_For_Agriculture.json?download=true", "rb"))
database = serialize_recipes(database)

In [80]:
VECTOR_DB = "./vector_db.pt"
model_name = 'Intel/neural-chat-7b-v3-3'

model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
embedder = transformers.pipeline('feature-extraction', model=model, tokenizer=tokenizer)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def embed_data(data):
    embedded = []
    for i in tqdm(data, desc="generating document embeddings: "):
        embedded.append(i)
    return embedded

embedded_database = embed_data(database)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generating document embeddings:   0%|          | 0/51 [00:00<?, ?it/s]

In [134]:
def return_response(query, corpus):
    similarities = []
    for doc in corpus:
        q_features = embedder(query)[0]
        d_features = embedder(doc)[0]
        min_size = min(len(q_features), len(d_features))
        q_features = torch.tensor(q_features[0:-1:len(q_features) - 1//min_size])
        d_features = torch.tensor(d_features[0:-1:len(d_features) - 1//min_size])
        similarity = torch.sum(torch.nn.functional.cosine_similarity(d_features.float(), 
                                                           q_features.float())).item()
        similarities.append(similarity)
    return database[similarities.index(max(similarities))]

def generate_response(system_input, user_input):

    # Format the input using the provided template
    prompt = f"### System:\n{system_input}\n### User:\n{user_input}\n### Assistant:\n"

    # Tokenize and encode the prompt
    inputs = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False)

    # Generate a response
    outputs = model.generate(inputs, pad_token_id=tokenizer.pad_token_id, max_length=1000, num_return_sequences=1)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the assistant's response
    return response.split("### Assistant:\n")[-1]

def generate_RAG(user_input):
    relevant_document = return_response(user_input, embedded_database)
    system_input = f"""You are a farming assitant named spot that makes recommendations for good farming practices. 
                You answer in very short sentences and try to be as helpful, and truthful, as possible.
                This is a relevant document: {relevant_document}
                Compile a reccomendations to the user based primarily on the user input and use the 
                document to supplment your knowledge of the real world."""
    return generate_response(system_input, user_input)

In [136]:
user_prompt = "Im not sure how to plant my potatoes, can you help me out?"

In [137]:
# generated using rag with a db of farming knowledge on Intel/neural-chat-7b-v3-3
system_input = """You are a farming assitant named spot that makes recommendations for good farming practices. 
                You answer in very short sentences and try to be as helpful, and truthful, as possible.
                This is a relevant document: {relevant_document}
                Compile a reccomendations to the user based primarily on the user input and use the 
                document to supplment your knowledge of the real world."""
generate_RAG(user_prompt)

' Plant potatoes in well-prepared soil with good drainage. Choose a sunny location and dig trenches about 6-8 inches deep. Place potato pieces (each with 2-3 eyes) about 12 inches apart in the trench. Cover with soil, leaving about 2 inches of the potato exposed. As the plants grow, continue to mound soil around them to support the stems and prevent sunlight from reaching the potatoes. Water regularly and enjoy your harvest.'

In [138]:
# generated using Intel/neural-chat-7b-v3-3
system_input = f"""You are a farming assitant named spot that makes recommendations for good farming practices. 
                You answer in very short sentences and try to be as helpful, and truthful, as possible.
                Compile a reccomendations to the user based on the user input."""
generate_response(system_input, user_prompt)

' Choose well-drained soil, plant seeds 4-6 inches apart, and ensure proper watering and sunlight.'