In [None]:
# Fetch set list pages from the SetList API and save each page to a JSON file
%reload_ext autoreload
%autoreload 2

from setlist_app import SetListApp

app = SetListApp("tmG3-KHsciHD1mS5y58b1FIv5NMPccWTKN8E")

for iter in range(10, 11): 
    r = await app.get_setList(artistName="Taylor Swift", p=iter)
    data = await r.text()
    fileName = "./data/get_data" + str(iter) + ".json"
    with open(fileName, "w") as f:
        # Write the JSON data to the file
        f.write(data)
#print(f"{await r.text()}")


In [None]:
# Create document loader(s)

from langchain.document_loaders import JSONLoader
from llama_index import Document
import datetime, re
from pprint import pprint 

import unicodedata
def remove_nonascii(s: str):
    """Return ``s`` reduced to plain ASCII.

    The text is NFKD-normalized first, which decomposes accented and
    compatibility characters (for example an accented letter becomes a base
    letter plus a combining mark). Every code point that is still not ASCII
    after that step is dropped.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    ascii_bytes = decomposed.encode("ascii", errors="ignore")
    return ascii_bytes.decode("ascii")

def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the descriptive fields of one set list record into ``metadata``.

    Args:
        record: One set list entry. ``eventDate`` is required. The ``venue``,
            ``venue.city``, ``tour``, ``artist`` and ``sets`` objects, and the
            ``name`` inside each of them, may be missing or null; such fields
            are stored as an empty string instead of raising.
        metadata: Dict supplied by the caller; it is updated in place.

    Returns:
        The same ``metadata`` dict, with the keys ``event date``,
        ``venue name``, ``city``, ``tour name``, ``artist name`` and
        ``songs played`` (song names joined with ", ") filled in. All values
        are passed through ``remove_nonascii``.

    Raises:
        KeyError: if ``record`` has no ``eventDate``.
    """
    def ascii_name(node) -> str:
        # A missing object, a missing "name" and a null "name" all become "".
        name = (node or {}).get("name")
        return remove_nonascii(name) if name else ""

    venue = record.get("venue") or {}

    metadata["event date"] = remove_nonascii(record["eventDate"])
    metadata["venue name"] = ascii_name(venue)
    metadata["city"] = ascii_name(venue.get("city"))
    metadata["tour name"] = ascii_name(record.get("tour"))
    metadata["artist name"] = ascii_name(record.get("artist"))

    # Flatten every set of the concert into one list of song names, skipping
    # songs without a usable name and sets without a "song" list.
    songs = [
        remove_nonascii(song["name"])
        for one_set in (record.get("sets") or {}).get("set") or []
        for song in one_set.get("song") or []
        if song.get("name")
    ]

    metadata["songs played"] = ", ".join(songs)
    return metadata

# Loader settings, kept together so they are easy to tweak:
#   file_path     - the saved API response page to read
#   jq_schema     - jq expression; '.setlist[]' yields each entry of "setlist"
#   content_key   - the entry's 'sets' value is used as the document content
#   text_content  - False because that value is not a plain string
#   metadata_func - fills the per-document metadata from the entry
loader_options = dict(
    file_path='./data/get_data1.json',
    jq_schema='.setlist[]',
    content_key='sets',
    text_content=False,
    metadata_func=metadata_func,
)
loader = JSONLoader(**loader_options)
lcDocuments = loader.load()
#pprint(lcDocuments)

# Sentence that replaces each document's text; the placeholders are filled
# from the metadata collected by metadata_func.
template = (
    "On {date}, the artist {artist} played in {city} at the venue {venue} "
    "to support the tour '{tour}'. "
    "The artist played the following songs during the concert:  {setlist}"
)

lDocs = []
for doc in lcDocuments:
    meta = doc.metadata
    d = Document.from_langchain_format(doc)

    # "event date" is parsed as day-month-year and rendered like "April 28, 2023".
    dArray = meta["event date"].split('-')
    event_day = datetime.date(year=int(dArray[2]), month=int(dArray[1]), day=int(dArray[0]))
    formattedDate = event_day.strftime('%B %d, %Y')

    d.text = template.format(
        date=formattedDate,
        artist=meta["artist name"],
        city=meta["city"],
        venue=meta["venue name"],
        tour=meta["tour name"],
        setlist=meta["songs played"],
    )
    lDocs.append(d)

pprint(lDocs)

In [None]:
# Create index using loader
%reload_ext autoreload
%autoreload 2

import logging, sys

from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.embeddings.huggingface import HuggingFaceInstructEmbeddings

from llama_index.node_parser import SimpleNodeParser
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index import (GPTVectorStoreIndex,LLMPredictor,ServiceContext)

# Send every log record (DEBUG and above) to stdout so it shows in the cell output.
# basicConfig already attaches a stdout handler to the root logger, so no
# second StreamHandler is added: with both, each record was printed twice.
# force=True replaces any handlers left by an earlier run, which keeps
# re-running this cell from stacking up duplicates.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)

# Embedding model: a HuggingFace sentence-transformer run on CPU and wrapped
# for use by llama_index.
hf_embeddings = HuggingFaceInstructEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={'device': 'cpu'},
)
embed_model = LangchainEmbedding(hf_embeddings)

# Local GPT4All-J model used as the LLM of the service context.
model_path = '../models/ggml-gpt4all-j-v1.3-groovy.bin'
llm = GPT4All(
    model=model_path,
    backend='gptj',
    callbacks=[StreamingStdOutCallbackHandler()],
    n_batch=64,
    streaming=False,
    n_ctx=64,
    n_threads=8,
    verbose=True,
)
gpt4all_lm_predictor = LLMPredictor(llm=llm)

# Documents are split into 512-token chunks with a 32-token overlap.
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=32)
service_context = ServiceContext.from_defaults(
    llm_predictor=gpt4all_lm_predictor,
    embed_model=embed_model,
    node_parser=SimpleNodeParser(text_splitter=text_splitter),
)

# Build the vector index from the prepared documents and save it to disk.
index = GPTVectorStoreIndex.from_documents(lDocs, service_context=service_context)
# No persist_dir is passed, so the library's default directory is used.
# NOTE(review): presumably the "./storage" folder that the query cell loads - confirm.
index.storage_context.persist()

#query_engine = index.as_query_engine(streaming=True, similarity_top_k=1, service_context=service_context)
#response_stream = query_engine.query("How many times was 'Cruel Summer' played in a set list?")
#response_stream.print_response_stream()

In [None]:
#use cached index to run queries
%reload_ext autoreload
%autoreload 2

from langchain.agents import initialize_agent, Tool
from langchain.llms import GPT4All
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain import PromptTemplate
from langchain.embeddings.huggingface import HuggingFaceInstructEmbeddings
import logging, sys

from llama_index.node_parser import SimpleNodeParser
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from langchain.cache import SQLiteCache
from llama_index import (
    GPTVectorStoreIndex,
    load_index_from_storage,
    LLMPredictor,
    PromptHelper,
    StorageContext,
    ServiceContext
)

def _ask_setlist_index(q):
    """Forward the question to ``query_engine``.

    ``query_engine`` is looked up when the tool is called, so it may be
    created after this definition.
    """
    return query_engine.query(q)


# Tool list for a LangChain agent; it holds a single tool backed by the index.
tools = [
    Tool(
        name="Swifty chat bot",
        func=_ask_setlist_index,
        description="Useful when you want answer questions about the set list Documents.",
    )
]

# Send every log record (DEBUG and above) to stdout so it shows in the cell output.
# basicConfig already attaches a stdout handler to the root logger, so no
# second StreamHandler is added: with both, each record was printed twice.
# force=True replaces any handlers left by an earlier run, which keeps
# re-running this cell from stacking up duplicates.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)

# Prompt with one placeholder, {question}; the answer line nudges the model
# to reason step by step.
template = (
    "Question: {question}\n"
    "\n"
    "Answer: Let's think step by step."
)

prompt = PromptTemplate(input_variables=["question"], template=template)
# Embedding model: a HuggingFace sentence-transformer run on CPU and wrapped
# for use by llama_index.
hf_embeddings = HuggingFaceInstructEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={'device': 'cpu'},
)
embed_model = LangchainEmbedding(hf_embeddings)

# Local GPT4All-J model; the predictor is given a SQLite-backed cache stored
# in ".langchain.db".
model_path = '../models/ggml-gpt4all-j-v1.3-groovy.bin'
response_cache = SQLiteCache(database_path=".langchain.db")
gpt4all_llm = GPT4All(
    model=model_path,
    backend='gptj',
    temp=0.8,
    n_batch=24,
    callbacks=[StreamingStdOutCallbackHandler()],
    streaming=False,
    n_ctx=1024,
    n_threads=4,
    verbose=False,
)
gpt4all_lm_predictor = LLMPredictor(cache=response_cache, llm=gpt4all_llm)

# Chunking settings: 512-token chunks with a 32-token overlap.
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=32)
service_context = ServiceContext.from_defaults(
    llm_predictor=gpt4all_lm_predictor,
    embed_model=embed_model,
    node_parser=SimpleNodeParser(text_splitter=text_splitter),
)

# Load the index persisted under ./storage instead of rebuilding it.
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context, service_context=service_context)

# Each query retrieves the two most similar chunks (similarity_top_k=2).
query_engine = index.as_query_engine(
    streaming=False,
    similarity_top_k=2,
    service_context=service_context,
    verbose=False,
)
#print(query_engine.query("Where did Taylor Swift play most recently?"))
answer = query_engine.query("What songs did Taylor Swift play in Atlanta?")
print(answer)
# r.print_response_stream()


#
#agent_chain = initialize_agent(
#    tools, 
#    llm, 
#    agent="zero-shot-react-description", 
#    memory=ConversationBufferMemory(memory_key="chat_history")
#)

#retriever = index.as_retriever(service_context=service_context)
#query_engine = RetrieverQueryEngine.from_args(retriever, response_mode='no_text')  
#response = query_engine.query("Give me a list of all the good and the bad things in the text.")