In [38]:
import torch

In [1]:
import json

In [2]:
# ML imports
from llama_cpp import Llama
# from llama_index.llms import LlamaCPP

from llama_index import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
    Document,
)

from llama_index import StorageContext, load_index_from_storage
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

from llama_index import set_global_tokenizer
from llama_index.embeddings import HuggingFaceEmbedding
from transformers import AutoTokenizer
from llama_index import Prompt

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
set_global_tokenizer(
    AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf").encode
)

In [4]:
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [5]:
llm = Llama(model_path="/Users/connorparish/external_code/llama.cpp/models/llama-2-13b.Q5_K_M.gguf", 
            chat_format="llama-2", n_ctx=4096)

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /Users/connorparish/external_code/llama.cpp/models/llama-2-13b.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q5_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q5_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q5_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q

In [6]:
# llm = LlamaCPP(model_path="/Users/connorparish/external_code/llama.cpp/models/llama-2-13b.Q5_K_M.gguf", 
#             model_kwargs={"n_gpu_layers": 1},
#             temperature=0.1,
#             max_new_tokens=256,
#             # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
#             context_window=4096,
#             # kwargs to pass to __call__()
#             generate_kwargs={},
#             messages_to_prompt=messages_to_prompt,
#             completion_to_prompt=completion_to_prompt,
#             verbose=True,)

In [7]:
# response = llm.complete("Who is George Clooney?")
# print(response.text)


# Create index from text messages

In [9]:
with open('../data/Connor_phone_SMS_20230923121617.json', 'r') as infile:
    thread_to_texts = json.load(infile)

In [10]:
# Convert threads to easily readible conversations strs
def thread_messages_to_str(thread_list):
    conv_str = ""
    for message in thread_list:
        conv_str += f"{message['from']}: {message['Content']}\n"
    return conv_str

In [10]:
# Convert threads to documents
thread_documents = list()
for threadid, messages in thread_to_texts.items():
    from_list = list({m['from'] for m in messages})
    conv_str = thread_messages_to_str(messages)
    thread_documents.append(Document(text=conv_str, doc_id=threadid, 
                                     metadata={'threadid' : threadid, 'thread_members' : from_list},
                                     excluded_llm_metadata_keys=['doc_id', 'threadid', 'thread_members']))

In [11]:
# create a service context
# service_context = ServiceContext.from_defaults(
#     embed_model=embed_model,
#     llm=None,
# )

# service_context = ServiceContext.from_defaults(
#     embed_model=embed_model,
#     llm=llm,
# )

LLM is explicitly disabled. Using MockLLM.


In [12]:
# index = VectorStoreIndex.from_documents(
#     thread_documents, service_context=service_context,
#     show_progress=True,
# )

In [11]:
# Load saved index
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=None)
context = StorageContext.from_defaults(persist_dir="../data/thread_docs_index")
stored_index = load_index_from_storage(context, service_context=service_context)

LLM is explicitly disabled. Using MockLLM.


# Set up prompting

In [12]:
retriever = stored_index.as_retriever(similarity_top_k=10)

In [64]:
prompt_generating_prompt = (
    """Write a prmopt to feed an LLM bot that is tasked with extrating information from a long text conversation.
    The conversation is longer than the context window, so the bot will need to decide which information to extract
    from each chunk of conversation to feed into the next. Once all chunks have been read the both will be tasked
    with answering the question. The question is: What is Greg Prentiss personality type?
    Prompt: """
)

In [None]:
query = "What is Greg Prentiss' personality type?"

In [70]:
conversation_extration_prompt = (
    """You are a helpful bot that is tasked with extrating information from a long text conversation.\n
    Conversation:\n
    "{context_str}"
    Based off of the conversation provided above what information is useful for answering the 
    question: {query_str}\n
    Answer:"""
)

In [73]:
# Define a custom prompt
# template = (
#     "You are a helpful bot tasked with answering questions based on a question between friends provided below\n"
#     "---------------------\n"
#     "{context_str}"
#     "\n---------------------\n"
#     "Given this information, please answer the question: {query_str}\n"
#     "Answer: "
# )
# template = (
#     "Context information is below.\n"
#     "---------------------\n"
#     "{context_str}\n"
#     "---------------------\n"
#     "Given the context information and not prior knowledge, "
#     "answer the query.\n"
#     "Query: {query_str}\n"
#     "Answer: "
# )
qa_template = Prompt(conversation_extration_prompt)
max_prompt_word_len = 2100

query = "What is Greg Prentiss' personality type?"

contexts = retriever.retrieve(query)
context_list = [context.get_content() for context in contexts]
context_str = ""
context_str_words = 0
i = 0
while i < len(context_list) and context_str_words + len(context_list[i].split()) < max_prompt_word_len:
    context_str += "\n-start of conversation-\n"
    context_str += context_list[i]
    context_str_words = len(context_str.split())
    i += 1
    
# Use the custom prompt when querying
prompt = qa_template.format(context_str=context_str, query_str=query)
print(prompt)

You are a helpful bot that is tasked with extrating information from a long text conversation.

    Conversation:

    "when u get money u get introduce to another tier of women 
Greg Prentiss: so it’s hard
Jason Anderson: laughed at “but the. when u get money u get introduce to another tier of women ”
Jason Anderson: hahahah
Greg Prentiss: it’s so true tho
Jason Anderson: money vs women it’s like an proportion equation
Greg Prentiss: emphasized “money vs women it’s like an proportion equation”
Greg Prentiss: the best is if you get rich then find an equally rich women so money isn’t in the equation
Greg Prentiss: the. ur double rich and double hot 
Jason Anderson: or you’re just already so hot that ppl don’t care about money bc they wanna be w you anyways lol
Greg Prentiss: okay true true 
Greg Prentiss: i think love is 50/50 like physical versus personality 
Greg Prentiss: like u can’t be in love with someone u literally do not find attractive 
Jason Anderson: i think attraction is mo

In [74]:
response = llm(prompt)

Llama.generate: prefix-match hit

llama_print_timings:        load time =   19058.58 ms
llama_print_timings:      sample time =       2.09 ms /    24 runs   (    0.09 ms per token, 11477.76 tokens per second)
llama_print_timings: prompt eval time =  131009.09 ms /  3804 tokens (   34.44 ms per token,    29.04 tokens per second)
llama_print_timings:        eval time =    2761.58 ms /    23 runs   (  120.07 ms per token,     8.33 tokens per second)
llama_print_timings:       total time =  133807.86 ms


In [75]:
response

{'id': 'cmpl-3d734f35-8c89-47fe-8090-257c1d9e6edc',
 'object': 'text_completion',
 'created': 1705789801,
 'model': '/Users/connorparish/external_code/llama.cpp/models/llama-2-13b.Q5_K_M.gguf',
 'choices': [{'text': ' \n    \n    **I would suggest an INTJ, or at least a very high N.**\n',
   'index': 0,
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 3805,
  'completion_tokens': 23,
  'total_tokens': 3828}}