In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader

In [3]:

# loader = TextLoader('single_text_file.txt')
loader = DirectoryLoader('./books', glob="./*.pdf", loader_cls=PyPDFLoader)

documents = loader.load()

In [4]:
len(documents)

1338

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 2000, chunk_overlap = 200)
docs = text_splitter.split_documents(documents)

len(docs)

2289

In [8]:
docs[2200]

Document(page_content='(2008): 740–53.\nsettle our differences\n: Barbara Mellers, Ralph Hertwig, and Daniel\nKahneman, “Do Frequency Representations Eliminate Conjunction\nEffects? An Exercise in Adversarial Collaboration,” \nPsychological\nScience\n 12 (2001): 269–75.\n16: Causes Trump Statistics\ncorrect answer is 41%\n: Applying Bayes’s rule in odds form, the prior odds\nare the odds for the Blue cab from the base rate, and the likelihood ratio is\nthe ratio of the probability of the witness saying the cab is Blue if it is Blue,\ndivided by the probability of the witness saying the cab is Blue if it is\nGreen: posterior odds = (.15/.85) × (.80/.20) = .706. The odds are the ratio\nof the probability that the cab is Blue, divided by the probability that the cab\nis Green. To obtain the probability that the cab is Blue, we compute:\nProbability (Blue) = .706/1. 706 = .41. The probability that the cab is Blue\nis 41%.\nnot too far from the Bayesian\n: Amos Tversky and Daniel Kahneman,\

In [9]:
#pip install bitsandbytes accelerate transformers
#pip install -U FlagEmbedding
from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en"

embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
)


  from .autonotebook import tqdm as notebook_tqdm
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [10]:
check = embeddings.embed_query("how are you?")
print(len(check))

768


Initializing Pinecone


In [11]:
#pip install pinecone-client
import pinecone

In [13]:
# PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
# PINECONE_ENV = os.environ.get('PINECONE_ENV')
PINECONE_API_KEY = "29215c49-3fa5-488d-86fb-64cbc6c98a8d"
PINECONE_ENV = "us-west4-gcp-free"

In [14]:
pinecone.init(
    api_key=PINECONE_API_KEY,  
    environment=PINECONE_ENV
)

In [12]:
index_name = "source-embed"

In [17]:
from langchain.vectorstores import Pinecone

In [None]:
text_embed = Pinecone.from_texts ([t.page_content for t in docs], embeddings, index_name = index_name)

In [18]:
#If you already have an index, you can load it like this

#text_embed = Pinecone.from_existing_index(index_name, embeddings)

In [19]:
query = "who is woodell?"
query_similar = text_embed.similarity_search(query,k= 4)
query_similar

[Document(page_content='would\talways\thave\ta\tbig\tbelly\tlaugh\tabout\tthe\twhole\tdebacle.\tMost\tnights\nwe’d\twind\tup\tat\tsome\tdive\tbar,\tgiddy,\talmost\tdelirious.\tBefore\tparting\twe’d\noften\tplay\ta\tgame.\tI’d\tbring\tout\ta\tstopwatch\tand\twe’d\tsee\thow\tfast\tWoodell\ncould\tfold\tup\this\twheelchair\tand\tget\tit\tand\thimself\tinto\this\tcar.\tAs\ta\tformer', metadata={}),
 Document(page_content='track\tstar,\the\tloved\tthe\tchallenge\tof\ta\tstopwatch,\tof\ttrying\tto\tbeat\this\npersonal\tbest.\t(His\trecord\twas\tforty-four\tseconds.)\tWe\tboth\tcherished\tthose\nnights,\tthe\tsilliness,\tthe\tsense\tof\tshared\tmission,\tand\twe\tmutually\tranked\nthem\tamong\tthe\tsolid\tgold\tmemories\tof\tour\tyoung\tlives.\nWoodell\tand\tI\twere\tvery\tdifferent,\tand\tyet\tour\tfriendship\twas\tbased\ton\ta\nselfsame\tapproach\tto\twork.\tEach\tof\tus\tfound\tpleasure,\twhenever\tpossible,\tin\nfocusing\ton\tone\tsmall\ttask.\tOne\ttask,\twe\toften\tsaid,\tclears\tthe\tm

In [20]:
from decouple import config
import together
from langchain import PromptTemplate, LLMChain
from typing import Any, Dict, List, Mapping, Optional
#pydantic model version>2 will not work with langchain. you should install a model version less than 2
from pydantic import Extra, Field, root_validator

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from langchain.utils import get_from_dict_or_env
import re

together.api_key = config("TOGETHER_API_KEY")

together.Models.start("togethercomputer/llama-2-70b-chat")

class TogetherLLM(LLM):
    """Together large language models."""

    model = "togethercomputer/llama-2-70b-chat"
    """model endpoint to use"""

    together_api_key = config("TOGETHER_API_KEY")
    """Together API key"""

    temperature = 0.7
    """What sampling temperature to use."""

    max_tokens = 512
    """The maximum number of tokens to generate in the completion."""

    class Config:
        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the API key is set."""
        api_key = get_from_dict_or_env(
            values, "together_api_key", "TOGETHER_API_KEY"
        )
        values["together_api_key"] = api_key
        return values

    @property
    def _llm_type(self):
        """Return type of LLM."""
        return "together"

    def _call(
        self,
        prompt,
        **kwargs: Any,
    ):
        """Call to Together endpoint."""
        together.api_key = self.together_api_key
        output = together.Complete.create(prompt,
                                          model=self.model,
                                          max_tokens=self.max_tokens,
                                          temperature=self.temperature,
                                          )
        text = output['output']['choices'][0]['text']
        # Use regex substitution to remove newlines
        #cleaned_text = re.sub(r"\n", "", text)
        return text

tog_llm = TogetherLLM(
    model= "togethercomputer/llama-2-70b-chat",
    temperature=0.1,
    max_tokens=1024)

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

In [21]:

def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

In [22]:
sys_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. """
instruction = """CONTEXT:/n/n {context}/n
Question: {question}"""
get_prompt(instruction, sys_prompt)

"[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. \n<</SYS>>\n\nCONTEXT:/n/n {context}/n\nQuestion: {question}[/INST]"

In [24]:
from langchain.prompts import PromptTemplate
my_template = get_prompt(instruction, sys_prompt)

llama_prompt = PromptTemplate(
    template=my_template, input_variables=["context", "question"])

In [25]:
chain_type_kwargs = {"prompt":llama_prompt}

In [26]:
from langchain.schema import prompt
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=tog_llm,
                                       chain_type="stuff",
                                       retriever=text_embed.as_retriever(search_kwargs={"k": 5}),
                                       chain_type_kwargs=chain_type_kwargs,
                                       return_source_documents=True)


In [27]:
## text clean up

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(lines)

    return wrapped_text

def process_llm_response(llm_response):
    print(wrap_text_preserve_newlines(llm_response['result']))

In [29]:
query = "Who is woodell?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Woodell is a friend of the narrator's who is a paraplegic. He is a former track star who was injured in a car accident and is now confined to a wheelchair. Despite his disability, Woodell is a positive and energetic person who is passionate about his work and enjoys spending time with the narrator and their mutual friends.


In [30]:
query = "In which book the author discussed about woodell?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The author discussed Woodell in the book "Shoe Dog" by Phil Knight.


In [31]:
query = "Whose writing style and narrative voice did you find the most compelling? Who structured their book in the most engaging way?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Based on the provided context, it seems that Malcolm Gladwell's writing style and narrative voice have been widely praised for their engaging and thought-provoking qualities. Many of the reviewers mentioned that Gladwell's use of anecdotes and interviews to illustrate his theories on human potential and success is particularly effective in capturing the reader's attention and making complex ideas more accessible.

In terms of structuring the book in the most engaging way, it's worth noting that Gladwell's books, including "Outliers," are often praised for their clear and direct prose, which makes the reading experience both enjoyable and thought-provoking. The book's structure, which weaves together anecdotes, interviews, and scientific research, has been described as "masterful" and "impeccable."

Therefore, based on the provided context, it seems that Malcolm Gladwell's writing style and narrative voice are the most compelling, and his book "Outliers" is structured in the most engag

In [32]:
query = "What anecdotes among the 4 books revealed the most intriguing habits or mindsets of great leaders? Which were most reflective of their leadership qualities?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The anecdotes in Malcolm Gladwell's books reveal intriguing habits and mindsets of great leaders. In "Outliers," the story of the Beatles' rise to fame highlights the importance of dedication and practice in achieving success. The anecdote about the Beatles' performance at the Cavern Club, where they honed their craft and developed their unique sound, demonstrates the power of deliberate practice in achieving mastery. This anecdote reflects the leadership quality of persistence and commitment to excellence.

Another anecdote in "Outliers" that reveals a key habit of successful leaders is the story of the Rothschild family's success in banking. The anecdote highlights the importance of taking calculated risks and being adaptable in response to changing circumstances. This anecdote reflects the leadership quality of being proactive and taking calculated risks to achieve success.

In "Blink," the anecdote about the psychologist Daniel Kahneman's work on cognitive biases highlights the im