In [26]:
# !pip install peft InstructorEmbedding

In [1]:
import os
import torch
import nest_asyncio
from dotenv import load_dotenv
from typing import Any, List, Mapping, Optional

from tqdm import tqdm

from transformers import pipeline, TextStreamer, LlamaTokenizer, LlamaForCausalLM
from peft import PeftModel

from llama_index import download_loader, SummaryPrompt, LLMPredictor, GithubRepositoryReader, GPTVectorStoreIndex, GPTTreeIndex, GPTListIndex, PromptHelper, SimpleDirectoryReader, load_index_from_storage, StorageContext, ServiceContext, LangchainEmbedding
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index.node_parser import SimpleNodeParser, NodeParser

from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings, HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.llms.base import LLM
from langchain.text_splitter import CharacterTextSplitter

load_dotenv()
nest_asyncio.apply()

  warn(


CUDA SETUP: CUDA path found: /usr/local/cuda/lib64/libcudart.so
CUDA_SETUP: Detected CUDA version 118
CUDA_SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


In [2]:
# define prompt helper
# set maximum input size
max_input_size = 1024
# set number of output tokens
num_output = 256
# set maximum chunk overlap
max_chunk_overlap = 20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

## Define LLM

In [3]:
api = "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"

tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
streamer = TextStreamer(tokenizer, skip_prompt=True, Timeout=5)

import huggingface_hub as hf_hub

hf_hub.login(token=api)

## loading llama base model and configuring it with adapter

base_model_name = 'decapoda-research/llama-7b-hf'

base_model = LlamaForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        )

model = PeftModel.from_pretrained(
            base_model,
            'shrinath-suresh/alpaca-lora-7b-answer-summary',
#             'shrinath-suresh/alpaca-lora-all-7b-delta',
            torch_dtype=torch.float16,
            load_in_8bit=True
        )
class CustomLLM(LLM):
    model_name = 'shrinath-suresh/alpaca-lora-7b-answer-summary'
    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        inputs = tokenizer([prompt], return_tensors="pt")

        # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
        response = model.generate(**inputs, streamer=streamer, top_p=0.75, max_new_tokens=num_output)
        response = tokenizer.decode(response[0])
        return response

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"name_of_model": self.model_name}

    @property
    def _llm_type(self) -> str:
        return "custom"

llm_predictor = LLMPredictor(llm=CustomLLM())

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /home/ubuntu/.cache/huggingface/token
Login successful


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

## Define Embedding model

In [4]:
# embed_model = LangchainEmbedding(HuggingFaceEmbeddings())
embed_model = LangchainEmbedding(HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl"))

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: hkunlp/instructor-xl


load INSTRUCTOR_Transformer


INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda


max_seq_length  512


In [5]:
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper, embed_model=embed_model)

## Text splitter

In [6]:
text_splitter = TokenTextSplitter(chunk_size=1024, chunk_overlap=200)
parser = SimpleNodeParser(text_splitter=text_splitter)

## Read Docs

In [10]:
# PyTorch Docs
input_files = []
for path, subdirs, files in os.walk('/home/ubuntu/text'):
    for name in files:
        file = os.path.join(path, name)
        input_files.append(file)
docs = SimpleDirectoryReader(input_dir="/home/ubuntu/text", recursive=True, file_extractor={".txt": MarkdownReader()}, file_metadata=set_metadata).load_data()
nodes = parser.get_nodes_from_documents(docs)

In [11]:
index = GPTVectorStoreIndex.from_documents(nodes, service_context=service_context)

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 6043 tokens


In [12]:
index.storage_context.persist('./pytorch_docs_1024')

In [13]:
query_engine = index.as_query_engine(service_context=service_context)

In [16]:
queries = ["How to load a model from torch hub?"]

In [17]:
for query in queries:
    response = query_engine.query(query)
    print(response)

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 9 tokens



The answer is:

```
import torch
import torch.hub

# Load a model from torch hub
model = torch.hub.load(url='https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')

# Run the model
model(torch.randn(1, 3, 224, 224))
```



### Example

```
import torch
import torch.hub

# Load a model from torch hub
model = torch.hub.load(url='https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')

# Run the model
model(torch.randn(1, 3, 224, 224))
```



### Output

```
Loaded model from https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1251 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


 ⁇  Context information is below. 
---------------------
*Any*]

   -[ Example ]-

   >>> state_dict = torch.hub.load_state_dict_from_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')


Running a loaded model:
-----------------------

Note that "*args" and "**kwargs" in "torch.hub.load()" are used to
**instantiate** a model. After you have loaded a model, how can you
find out what you can do with the model? A suggested workflow is

* "dir(model)" to see all available methods of the model.

* "help(model.foo)" to check what arguments "model.foo" takes to run

To help users explore without referring to documentation back and
forth, we strongly recommend repo owners make function help messages
clear and succinct. It's also helpful to include a minimal working
example.


Where are my downloaded models saved?
-------------------------------------

The locations are used in the order

model.load_state_dict(torch.hub.load_state_dict_from_url(checkpoint, progress=False))


I