In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
import faiss
import torch
import numpy as np
import requests


In [2]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
    # llm_int8_has_fp16_weight=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16"
)

In [3]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-7B-Instruct", device_map="cuda", quantization_config=quantization_config)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]

In [5]:
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

In [6]:
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

In [7]:
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)

In [8]:
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [9]:
print(response)

Sure! A large language model (LLM) is a type of artificial intelligence system designed to understand and generate human-like text. These models are typically based on deep learning techniques, particularly transformer architectures, which allow them to process and understand vast amounts of textual data.

LLMs are trained on massive datasets containing a wide variety of text from the internet, books, articles, and more. This extensive training enables them to learn complex patterns in language, allowing them to perform various natural language processing tasks such as:

- **Text generation**: Creating coherent paragraphs or conversations.
- **Translation**: Translating text from one language to another.
- **Summarization**: Generating concise summaries of longer texts.
- **Question answering**: Providing answers to questions based on given information.
- **Text classification**: Categorizing text into predefined categories.

Some well-known examples of LLMs include models like GPT (Ge

In [3]:
import numpy as np
import faiss
import os
from getpass import getpass

In [4]:
response = requests.get('https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt')
text = response.text

In [5]:
chunk_size = 2048
chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
len(chunks)

37

In [8]:
def get_text_embedding(input_text):
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    last_hidden_state = outputs.last_hidden_state
    
    embedding = last_hidden_state.mean(dim=1).squeeze().numpy()
    
    return embedding


In [9]:
chunks = ["This is the first chunk.", "This is the second chunk."]
text_embeddings = np.array([get_text_embedding(chunk) for chunk in chunks])

print(text_embeddings)

AttributeError: 'CausalLMOutputWithPast' object has no attribute 'last_hidden_state'