In [1]:
# If running in Google Colab, install required packages
import sys
if 'google.colab' in sys.modules:
    !pip install transformers torch sentencepiece accelerate --quiet

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [3]:
# Default model name: the HuggingFace model identifier that generate_response
# uses when the caller does not pass an explicit model_name.
DEFAULT_MODEL_NAME = "deepseek-ai/deepseek-llm-7b-chat"

In [4]:
# Already-loaded (model, tokenizer) pairs, keyed by model name, so each model
# is loaded at most once per kernel session.
_model_cache = {}

def get_model_and_tokenizer(model_name):
    """
    Return the (model, tokenizer) pair for a model name, loading it on first use.

    Args:
        model_name (str): Name passed to the `from_pretrained` loaders.

    Returns:
        tuple: `(model, tokenizer)`; the same tuple object is returned on
        every later call with the same name.
    """
    cached_pair = _model_cache.get(model_name)
    if cached_pair is not None:
        return cached_pair

    # Cache miss: load the tokenizer first, then the model weights.
    tok = AutoTokenizer.from_pretrained(model_name)
    load_options = dict(
        torch_dtype=torch.float16,
        device_map="auto",
        offload_folder="./offload",
    )
    lm = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

    cached_pair = (lm, tok)
    _model_cache[model_name] = cached_pair
    return cached_pair

def generate_response(prompt, model_name=DEFAULT_MODEL_NAME, max_new_tokens=1024, temperature=0.7, top_p=0.9):
    """
    Generate a response from a language model given a prompt.

    Args:
        prompt (str): The input prompt.
        model_name (str): The HuggingFace model name. Defaults to DEFAULT_MODEL_NAME.
        max_new_tokens (int): Maximum number of new tokens to generate.
        temperature (float): Sampling temperature. Higher values mean more random generations.
            Default is 0.7. A value of 0 (or None) disables sampling and decodes greedily.
        top_p (float): Nucleus sampling probability. Lower values mean more focused generations.
            Default is 0.9. Only used when sampling is enabled.

    Returns:
        str: The decoded first output sequence with special tokens removed.
        For causal language models this text starts with the prompt itself,
        followed by the newly generated continuation.
    """
    model, tokenizer = get_model_and_tokenizer(model_name)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # `temperature` and `top_p` are only honoured by `generate` when sampling
    # is switched on; without `do_sample=True` they depend on whatever the
    # checkpoint's own generation config happens to specify.
    if temperature is not None and temperature > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temperature, "top_p": top_p}
    else:
        # A non-positive temperature is invalid for sampling; treat it as a
        # request for deterministic (greedy) decoding instead.
        sampling_kwargs = {"do_sample": False}

    # Pass the padding token explicitly so `generate` does not have to fall
    # back to the EOS token on its own (and warn about it) on every call.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=pad_token_id,
        **sampling_kwargs,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [5]:
# Example usage: send a single prompt to the default model and show the reply.
example_prompt = "How are w2vec and BERT embeddings different?"
example_response = generate_response(example_prompt)
print(example_response)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


How are w2vec and BERT embeddings different?
BERT stands for Bidirectional Encoder Representations from Transformers, which is a pre-trained deep learning model for natural language processing. It is capable of understanding the context of words in a sentence by analyzing both the left and right sides of the words simultaneously.

W2VEC stands for Word2Vec Embeddings, which is a pre-trained model for converting words into vectors that capture their semantic meanings.

The main difference between the two is their purpose and the way they represent words. W2VEC is a simpler model that represents words as vectors, without considering the context of words in a sentence. It only uses the left context (previous words) to generate the vector.

On the other hand, BERT is a more complex model that generates contextual word embeddings. It uses both the left and right contexts of words to generate the vector, thus providing a more comprehensive understanding of the word's meaning and context.

In