In [None]:
# If running in Google Colab, install required packages
import sys
if 'google.colab' in sys.modules:
    !pip install transformers torch --quiet

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [2]:
# HuggingFace model name used when generate_response() is called without an
# explicit `model_name`. Change it here to switch the notebook's default model.
DEFAULT_MODEL_NAME = "deepseek-ai/deepseek-llm-7b-chat"

In [3]:
# Already-loaded (model, tokenizer) pairs, keyed by model name, so that each
# model is only loaded once per kernel session.
_model_cache = {}

def get_model_and_tokenizer(model_name):
    """
    Return the (model, tokenizer) pair for `model_name`, loading it on first use.

    Args:
        model_name (str): The HuggingFace model name to load.

    Returns:
        tuple: `(model, tokenizer)`. The same cached pair is returned on every
        later call with the same `model_name`.
    """
    # Fast path: this model has already been loaded in this session.
    try:
        return _model_cache[model_name]
    except KeyError:
        pass

    # Slow path: load the tokenizer, then the model weights in half precision,
    # leaving device placement to device_map="auto".
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    pair = (loaded_model, loaded_tokenizer)
    _model_cache[model_name] = pair
    return pair

In [4]:
def generate_response(
    prompt,
    model_name=DEFAULT_MODEL_NAME,
    max_new_tokens=1000,
    temperature=0.7,
    top_p=0.9,
    return_full_text=True,
):
    """
    Generate text from a causal language model given a prompt.

    Args:
        prompt (str): The input prompt. It is tokenized as-is; no chat template
            is applied.
        model_name (str): The HuggingFace model name. Defaults to DEFAULT_MODEL_NAME.
        max_new_tokens (int): Maximum number of new tokens to generate.
        temperature (float): Sampling temperature. Higher values mean more random
            generations. Default is 0.7. A value of 0 (or None) disables sampling
            and uses greedy decoding instead.
        top_p (float): Nucleus sampling probability. Lower values mean more focused
            generations. Default is 0.9. Only used when sampling is enabled.
        return_full_text (bool): If True (default), the returned string contains
            the prompt followed by the generated continuation. If False, only the
            newly generated text is returned.

    Returns:
        str: The decoded text with special tokens removed.
    """
    model, tokenizer = get_model_and_tokenizer(model_name)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {"max_new_tokens": max_new_tokens}

    # `temperature` and `top_p` only take effect when sampling is switched on;
    # request it explicitly instead of relying on each model's generation config.
    if temperature is not None and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # A non-positive temperature is not valid for sampling, so treat it as
        # a request for deterministic (greedy) decoding.
        generation_kwargs["do_sample"] = False

    # Pass a pad token explicitly so generate() does not have to fall back to
    # the EOS token on its own (and warn about it) on every call.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id
    if pad_token_id is not None:
        generation_kwargs["pad_token_id"] = pad_token_id

    outputs = model.generate(**inputs, **generation_kwargs)

    # For a causal LM the output sequence starts with the prompt tokens;
    # drop them when the caller asked for the continuation only.
    token_ids = outputs[0]
    if not return_full_text:
        prompt_length = inputs["input_ids"].shape[-1]
        token_ids = token_ids[prompt_length:]

    return tokenizer.decode(token_ids, skip_special_tokens=True)

In [5]:
# Example usage
# The first call loads the default model (later calls reuse the cached copy).
# The printed text starts with the prompt itself, followed by the model's continuation.
example_prompt = "How are w2vec and BERT different?"
print(generate_response(example_prompt))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


How are w2vec and BERT different?
BERT (Bidirectional Encoder Representations from Transformers) is a deep learning-based language representation model that has revolutionized natural language processing (NLP). It uses a self-attention mechanism to understand the context of words in a sentence. On the other hand, W2VEC is a word embedding model that converts words into vectors, which can be used for various NLP tasks like sentiment analysis, text classification, etc.

So, the key differences between BERT and W2VEC are:

1. W2VEC:

- It is a word embedding model.
- It converts words into vectors.
- It is used for NLP tasks like sentiment analysis, text classification, etc.
- It uses a unidirectional approach to convert words into vectors.

2. BERT:

- It is a deep learning-based language representation model.
- It uses a self-attention mechanism to understand the context of words in a sentence.
- It is used for various NLP tasks like text classification, question-answering, etc.
- It us