In [1]:
# If running in Google Colab, install required packages
import sys
if 'google.colab' in sys.modules:
    !pip install transformers torch sentencepiece accelerate --quiet

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [3]:
# Model identifier used by `generate_response` when the caller does not pass
# an explicit `model_name`.
DEFAULT_MODEL_NAME = "deepseek-ai/deepseek-llm-7b-chat"

In [4]:
# Maps a model name to its already-loaded ``(model, tokenizer)`` pair so each
# checkpoint is loaded at most once per kernel session.
_model_cache = {}

def get_model_and_tokenizer(model_name):
    """
    Return the ``(model, tokenizer)`` pair for ``model_name``, loading it on first use.

    Args:
        model_name (str): Identifier passed to ``from_pretrained`` and used as the cache key.

    Returns:
        tuple: ``(model, tokenizer)`` as stored in ``_model_cache``.
    """
    # Fast path: this model was requested before in the current session.
    if model_name in _model_cache:
        return _model_cache[model_name]

    # Tokenizer first, then the weights (requested in float16 with device_map="auto").
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    cache_entry = (loaded_model, loaded_tokenizer)
    _model_cache[model_name] = cache_entry
    return cache_entry

In [None]:
def generate_response(prompt, model_name=DEFAULT_MODEL_NAME, max_new_tokens=1024, temperature=0.7, top_p=0.9):
    """
    Generate text from a language model given a prompt.

    Sampling is requested explicitly whenever ``temperature`` is positive, so
    ``temperature`` and ``top_p`` take effect regardless of the defaults stored
    in the checkpoint's generation config. A ``temperature`` of ``0`` or
    ``None`` selects deterministic (non-sampling) decoding, in which case
    ``top_p`` is not used.

    Args:
        prompt (str): The input prompt.
        model_name (str): The HuggingFace model name. Defaults to DEFAULT_MODEL_NAME.
        max_new_tokens (int): Maximum number of new tokens to generate.
        temperature (float): Sampling temperature. Higher values mean more random
            generations. ``0`` or ``None`` disables sampling. Default is 0.7.
        top_p (float): Nucleus sampling probability. Lower values mean more focused
            generations. Only used when sampling. Default is 0.9.

    Returns:
        str: The decoded first output sequence with special tokens removed.
            The whole sequence returned by ``model.generate`` is decoded, so the
            text starts with the prompt and is followed by the generated
            continuation.
    """
    model, tokenizer = get_model_and_tokenizer(model_name)
    # Move the encoded prompt to wherever the model lives.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # temperature/top_p are sampling-only settings; without do_sample=True they
    # are ignored unless the checkpoint's own config happens to enable sampling.
    use_sampling = temperature is not None and temperature > 0
    generation_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": use_sampling}
    if use_sampling:
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = top_p

    outputs = model.generate(**inputs, **generation_kwargs)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [6]:
# Demo: ask the default model a single question and print the returned text.
# The first call also loads the model via get_model_and_tokenizer.
example_prompt = "How are w2vec and BERT different?"
print(generate_response(prompt=example_prompt))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


How are w2vec and BERT different?
BERT is a pre-trained transformer model for natural language processing (NLP) tasks. It is based on the transformer architecture and is trained on a large corpus of text data. It can be fine-tuned for specific tasks such as text classification, question answering, and sentiment analysis.

W2VEC, on the other hand, is a word embedding model that converts words into dense vectors. It is a type of word representation that captures the semantic and syntactic relationships between words. W2VEC is trained on a large corpus of text data and is used as a starting point for downstream NLP tasks.

In summary, BERT is a pre-trained model for NLP tasks and W2VEC is a word embedding model that captures the semantic and syntactic relationships between words.
