In [1]:
import os

# Export the flag to the process environment before `transformers` is
# imported in the next cell.
# NOTE(review): the variable name suggests it silences a safe-loading
# warning; its exact effect is not visible here — confirm against the
# installed transformers version.
os.environ.update({"TRANSFORMERS_NO_SAFE_LOAD_WARNING": "1"})


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


In [3]:
# Checkpoint identifier handed to AutoTokenizer.from_pretrained and
# AutoModelForCausalLM.from_pretrained when the model is loaded below.
# (Nothing is loaded in this cell; it only records the name.)
model_name = "microsoft/DialoGPT-medium"


In [5]:
# Pick the compute device first: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the tokenizer and the causal language model for `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Move the model onto the selected device. Left as the final bare expression,
# so the notebook echoes the returned module's structure.
model.to(device)



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [6]:
# Persist the model and the tokenizer to the local ./DialoGPT-medium directory
# so that they can be reloaded from disk later.
# The tokenizer call is deliberately last: its return value (the tuple of
# written file paths) is what the cell displays.
model.save_pretrained("./DialoGPT-medium")
tokenizer.save_pretrained("./DialoGPT-medium")


('./DialoGPT-medium/tokenizer_config.json',
 './DialoGPT-medium/special_tokens_map.json',
 './DialoGPT-medium/vocab.json',
 './DialoGPT-medium/merges.txt',
 './DialoGPT-medium/added_tokens.json',
 './DialoGPT-medium/tokenizer.json')

In [7]:
# Use the copies stored on disk: rebind `tokenizer` and `model` to objects
# loaded from the local ./DialoGPT-medium directory.
# NOTE(review): the reloaded model replaces the one that was moved to `device`
# and is not moved itself; the device check at the end of this notebook
# printed `cpu`. Confirm whether GPU placement was intended here (inputs fed
# to the model would then have to live on the same device).
tokenizer = AutoTokenizer.from_pretrained("./DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("./DialoGPT-medium")


In [8]:
import tensorflow as tf

# Environment report: installed TensorFlow version and the GPUs TensorFlow
# can see (an empty list means none). Within the visible cells TensorFlow is
# used for this check only; the model itself runs on PyTorch.
print(f"TensorFlow Version: {tf.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")


2025-06-23 06:09:49.665401: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow Version: 2.10.0
GPU Available: []


In [9]:
# Fall back to the end-of-sequence token as the padding token when the
# tokenizer does not define one. An already-defined pad token is left as is.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [10]:
def generate_response(prompt, max_length=100):
    """Generate one sampled reply to ``prompt`` using the notebook's model.

    Every call is an independent single-turn exchange; no earlier conversation
    is carried over between calls. Relies on the module-level ``tokenizer``
    and ``model`` objects.

    Args:
        prompt: The user's message as a string. The tokenizer's
            end-of-sequence token is appended before encoding to mark the end
            of the turn.
        max_length: Forwarded to ``model.generate`` as ``max_length``. The
            returned sequence contains the prompt followed by the reply, so
            this bound covers both.

    Returns:
        The decoded reply (prompt tokens removed, special tokens skipped),
        with leading and trailing whitespace stripped.
    """
    # Put the encoded prompt on the same device as the model's weights so the
    # function works unchanged whether the model sits on the CPU or a GPU.
    model_device = next(model.parameters()).device
    input_ids = tokenizer.encode(
        prompt + tokenizer.eos_token, return_tensors='pt'
    ).to(model_device)

    # One unpadded sequence: every position is a real token. Passing the mask
    # explicitly avoids relying on generate() to infer it, which is ambiguous
    # here because the pad token id passed below equals the EOS token id.
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            num_return_sequences=1,
        )

    # The output starts with the prompt tokens; keep only what follows them.
    prompt_len = input_ids.shape[-1]
    reply_ids = output_ids[0, prompt_len:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()


In [11]:
# generate_response samples (do_sample=True), so its reply depends on
# PyTorch's random state. Seed it here so that re-running this cell on the
# same setup reproduces the same reply instead of a new one each time.
torch.manual_seed(0)
print(generate_response("What is the capital of France?"))

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Malta. We are the capital of the Maltese Islands.


In [12]:
# Report the GPU memory PyTorch currently holds on device 0. No inference
# runs in this cell; the numbers reflect whatever earlier cells left on the GPU.
# Guarded so the cell also runs on CPU-only machines, in line with the CPU
# fallback used when `device` is chosen.
if torch.cuda.is_available():
    # Memory currently allocated by PyTorch on device 0, in MB.
    print("🟡 Allocated:", round(torch.cuda.memory_allocated(0)/1024**2, 2), "MB")
    # Memory reserved by PyTorch on device 0, in MB.
    print("🟣 Cached:", round(torch.cuda.memory_reserved(0)/1024**2, 2), "MB")

    torch.cuda.synchronize()  # Ensure all queued GPU ops have completed
    print("✅ Done. GPU Utilization logged above.")
else:
    print("CUDA is not available; skipping GPU memory report.")


🟡 Allocated: 1377.55 MB
🟣 Cached: 1380.0 MB
✅ Done. GPU Utilization logged above.


In [13]:
# Sanity check: can PyTorch use CUDA, which GPU is it, and where do the
# model's weights actually live?
print(torch.cuda.is_available())                    # True when a CUDA GPU is usable
if torch.cuda.is_available():
    # Only ask for the device name when a CUDA device exists, so the cell
    # also runs on CPU-only machines.
    print(torch.cuda.get_device_name(0))            # name of GPU 0
# NOTE(review): the recorded run printed `cpu` here even though CUDA was
# available — presumably because the model re-loaded from ./DialoGPT-medium
# was never moved to `device`; confirm.
print(next(model.parameters()).device)              # device holding the model's parameters


True
NVIDIA GeForce 940MX
cpu
