In [1]:
# First, check GPU availability and install required packages
!nvidia-smi
!pip install transformers torch scipy sentencepiece accelerate
!pip install nvidia-pyindex
!pip install nvidia-tensorrt



Thu Oct 24 08:38:52 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import time

def check_gpu_setup():
    """Verify CUDA availability and print GPU info"""
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"Current device: {torch.cuda.get_device_name()}")
        print(f"Device capability: {torch.cuda.get_device_capability()}")
        print(f"Memory allocated: {torch.cuda.memory_allocated(0)/1e9:.2f} GB")
        print(f"Memory cached: {torch.cuda.memory_reserved(0)/1e9:.2f} GB")

def load_model():
    """Load a model optimized for NVIDIA GPUs"""
    model_name = "microsoft/phi-2"  # Using Phi-2 as an example

    # Load model with NVIDIA optimizations
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,  # Use FP16 for efficiency
        device_map="auto"  # Automatically handle device placement
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

def benchmark_inference(model, tokenizer, text, num_iterations=5):
    """Benchmark inference performance"""
    input_ids = tokenizer(text, return_tensors="pt").input_ids.cuda()

    # Warmup
    with torch.no_grad():
        output = model.generate(input_ids, max_length=100)

    # Benchmark
    times = []
    with torch.no_grad():
        for _ in range(num_iterations):
            start_time = time.time()
            output = model.generate(input_ids, max_length=100)
            times.append(time.time() - start_time)

    avg_time = sum(times) / len(times)
    print(f"Average inference time: {avg_time:.2f} seconds")
    return tokenizer.decode(output[0])



In [3]:
# 1. Check GPU setup
print("=== Checking GPU Setup ===")
check_gpu_setup()

# 2. Load model
print("\n=== Loading Model ===")
model, tokenizer = load_model()

# 3. Run inference benchmark
print("\n=== Running Inference Benchmark ===")
sample_text = "Explain the benefits of using NVIDIA GPUs for AI:"
result = benchmark_inference(model, tokenizer, sample_text)

print("\n=== Generated Text ===")
print(result)

# 4. Memory cleanup
torch.cuda.empty_cache()

=== Checking GPU Setup ===
CUDA available: True
Current device: Tesla T4
Device capability: (7, 5)
Memory allocated: 0.00 GB
Memory cached: 0.00 GB

=== Loading Model ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



=== Running Inference Benchmark ===


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Average inference time: 2.84 seconds

=== Generated Text ===
Explain the benefits of using NVIDIA GPUs for AI:

NVIDIA GPUs are specifically designed for AI and machine learning tasks. They have a large number of processing cores and a high number of floating-point operations per second (FLOPS), which allows them to perform complex calculations quickly and efficiently. This makes them ideal for training deep learning models, which require a lot of computational power.

Step 2: Discuss the advantages of using NVIDIA GPUs for training deep learning models:

One of the
