In [None]:
 %pip install dstack==0.19.11

Collecting dstack
  Downloading dstack-0.19.11-py3-none-any.whl.metadata (20 kB)
Collecting argcomplete>=3.5.0 (from dstack)
  Downloading argcomplete-3.6.2-py3-none-any.whl.metadata (16 kB)
Collecting cursor (from dstack)
  Downloading cursor-1.3.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gpuhunt==0.1.6 (from dstack)
  Downloading gpuhunt-0.1.6-py3-none-any.whl.metadata (3.9 kB)
Collecting paramiko>=3.2.0 (from dstack)
  Downloading paramiko-3.5.1-py3-none-any.whl.metadata (4.6 kB)
Collecting pydantic-duality>=1.2.4 (from dstack)
  Downloading pydantic_duality-2.0.2-py3-none-any.whl.metadata (4.1 kB)
Collecting pydantic<2.0.0,>=1.10.10 (from dstack)
  Downloading pydantic-1.10.22-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting python-multipart>=0.0.16 (from dstack)
  Downloading python_mul

In [4]:
from transformers import GPT2Tokenizer, GPT2Model
import torch
import pandas as pd
import time
import os
import psutil
from transformers import AutoModelForCausalLM, AutoTokenizer

# === Settings ===
model_name = "microsoft/phi-1_5"
results_file = "runtime_results_phi_1_5.xlsx"

device_type = "cuda" if torch.cuda.is_available() else "cpu"
batch_sizes = [2, 4, 8, 16]

# === Load the model ===
# NOTE(review): weights are loaded as float16 even when device_type is "cpu";
# half precision may be slow on CPU -- confirm this is the intended setup.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device_type)
model.eval()
config = model.config

# === Models features ===
num_layers = config.num_hidden_layers
hidden_size = config.hidden_size
vocab_size = config.vocab_size
activation_function = getattr(config, 'hidden_act', 'N/A')
# Take the architecture name from the loaded config instead of hard-coding it,
# so the recorded value always matches `model_name`.
model_type = getattr(config, 'model_type', 'N/A')
max_position_embeddings = config.max_position_embeddings
num_heads = config.num_attention_heads
num_parameters = sum(p.numel() for p in model.parameters())
params_per_layer = num_parameters / num_layers
hidden_per_head = hidden_size / num_heads

# === Hardware features ===
num_cores = psutil.cpu_count(logical=True)

if device_type == "cuda":
    gpu_props = torch.cuda.get_device_properties(0)
    gpu_name = gpu_props.name
    gpu_memory = gpu_props.total_memory
else:
    gpu_name = "N/A"
    gpu_memory = 0

gpu_memory_mb = gpu_memory // (1024 * 1024)


def time_forward_pass(batch_input):
    """Run one gradient-free forward pass of `model` and return the elapsed seconds.

    Args:
        batch_input: tensor of token ids already placed on `device_type`.

    Returns:
        Wall-clock duration of the forward pass in seconds (float).
    """
    # Synchronise before and after so the interval covers the whole CUDA
    # computation rather than only the kernel launches.
    if device_type == "cuda":
        torch.cuda.synchronize()
    # perf_counter is monotonic and high-resolution; time.time() can jump when
    # the system clock is adjusted.
    start = time.perf_counter()
    with torch.no_grad():
        _ = model(batch_input)
    if device_type == "cuda":
        torch.cuda.synchronize()
    return time.perf_counter() - start


# === Prompts read ===
with open("prompts.txt", "r", encoding="utf-8") as f:
    prompts = [line.strip() for line in f if line.strip()]

# === Warm-up ===
# One untimed pass so that one-time start-up costs (lazy initialisation,
# allocator/cache warm-up) do not inflate the first recorded measurement.
if prompts:
    warmup_ids = tokenizer(prompts[0], return_tensors="pt")["input_ids"].to(device_type)
    time_forward_pass(warmup_ids)

# === Preprocessing ===
all_data = []

for prompt in prompts:
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"]
    sequence_length = input_ids.shape[1]

    for batch_size in batch_sizes:
        # Replicate the single tokenised prompt along the batch dimension.
        batch_input = input_ids.expand(batch_size, -1).to(device_type)

        runtime_sec = time_forward_pass(batch_input)

        data = {
            'model_name': model_name,
            'parameter_count': num_parameters,
            'num_layers': num_layers,
            'hidden_size': hidden_size,
            'sequence_length': sequence_length,
            'vocab_size': vocab_size,
            'max_position_embeddings': max_position_embeddings,
            'activation_function': activation_function,
            'model_type': model_type,
            'params_per_layer': round(params_per_layer, 2),
            'hidden_per_head': round(hidden_per_head, 2),
            'prompt': prompt,
            'batch_size': batch_size,
            'device': device_type,
            'gpu_name': gpu_name,
            'gpu_memory_MB': gpu_memory_mb,
            'cpu_cores': num_cores,
            'runtime_sec': round(runtime_sec, 4),
        }

        all_data.append(data)
        print(f"Batch {batch_size} | Prompt '{prompt[:30]}...' done in {runtime_sec:.4f} sec")

# === Save Excel ===
# Append to any earlier results so repeated runs accumulate in one workbook.
df_new = pd.DataFrame(all_data)

if os.path.exists(results_file):
    df_old = pd.read_excel(results_file)
    df_all = pd.concat([df_old, df_new], ignore_index=True)
else:
    df_all = df_new

df_all.to_excel(results_file, index=False)
print("\n Saved to", results_file)


tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Batch 2 | Prompt 'Hello, how are you today?...' done in 16.0444 sec
Batch 4 | Prompt 'Hello, how are you today?...' done in 11.4053 sec
Batch 8 | Prompt 'Hello, how are you today?...' done in 14.8937 sec
Batch 16 | Prompt 'Hello, how are you today?...' done in 30.1641 sec
Batch 2 | Prompt 'Explain quantum physics in sim...' done in 4.8255 sec
Batch 4 | Prompt 'Explain quantum physics in sim...' done in 8.1701 sec
Batch 8 | Prompt 'Explain quantum physics in sim...' done in 17.5757 sec
Batch 16 | Prompt 'Explain quantum physics in sim...' done in 34.4607 sec
Batch 2 | Prompt 'Translate the following senten...' done in 7.4200 sec
Batch 4 | Prompt 'Translate the following senten...' done in 14.0040 sec
Batch 8 | Prompt 'Translate the following senten...' done in 27.8246 sec
Batch 16 | Prompt 'Translate the following senten...' done in 57.4280 sec
Batch 2 | Prompt 'What are the benefits of medit...' done in 5.7765 sec
Batch 4 | Prompt 'What are the benefits of medit...' done in 10.8298 sec

In [None]:
# SECURITY: never commit the dstack token to the notebook. It is read from the
# DSTACK_TOKEN environment variable, prompting for it once if it is not set.
import getpass
import os

if "DSTACK_TOKEN" not in os.environ:
    os.environ["DSTACK_TOKEN"] = getpass.getpass("dstack token: ")

# "$$" makes IPython pass a literal "$" through, so the shell expands the variable.
!dstack project add --name Fans-a-blazing --url https://sky.dstack.ai --token "$$DSTACK_TOKEN"

In [None]:
# Clone only when the checkout is missing, so re-running this cell does not fail
# on an already existing destination directory.
!test -d lyceum_fans_a_blazing || git clone https://github.com/digispect-intel/lyceum_fans_a_blazing.git

Cloning into 'lyceum_fans_a_blazing'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 30 (delta 8), reused 26 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (30/30), 226.26 KiB | 1.20 MiB/s, done.
Resolving deltas: 100% (8/8), done.


In [None]:
# NOTE: "!" runs the command in a temporary subshell, so this cd does not change
# the notebook's working directory for later cells.
!cd lyceum_fans_a_blazing

In [None]:
# Every "!" command starts in the notebook's working directory, so enter the
# cloned repository within the same command for `dstack init` to run inside it.
!cd lyceum_fans_a_blazing && dstack init

OK


In [None]:
# Debug aid: list the contents of the parent of the current working directory.
!ls ..

bin			    kaggle		      opt		 sys
boot			    lib			      proc		 tmp
content			    lib32		      python-apt	 tools
cuda-keyring_1.1-1_all.deb  lib64		      python-apt.tar.xz  usr
datalab			    libx32		      root		 var
dev			    media		      run
etc			    mnt			      sbin
home			    NGC-DL-CONTAINER-LICENSE  srv


In [None]:
# Inside the cloned repository: update the checkout, run `dstack init`, then apply
# the gpt2 task configuration file. "&&" stops the chain at the first failing command.
# NOTE(review): "-y" presumably auto-confirms the apply plan -- confirm against the dstack CLI docs.
!cd lyceum_fans_a_blazing/ && git pull && dstack init && dstack apply -yf models/text_generation/gpt2/gpt2-cpu_2-mem_8.dstack.yml

Already up to date.
OK
[2K[32m⠧[0m Getting apply plan...
[1A[2K [1mProject[0m          Fans-a-blazing                                          
 [1mUser[0m             lizabespalova                                           
 [1mConfiguration[0m    models/text_generation/gpt2/gpt2-cpu_2-mem_8.dstack.yml 
 [1mType[0m             task                                                    
 [1mResources[0m        cpu=2.. mem=8GB.. disk=50GB                             
 [1mSpot policy[0m      on-demand                                               
 [1mMax price[0m        -                                                       
 [1mRetry policy[0m     -                                                       
 [1mCreation policy[0m  reuse-or-create                                         
 [1mIdle duration[0m    5m                                                      
 [1mMax duration[0m     -                                                       
 [1mReservation[0