## Imports

In [1]:
import gc
import logging
import queue
import random
import subprocess
import time
from datetime import datetime
from importlib import reload
from multiprocessing import Process, Manager

import matplotlib.pyplot as plt
import numpy as np
import torch
from PIL import Image
from torch.nn import Module
from torchvision import models, transforms

# Re-import the logging module before configuring it; presumably this drops
# handlers left over from an earlier run of this cell so that basicConfig
# below takes effect again (TODO confirm intent).
reload(logging)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s',
    datefmt='%I:%M:%S',
)
print("Starting...")

## Async Processes

In [2]:
class GPUMonitor(Process):
    """Child process that repeatedly samples GPU power draw and utilization.

    Each sample is taken by running ``nvidia-smi`` and parsing its single-line
    CSV output. Readings are appended to two manager-backed lists so that the
    parent process can read them while the monitor is running.

    The process starts itself at the end of ``__init__``. ``running`` is a
    plain attribute that is never cleared in this class, so the sampling loop
    ends when the process is terminated from outside.

    Args:
        delay: Seconds to sleep between two consecutive samples.
    """

    def __init__(self, delay):
        super().__init__()
        self.delay = delay
        # One manager (one server process) backs both shared lists; the list
        # proxies keep a reference to it.
        manager = Manager()
        self.power_readings = manager.list()
        self.utilization_readings = manager.list()
        self.running = True
        # Passed as an argument list, so no shell is involved.
        self.command = [
            'nvidia-smi',
            '--query-gpu=power.draw,utilization.gpu',
            '--format=csv,noheader,nounits',
        ]
        self.start()

    def run(self):
        """Sample in a loop, logging (not raising) any failed sample."""
        while self.running:
            try:
                output_string = subprocess.check_output(self.command).decode('utf-8').strip()
                # Exactly two comma-separated fields are expected; output with
                # more fields (e.g. several lines) fails to unpack and is
                # logged below.
                gpu_power, gpu_utilization = output_string.split(',')
                power_value = float(gpu_power.strip())
                utilization_value = float(gpu_utilization.strip())
                # Both values are parsed before either is stored, so the two
                # lists always grow together.
                self.power_readings.append(power_value)
                self.utilization_readings.append(utilization_value)
            except Exception:
                # Best effort: keep sampling, but record the traceback.
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                logging.exception('Something went wrong while retrieving GPU readings...')
            time.sleep(self.delay)

    def reset_energy(self):
        """Discard all power and utilization readings collected so far."""
        self.power_readings[:] = []
        self.utilization_readings[:] = []

    def get_power_average(self):
        """Return the mean of the power readings, or NaN when there are none."""
        readings = self.power_readings[:]  # copy once instead of per-element access
        return np.mean(readings) if readings else float('nan')

    def get_utilization_average(self):
        """Return the mean of the utilization readings, or NaN when there are none."""
        readings = self.utilization_readings[:]
        return np.mean(readings) if readings else float('nan')

    def plot_power(self):
        """Plot the collected power readings in sample order."""
        plt.title("Power")
        plt.plot(self.power_readings[:])
        plt.show()

    def plot_utilization(self):
        """Plot the collected utilization readings in sample order."""
        plt.title("Utilization")
        plt.plot(self.utilization_readings[:])
        plt.show()


class RequestQueue(Process):
    """Child process that simulates a stream of incoming inference requests.

    The process puts ``nr_of_requests`` items on a manager-backed queue, each
    item being ``(image_path, enqueue_timestamp)``, pausing roughly
    ``1 / frequency`` seconds (with +/-20% jitter) between items. The consumer
    side (``get_request`` / ``update_wait_times``) records how long requests
    waited before their batch was processed.

    The process starts itself at the end of ``__init__``.

    Args:
        id: Label used only in the start-up log message.
        frequency: Target number of requests per second.
        nr_of_requests: Total number of requests to generate; also the queue capacity.
    """

    def __init__(self, id, frequency, nr_of_requests):
        super().__init__()
        self.id = id
        self.frequency = frequency
        self.nr_of_requests = nr_of_requests
        # One manager (one server process) backs all shared objects; the
        # proxies keep a reference to it.
        manager = Manager()
        self.queue = manager.Queue(nr_of_requests)
        self.total_time_in_queue = manager.Value(float, 0.0)
        self.max_wait_time_in_queue = manager.Value(float, 0.0)
        self.batch_start_times = manager.list()
        self.start()

    def run(self):
        """Generate the requests; executed in the child process."""
        logging.info("Started simulation with id: {}".format(self.id))
        while self.nr_of_requests > 0:
            # NOTE(review): the timestamp is taken in this process and compared
            # with perf_counter() readings taken by the consumer; this assumes
            # both processes share the same clock reference - confirm for the
            # target platform.
            self.queue.put((random.choice(['img/dog.jpg', 'img/bald_eagle.jpg', 'img/strawberries.jpg']), time.perf_counter()))
            self.nr_of_requests -= 1
            time.sleep(1 / self.frequency * random.uniform(0.8, 1.2))

    def get_request(self, block=True, timeout=None):
        """Pop one request and remember its enqueue time for the current batch.

        Args:
            block: Forwarded to the queue's ``get``.
            timeout: Forwarded to the queue's ``get``.

        Returns:
            The image path of the request.

        Raises:
            queue.Empty: Propagated from the queue when no item is available.
        """
        img, t_0 = self.queue.get(block, timeout)
        self.batch_start_times.append(t_0)
        return img

    def update_wait_times(self):
        """Account the wait time of every request fetched since the last call.

        Adds each request's time since enqueueing to ``total_time_in_queue``,
        raises ``max_wait_time_in_queue`` if needed, and clears the pending
        batch timestamps.
        """
        curr_time = time.perf_counter()
        # Copy the shared list once and write each shared value once instead
        # of doing a proxy round-trip per element.
        waits = [curr_time - img_t0 for img_t0 in self.batch_start_times[:]]
        if waits:
            self.total_time_in_queue.value += sum(waits)
            self.max_wait_time_in_queue.value = max(self.max_wait_time_in_queue.value, max(waits))
        self.batch_start_times[:] = []



## Initialisation

In [3]:

# Preprocessing applied to every input image: resize, centre-crop, convert to
# a tensor, then normalise each channel with the given mean / std.
transform = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Class labels: every line of the file is split on ', ' and the second field
# is kept, in file order.
with open('image_net_classes.txt') as file:
    classes = [entry.strip().split(', ')[1] for entry in file]

## Inference

In [4]:
def infer(model: Module, images, use_gpu=True, verbose=False):
    """Run one forward pass of ``model`` over ``images`` without gradients.

    Every image is passed through the module-level ``transform`` and the
    results are stacked into a single batch. When ``verbose`` is set, the five
    highest-scoring classes of each prediction are logged with their softmax
    probability in percent.

    Args:
        model: Network to evaluate; it is switched to eval mode and, when
            ``use_gpu`` is true, moved to the GPU.
        images: Iterable of inputs accepted by ``transform`` (presumably PIL
            images of equal channel count - verify against callers).
        use_gpu: Run the model and the batch on CUDA when true.
        verbose: Log the top-5 classes for each image.

    Returns:
        None. The model output is only used for the optional logging.
    """
    model.eval()
    with torch.no_grad():
        if use_gpu:
            model.cuda()
        # stack adds the leading batch dimension in one step.
        batch = torch.stack([transform(im) for im in images])
        if use_gpu:
            batch = batch.cuda()
        out = model(batch)
    if verbose:
        for prediction in out:
            prediction = prediction.cpu()
            _, indices = torch.sort(prediction, descending=True)
            top_indices = indices[:5]
            # Softmax over all classes is computed once per prediction.
            probabilities = torch.nn.functional.softmax(prediction, dim=0)
            percentages = [(probabilities[class_index] * 100).item() for class_index in top_indices]

            logging.info('Rank\tInferred class\tProbability(%)')
            for idx, class_index in enumerate(top_indices):
                logging.info(f'#{idx}\t\t{classes[class_index]}\t{percentages[idx]}')
            logging.info('-----------------------------------------')


def run_experiment(model_, input_images_):
    """Time a single GPU inference pass over ``input_images_``.

    Args:
        model_: Model handed to ``infer``.
        input_images_: Images handed to ``infer``.

    Returns:
        Elapsed wall-clock time in seconds, measured with ``time.perf_counter``.
    """
    t_0 = time.perf_counter()
    infer(model_, input_images_, use_gpu=True)
    # CUDA work is queued asynchronously; wait for it to finish so the elapsed
    # time covers the actual GPU computation and not just the launch.
    torch.cuda.synchronize()
    return time.perf_counter() - t_0

## Write Results

In [5]:
def write_results(_file_name, _batch_size, _average, _average_util, _duration, _wait_time, _max_wait_time, _peak_average, _batch_average, _total_time, _inference_time):
    """Log one result row and append it as a CSV line to ``results/<_file_name>``.

    The ``results`` directory must already exist. Energy is derived as
    ``_average * _duration``. A non-positive ``_batch_size`` is reported as
    ``Greedy (<_batch_average>)``.

    Args:
        _file_name: File name inside the ``results`` directory.
        _batch_size: Configured batch size; values <= 0 denote the greedy mode.
        _average: Average power.
        _average_util: Average utilization (written to the file only).
        _duration: Duration of the run.
        _wait_time: Average wait time per request.
        _max_wait_time: Maximum wait time of a request.
        _peak_average: Average of the peak power readings.
        _batch_average: Average batch size, shown for the greedy mode.
        _total_time: Total time per image.
        _inference_time: Inference time per image.
    """
    batch_label = _batch_size if _batch_size > 0 else f"Greedy ({_batch_average})"
    energy = _average * _duration
    logging.info('Batch Size\tAverage Power(W)\t\tTime(s)\t\t\t\tEnergy(J)\t\t\tAverage Wait Time(s)\tMax Wait Time(s)\tAverage Peak Power (W)\tTotal time per image\tInference time per image')
    # The row now carries every column announced by the header above.
    logging.info(f'{batch_label}\t\t\t{_average}\t\t{_duration}\t{energy}\t{_wait_time}\t\t{_max_wait_time}\t\t{_peak_average}\t\t{_total_time}\t\t{_inference_time}')
    # The context manager closes the file even when the write fails.
    with open(f'results/{_file_name}', 'a') as file:
        file.write(f'{batch_label},{_average},{_average_util},{_duration},{energy},{_wait_time},{_max_wait_time},{_peak_average},{_total_time},{_inference_time}\n')
    # Report the path from the parameter, not from a same-named global.
    logging.info(f'Results logged to: results/{_file_name}')

## Configuration

In [6]:
# Batch sizes to evaluate; a non-positive entry selects the greedy mode in the
# simulation loop (batch size taken from the current queue length).
batch_sizes = [16, 32, 64, 128, -1]
# Simulated request rate in requests per second.
frequency = 16  # 16, 32, 64, 128
# Number of requests generated per (model, batch size) run.
nr_of_requests = 8192
# NOTE: this rebinds the name `models`, hiding the torchvision module imported
# above; re-running this cell without re-running the imports will fail.
models = [
    models.alexnet(pretrained=True),
    models.densenet121(pretrained=True),
    models.shufflenet_v2_x0_5(pretrained=True),
    models.vit_b_16(pretrained=True),
    models.convnext_base(pretrained=True),
    models.resnet50(pretrained=True),
    models.mobilenet_v2(pretrained=True),
    models.efficientnet_b7(pretrained=True),
]


## Simulation

In [None]:
# WARMUP
# Push 256 throw-away batches of 32 images through the first model before any
# measurement starts (presumably to keep one-off start-up costs out of the
# results - TODO confirm).
for _ in range(256):
    batch = [Image.open('img/dog.jpg') for _ in range(32)]
    run_experiment(models[0], batch)
###

for model in models:
    file_name = f'{model.__class__.__name__}_f{frequency}'
    # Opened in append mode, so a header row is added on every run of this cell.
    with open(f'results/{file_name}', 'a') as file:
        file.write('Batch Size,Average Power(W), Average Utilization(%),Time(s),Energy(J),Average Wait Time(s),Max Wait Time(s),Average Peak Power (W), Total time per image, Inference time per image\n')
    for batch_size in batch_sizes:
        torch.cuda.empty_cache()
        gc.collect()
        # How long to wait for a request before the stream is considered
        # finished; also subtracted from the measured duration below because
        # the final, failing get_request waits this long.
        epsilon = 2 / frequency + 1
        gpu_monitor = GPUMonitor(0.01)
        img_count = 0
        batches = []

        rq = RequestQueue(f'inference_simulation_f{frequency}_{batch_size}', frequency, nr_of_requests)
        try:
            t_0 = time.perf_counter()
            while True:
                try:
                    # Greedy mode (batch_size <= 0): take what is currently
                    # queued, at least 1 and at most 128 requests. Otherwise
                    # wait for exactly batch_size requests.
                    batch = [Image.open(rq.get_request(block=True, timeout=epsilon)) for _ in
                             range(max(min(rq.queue.qsize(), 128), 1) if batch_size <= 0 else batch_size)]
                    t = run_experiment(model, batch)
                    rq.update_wait_times()
                    img_count += len(batch)
                    batches.append(len(batch))
                    # logging.info(f"{100 * img_count / nr_of_requests}% last batch ({len(batch)}) took {t}s")
                except queue.Empty:
                    break
            # Stop the clock right after the loop so that copying the shared
            # reading lists and computing the statistics below is not counted.
            duration = time.perf_counter() - t_0 - epsilon

            power = gpu_monitor.power_readings[:]
            average = np.mean(power)
            utilization = gpu_monitor.utilization_readings[:]
            average_utilization = np.mean(utilization)
            inference_time = duration / nr_of_requests
            wait_time = rq.total_time_in_queue.value / nr_of_requests
            total_time = inference_time + wait_time
            max_wait_time = rq.max_wait_time_in_queue.value
            # Only readings above 65 count as "peak"; presumably a watt
            # threshold separating load from idle on the test GPU - TODO confirm.
            peak_average = np.mean(list(filter(lambda p: p > 65, power)))
            average_batch_size = np.mean(batches)
            gpu_monitor.plot_power()
            gpu_monitor.plot_utilization()
            write_results(file_name, batch_size, average, average_utilization, duration, wait_time, max_wait_time, peak_average, average_batch_size, total_time, inference_time)
        finally:
            # Always stop both child processes, even when an iteration fails.
            rq.terminate()
            gpu_monitor.terminate()