# Tracking memory leaks / memory fragmentation

This notebook is for finding out where fastai doesn't allocate GPU RAM efficiently. Feel free to add other sections. Currently it only does a basic training loop, with some unfreezing and inference.

The detection comes from reading the output of [IPyExperimentsPytorch](https://github.com/stas00/ipyexperiments/) per-cell reports.

In particular, watch the Delta Peak column: it may indicate where more GPU RAM was allocated before some of it was freed. This can leave smallish holes in the allocated GPU RAM which can't be re-used, causing fragmentation and leaving less total available GPU RAM.

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from fastai.vision import *

In [None]:
from pathlib import Path
import numpy as np
from ipyexperiments import IPyExperimentsPytorch
#! pip install ipyexperiments

In [None]:
# Select the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# This notebook tracks GPU RAM, so stop right here if we ended up on the CPU.
assert str(device) == 'cuda:0', f"we want GPU, got {device}"

In [None]:
from IPython.display import Markdown, display
def alert(string, color='red'):
    "Show `string` in the cell output as bold markdown text coloured `color`."
    markup = f"<span style='color:{color}'>**{string}**</span>"
    display(Markdown(markup))

In [None]:
# x1 = consume_cpu(2**14) # about 1GB
def consume_gpu(n):
    "Allocate an `n` x `n` tensor of ones directly on the current CUDA device and return it."
    # Create the tensor on the GPU itself: building it on the CPU and then calling
    # `.cuda()` needs an equally large host-RAM buffer plus a host->device copy.
    return torch.ones((n, n), device='cuda')

def consume_1gb():
    "Return a one-element list holding a 2**14 x 2**14 tensor of ones (about 1GB with the default float32 dtype)."
    return [consume_gpu(2**14)]

def consume_6gb():
    "Return a list of six `consume_1gb()` results; the GPU RAM stays allocated for as long as the list is referenced."
    return [consume_1gb() for _ in range(6)]

def reclaim():
    "Run Python garbage collection, then ask PyTorch to release its unused cached GPU memory."
    # imported locally so this helper doesn't depend on `gc` leaking in via `from fastai.vision import *`
    import gc
    # collect first, so objects the collector frees are already gone when the cache is emptied
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
import tracemalloc, threading, torch, time, pynvml
from fastai.utils.mem import *
from fastai.vision import *

# Everything below measures GPU memory, so bail out early when CUDA is unavailable.
# (torch itself is already imported at this point, so the problem can only be a missing GPU/CUDA.)
if not torch.cuda.is_available(): raise RuntimeError("pytorch CUDA is required")

def preload_pytorch():
    "Create a 1x1 tensor of ones and move it to the GPU; the result is discarded."
    warmup = torch.ones((1, 1))
    warmup.cuda()
    
def gpu_mem_get_used_no_cache():
    "Empty PyTorch's CUDA cache, then return the `used` field of `gpu_mem_get()`."
    torch.cuda.empty_cache()
    mem = gpu_mem_get()
    return mem.used

def gpu_mem_used_get_fast(gpu_handle):
    "Return the used memory that NVML reports for `gpu_handle`, divided by 2**20 and truncated to an int."
    used = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle).used
    return int(used/2**20)

def torch_mem_report():
    "Empty the CUDA cache, then print [allocated, max allocated, cached, max cached] as ints after dividing by 2**20."
    torch.cuda.empty_cache()
    readings = [
        torch.cuda.memory_allocated(),
        torch.cuda.max_memory_allocated(),
        torch.cuda.memory_cached(),
        torch.cuda.max_memory_cached(),
    ]
    print([int(value/2**20) for value in readings])
    
# Run a tiny CUDA op up front - presumably so the one-off CUDA set-up cost isn't attributed to a later measurement.
preload_pytorch()
# Initialise NVML; `PeakMemMetric.peak_monitor_func` below queries GPU memory through pynvml.
pynvml.nvmlInit()

class PeakMemMetric(LearnerCallback):
    "Callback that adds per-epoch CPU and GPU memory numbers (used delta and peak) to the recorder's metrics."
    _order=-20 # Needs to run before the recorder

    peak_monitoring = False      # True while the sampling thread is supposed to keep running
    _peak_monitor_thread = None  # the currently running sampling thread, or None

    def peak_monitor_start(self):
        "Start tracing Python allocations and launch the background thread that samples GPU memory."
        # a previous monitor can still be alive if the last epoch never reached `on_epoch_end`;
        # shut it down so two samplers never write `gpu_mem_used_peak` at the same time
        if self.peak_monitoring: self.peak_monitor_stop()
        self.peak_monitoring = True

        # start RAM tracing
        tracemalloc.start()

        # this thread samples RAM usage as long as the current epoch of the fit loop is running
        self._peak_monitor_thread = threading.Thread(target=self.peak_monitor_func)
        self._peak_monitor_thread.daemon = True
        self._peak_monitor_thread.start()

    def peak_monitor_stop(self):
        "Stop tracing and the sampling thread, waiting until the thread has taken its last sample."
        tracemalloc.stop()
        self.peak_monitoring = False
        # drop the reference so no thread object lingers on the callback between epochs
        thread, self._peak_monitor_thread = self._peak_monitor_thread, None
        # the sampler checks the flag about every 1msec, so this returns almost immediately;
        # once joined, `gpu_mem_used_peak` is guaranteed to be set and final
        if thread is not None and thread.is_alive(): thread.join()

    def peak_monitor_func(self):
        "Thread body: keep the highest GPU memory reading seen in `gpu_mem_used_peak` until told to stop."
        self.gpu_mem_used_peak = -1

        # NOTE(review): assumes NVML enumerates GPUs in the same order as torch - confirm when
        # CUDA_VISIBLE_DEVICES is in use
        gpu_id = torch.cuda.current_device()
        gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)

        while True:
            gpu_mem_used = gpu_mem_used_get_fast(gpu_handle)
            self.gpu_mem_used_peak = max(gpu_mem_used, self.gpu_mem_used_peak)
            # the flag is checked after sampling, so at least one reading is always taken
            if not self.peak_monitoring: break
            time.sleep(0.001) # 1msec

    def on_train_begin(self, **kwargs):
        "Register the names of the four extra metric columns with the recorder."
        self.learn.recorder.add_metric_names(['cpu used',  'peak', 'gpu used',  'peak'])

    def on_train_end(self, **kwargs):
        "Shut the monitor down if training stopped before the running epoch reached `on_epoch_end`."
        if self.peak_monitoring: self.peak_monitor_stop()

    def on_epoch_begin(self, **kwargs):
        "Start monitoring and record the GPU memory baseline for this epoch."
        self.peak_monitor_start()
        self.gpu_before = gpu_mem_get_used_no_cache()

    def on_epoch_end(self, **kwargs):
        "Stop monitoring and hand the CPU/GPU used and peak deltas for this epoch to the recorder."
        # must be read before `peak_monitor_stop`, which calls `tracemalloc.stop()`
        cpu_current, cpu_peak = [int(x/2**20) for x in tracemalloc.get_traced_memory()]
        # stop (and join) the sampler first, so the peak read below is the final one
        self.peak_monitor_stop()
        gpu_current = gpu_mem_get_used_no_cache() - self.gpu_before
        gpu_peak    = self.gpu_mem_used_peak      - self.gpu_before
        # The numbers are deltas in MBs (beginning of the epoch and the end)
        self.learn.recorder.add_metrics([cpu_current, cpu_peak, gpu_current, gpu_peak])

# Prep dataset

In [None]:
exp1 = IPyExperimentsPytorch()

In [None]:
path = untar_data(URLs.MNIST)

In [None]:
# setup
defaults.cmap='binary'
bs=512
tfms = ([*rand_pad(padding=3, size=28, mode='zeros')], [])
#arch="resnet34"
arch="resnet50"

In [None]:
il = ImageItemList.from_folder(path, convert_mode='L')
il

In [None]:
sd = il.split_by_folder(train='training', valid='testing')
sd

In [None]:
src = sd.label_from_folder()

In [None]:
ll = src.transform(tfms)

In [None]:
data = ll.databunch(bs=bs).normalize(imagenet_stats)
x,y = data.train_ds[0]
x.show()
print(y)

# Train and Validate


In [None]:
model = getattr(models, arch) # models.resnet34

In [None]:
learn = create_cnn(data, model, metrics=[accuracy], callback_fns=PeakMemMetric)

In [None]:
learn.lr_find()

In [None]:
# 2nd time to check for leaks
learn.lr_find()
# gpu delta consumed should be zero
# but why peaked is much smaller?

In [None]:
reclaim() # resets lr_find's GPU RAM consumption

In [None]:
learn.recorder.plot()

In [None]:
learn.fit_one_cycle(1, max_lr=1e-2)

In [None]:
# cycle this:
learn.fit_one_cycle(1, max_lr=1e-2)
learn.fit_one_cycle(2, max_lr=1e-2)
learn.save(f'reload1')
_=learn.load(f'reload1')
reclaim() # resets the GPU RAM consumption of the fit/save/load calls above

In [None]:
reclaim() # resets fit_one_cycle's GPU RAM consumption

In [None]:
learn.save(f'reload1')

### Proposed Change
The end of `learn.load()` is a place where `reclaim()` should be executed, so that the GPU RAM previously taken by the model is released - otherwise the model's footprint is doubled until `gc.collect()` runs some time in the future.

Currently, the reported delta peak is 126MB for models.resnet34; it should be 0 if the old model is first unloaded and only then the new one is loaded.

In [None]:
_=learn.load(f'reload1')

In [None]:
# Memory/time report taken from the experiment's cell logger - presumably for the most recently run cell (the `load()` above).
cpu_mem, gpu_mem, time_data = exp1.cl.data
peaked_mb = b2mb(gpu_mem.peaked_delta)
# anything above 10 MB of extra peak is flagged
if peaked_mb > 10:
    alert(f"load() caused potential fragmentation by not unloading model first, delta peaked at {peaked_mb} MB")

In [None]:
reclaim() # resets load's GPU RAM consumption

In [None]:
#reclaim_tensors()

In [None]:
learn.lr_find(end_lr=1)

In [None]:
learn.recorder.plot()

In [None]:
learn.fit_one_cycle(1, max_lr=1e-2)

In [None]:
learn.save(f'leak2')

## Fine tuning

In [None]:
_=learn.load(f'leak2')

In [None]:
learn.unfreeze()

In [None]:
learn.lr_find()

In [None]:
# 2nd time to check for leaks
learn.lr_find()
# gpu delta consumed should be zero

In [None]:
learn.recorder.plot()

In [None]:
lr=1e-2
learn.fit_one_cycle(1, slice(1e-5, lr/5))
# XXX: something is wrong here measurement-wise
# callback reports peak of 942 vs. cell peak reports at 432 - the measuring thread of the cell probably missed that higher peak - need to switch to the new pytorch max_memory_allocated with reset_ to get the exact measurement. must wait for pytorch-1.0.1 to be out.
# 

In [None]:
learn.save(f'leak3')

In [None]:
learn.freeze()

In [None]:
learn.export()

# Inference via learn.export

In [None]:
learn = load_learner(path, test=ImageItemList.from_folder(path/'testing'))

In [None]:
learn.data.test_ds
len(learn.data.test_ds)

In [None]:
predictions = learn.get_preds(ds_type=DatasetType.Test)

In [None]:
len(predictions[0])

In [None]:
# get predictions as numpy
pred = predictions[0].numpy()
pred[0]
pred_df = pd.DataFrame(pred)
pred_df[:5]
#pred_df.sort_values(by="preds")

# Inference with manual learn re-construction

In [None]:
# Drop the exported learner, then free its memory with the same helper the rest of the notebook uses
# (`reclaim()` runs `gc.collect()` followed by `torch.cuda.empty_cache()`).
del learn
reclaim()

In [None]:
#del exp1

In [None]:
bs = 32
tfms = get_transforms(do_flip=False)


In [None]:
# Rebuild the data and learner by hand, then restore the fine-tuned weights saved as 'leak3'.
# `bs` is passed explicitly: it is set in the cell above and was otherwise silently ignored,
# leaving the databunch on the library's default batch size.
data = (src.add_test_folder(test_folder='testing')
        .transform(tfms) # .transform(tfms, size=256)
        .databunch(bs=bs).normalize(imagenet_stats))
learn = create_cnn(data, model)
_=learn.load(f'leak3')


In [None]:
# learn.data = (src.add_test_folder(test_folder='test')
#         .transform(tfms) # .transform(tfms, size=256)
#         .databunch().normalize(imagenet_stats))

In [None]:
learn.data.test_ds
len(learn.data.test_ds)

In [None]:
#learn.validate()

In [None]:
predictions = learn.get_preds(ds_type=DatasetType.Test)

# need to also try learn.TTA()

In [None]:
len(predictions[0])

In [None]:
# get predictions as numpy
pred = predictions[0].numpy()
pred[0]
pred_df = pd.DataFrame(pred)
pred_df[:5]
#pred_df.sort_values(by="preds")