# In which the mean and std-dev of the datasets are computed.

In [4]:
import os
from glob import glob
from pathlib import Path
import math

from PIL import Image

import numpy as np

import torch
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

Retrieve dataset

In [2]:
from google.colab import drive
drive.flush_and_unmount()
drive.mount('/content/drive')

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [None]:
%%bash

# copying stuff to the colab vm
echo "Making dirs"
mkdir data
cd data
mkdir bsd-b
cd bsd-b
echo "Copying archive"
cp /content/drive/MyDrive/VP/Datasets/BSD_B_Centroid.tar.gz .
echo "Decompressing"
tar -xvf BSD_B_Centroid.tar.gz > /dev/null
echo "Done"

Making dirs
Copying archive
Decompressing
Done


In [8]:
%%bash

# copying stuff to the colab vm
echo "Making dirs"
mkdir data
cd data
mkdir realblur
cd realblur
echo "Copying archive"
# cp /content/drive/MyDrive/VP/Datasets/RealBlur.tar.gz .
echo "Decompressing"
tar -xvf RealBlur.tar.gz > /dev/null
echo "Done"

Making dirs
Copying archive
Decompressing
Done


mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘realblur’: File exists


In [None]:
%%bash

# copying stuff to the colab vm
echo "Making dirs"
mkdir data
cd data
mkdir realblur-tele
cd realblur-tele
echo "Copying archive"
cp /content/drive/MyDrive/VP/Datasets/RealBlur-Tele.tar.gz .
echo "Decompressing"
tar -xvf RealBlur-Tele.tar.gz > /dev/null
echo "Done"

Making dirs
Copying archive
Decompressing
Done


## Dataset(s)

### BSD-B

In [None]:
bsdb_path = "./data/bsd-b/"


class BSD_B_Dataset(Dataset):
    def __init__(self, path):
        self.root_path = Path(path)
        self.list_fname = self.root_path / "BSB_B_Centroid_train.txt"    # I guess BSB (rather than BSD) is a typo on their part?

        self.gt_fnames = []
        self.blur_fnames = []

        with open(self.list_fname) as f:
            for line in f:
                pair = line.split()
                self.gt_fnames.append(self.root_path / pair[0])
                self.blur_fnames.append(self.root_path / pair[1])

    def __len__(self):
        return len(self.gt_fnames)
    
    def __getitem__(self, idx):
        # open w/ PIL (no chance of BGR even by mistake! ^_^)
        # shape: (width, height, channel)
        gt = Image.open(self.gt_fnames[idx])
        blur = Image.open(self.blur_fnames[idx])

        # transform to numpy arrays, and byte to float
        gt = np.array(gt).astype(float) / 256.0
        blur = np.array(blur).astype(float) / 256.0

        return gt, blur

# testing this cell...
# TODO remove
d = BSD_B_Dataset(bsdb_path)
d[0][0].shape

---- 0.0 ,, 0.99609375


### RealBlur (+Tele)

In [5]:
realblur_path = "./data/realblur/"
realblurtele_path = "./data/realblur-tele/"

class RealBlurDataset(Dataset):
  def __init__(self, path, listname):

    self._base_dir = path
    train_file = self._base_dir + listname
    self._data_path = []
    with open(train_file, "r") as f:
      for line in f.readlines():
        s = line.split(" ")
        self._data_path.append([f"{self._base_dir}/{s[0]}", f"{self._base_dir}/{s[1]}"])
      f.close()

  def __getitem__(self, index):
    real = Image.open(self._data_path[index % len(self._data_path)][0])
    blurred = Image.open(self._data_path[index % len(self._data_path)][1])

    real = np.array(real).astype(float) / 256.0
    blurred = np.array(blurred).astype(float) / 256.0

    return real, blurred

  def __len__(self):
    return len(self._data_path)

## Computation

Which should be dataset-independent

In [6]:
# Computes the mean and std for each channel, simply by summing all pixels of all images (blur and gt alike)
# the variance is computed as E[x^2] - E[x]^2 to avoid a double loop
def ds_mean_and_std(dataset, N=None):
    # accumulators
    px_count = 0
    px_sums = [0,0,0]       # per channel
    px_sqr_sums = [0,0,0]   # per channel

    if N is None:
        N = len(dataset)   # the correct value, but I want the ability to run less of the dataset to test this function

    for ds_i in tqdm(range(N)):
        image_pair = dataset[ds_i]
        for img in image_pair:    # shape: (width, height, channel)
            # simple accumulation. An img has width*height pixels.
            px_count += img.shape[0] * img.shape[1]
            for chan_i in range(3):
                channel = img[:,:,chan_i]   # slice
                # accumulate per-channel.
                px_sums[chan_i] += np.sum(channel)
                px_sqr_sums[chan_i] += np.sum(channel**2)      # the square is applied per-element, as it should
    
    mean = [px_sums[i] / px_count for i in range(len(px_sums))]
    var = [(px_sqr_sums[i] / px_count) - (mean[i] ** 2) for i in range(len(mean))]
    std = [np.sqrt(var[i]) for i in range(len(var))]

    return mean, std

## Statistics beget normalization

In [None]:
# dataset = BSD_B_Dataset(bsdb_path)
# dataset = RealBlurDataset(realblur_path, "RealBlur_J_train_list.txt")
dataset = RealBlurDataset(realblurtele_path, "RealBlur_J_Tele_test_list.txt")

ds_mean_and_std(dataset)

## Inspecting the resulting data

In which I note the obtained results to save for future use.

### BSD-B Dataset

```python
mean = [0.4355969288502789, 0.43763443105665034, 0.3649651865988699]
std = [0.23582929701856753, 0.2207150368759555, 0.23132070247996644]
```

*(in 8 minutes)*

### RealBlur (J) Dataset

```python
mean = [0.2565184621726687, 0.24154863254378398, 0.20631386856996806]
std = [0.24273459459584631, 0.23389855206195417, 0.2270960826428365]
```

*(in 5 minutes)*

### RealBlur-Tele (J) Dataset

```python
mean = [0.3148849333529864, 0.285131326503854, 0.2496840560487442]
std = [0.22968544168045696, 0.21416453673706182, 0.21855125453805085]
```

*(in 1 minute)*

### ImageNet

These are what you usually find online, and they are sometimes passable with other natural-scenes dataset.

Used to use them, now obviously not anymore. Noting them here so I don't forget them, just in case.

```python
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
```