# PyTorch

In [None]:
# %pip install torch torchvision

In [10]:
from torchvision import datasets, transforms

# Build the MNIST dataset rooted at ./data with train=False, downloading the
# files when needed and attaching the ToTensor transform.
dataset = datasets.MNIST(
    root='data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

In [None]:
# List the names of the instance attributes stored on the dataset object.
dataset.__dict__.keys()

In [None]:
# Show the transform and target_transform attributes attached to the dataset.
dataset.transform, dataset.target_transform

In [None]:
# Shapes of the `data` and `targets` attributes, plus the value of the `train` flag.
dataset.data.shape, dataset.targets.shape, dataset.train

In [None]:
# print() writes the dataset's string form; the trailing expression displays its type.
print(dataset)
type(dataset)

In [None]:
# Length of the dataset as reported by len().
len(dataset)

The `torchvision` library provides the `datasets` and `transforms` modules; data loaders come from `torch.utils.data` (`DataLoader`).  

`MNIST` is a dataset class provided by the `datasets` module.  
When loaded with `download=True`, it downloads the MNIST dataset and stores it within the working directory (here we're in `tests/notebooks/` of the main repo). The following directories and files will be created (each `.gz` archive together with its decompressed counterpart):

```text
notebooks/ # -- the wd of this notebook --
└── data/
    └── MNIST/
        └── raw/
            ├── t10k-images-idx3-ubyte  
            ├── t10k-labels-idx1-ubyte
            ├── train-images-idx3-ubyte
            ├── train-labels-idx1-ubyte
            ├── t10k-images-idx3-ubyte.gz
            ├── t10k-labels-idx1-ubyte.gz
            ├── train-images-idx3-ubyte.gz
            └── train-labels-idx1-ubyte.gz

```

In [None]:
# Build both dataset variants inside this cell so it also works on a fresh
# kernel: the original relied on names that had not been defined yet at this
# point of a top-to-bottom run.
data_wo_transform = datasets.MNIST('data', train=False)
data_w_transform = datasets.MNIST('data', train=False, transform=transforms.ToTensor())

# Element-wise comparison of the `data` attribute of the two variants.
bool_tensor = data_w_transform.data == data_wo_transform.data

# True only when the mask contains no False at all.
bool_tensor.all()

In [None]:
# Two datasets built with the same root and transform; only `train` differs.
data_w_train = datasets.MNIST(
    root='data', train=True, download=True, transform=transforms.ToTensor()
)
data_wo_train = datasets.MNIST(
    root='data', train=False, download=True, transform=transforms.ToTensor()
)

# Compare the shapes of the `data` attribute for both values of `train`.
(data_w_train.data.shape, data_wo_train.data.shape)

In [27]:
# Same root and train=False for both; the second one additionally gets the
# ToTensor transform.
data_wo_transform = datasets.MNIST(root='data', train=False)
data_w_transform = datasets.MNIST(
    root='data',
    train=False,
    transform=transforms.ToTensor(),
)

In [None]:
# Shape of the `data` attribute for the dataset built without a transform.
data_wo_transform.data.shape

In [None]:
# Pass only the root directory; every other constructor argument keeps its default.
data_default=datasets.MNIST('data')

In [None]:
# Value of the `train` flag chosen by the defaults, and the resulting dataset length.
data_default.train, len(data_default)

In [None]:
# -- dataset exploration: print both shapes (one per line), then list the
# distinct values found in the targets.
print(dataset.data.shape, dataset.targets.shape, sep='\n')
dataset.targets.unique()  # author's note: data (X) and targets (y) are both of type tensor

In [None]:
# Shape of the `data` attribute (same value the exploration cell prints).
dataset.data.shape

In [None]:
# Instantiate MNIST with nothing but the root directory and report the
# attribute values that result from the defaults.
dataset_default = datasets.MNIST(root='data')
print(
    'the default args:'
    f'\n\troot:{dataset_default.root}'
    f'\n\ttransform:{dataset_default.transform}'
    f'\n\ttarget_transform:{dataset_default.target_transform}'
    f'\n\ttransforms:{dataset_default.transforms}'
    f'\n\ttrain:{dataset_default.train}'
    f'\n\tdata:{dataset_default.data}'
    f'\n\ttargets:{dataset_default.targets}'
)

# Our pytorch-wannabe lib code

## Data retrieval and loading

Main tasks:  
* retrieve data from web  
* save it in directory  
* transform image to tensor through a list intermediate

### Image transformation

#### playground

In [8]:
from PIL import Image
import numpy as np

In [4]:
# Open an image with PIL; the following cells convert it to lists of pixels.
image_path='../test/ibm-nn.png'
image = Image.open(image_path)

In [None]:
# Re-open the picture and convert it to RGB mode (a no-op if it already is RGB).
image = Image.open(image_path).convert("RGB")

# getdata() is consumed here as one flat sequence of pixels ...
pixels = list(image.getdata())

# ... which is cut into `height` consecutive rows of `width` pixels each,
# giving the nested layout [rows][columns][RGB].
width, height = image.size
pixels_3d = [pixels[row * width:(row + 1) * width] for row in range(height)]

# Let NumPy report the resulting dimensions.
np.array(pixels_3d).shape

In [None]:
# Dimensions of the nested pixel list once converted to a NumPy array.
np.array(pixels_3d).shape

In [None]:
# Re-open the picture and convert it to "L" mode.
image = Image.open(image_path).convert("L")

# getdata() is consumed here as one flat sequence of pixel values ...
pixels = list(image.getdata())

# ... which is cut into `height` consecutive rows of `width` values each,
# giving a 2D list [rows][columns].
width, height = image.size
pixels_2d = [pixels[row * width:(row + 1) * width] for row in range(height)]

# Let NumPy report the resulting dimensions.
np.array(pixels_2d).shape

In [None]:
# Check whether the RGB-converted copy compares equal to the current image object.
image.convert("RGB")==image

In [None]:
# image.size is the pair the earlier cells unpack as (width, height).
image.size

In [None]:
# Flat sequence of the RGB pixels of the whole image.
flat_pixels = list(image.convert("RGB").getdata())

# Regroup the flat sequence into rows of `image.width` pixels each. Slicing a
# list already yields a new list, so no extra list() call is needed.
row_len = image.width
pixels = [flat_pixels[r * row_len:(r + 1) * row_len] for r in range(image.height)]

# Preview a small corner instead of dumping every pixel into the cell output
# (the original displayed the full nested list, and its bare `pixels`
# expression in the middle of the cell displayed nothing at all).
[row[:5] for row in pixels[:2]]

In [None]:
import cv2 #quite faster than PIL

# Load the image
# NOTE(review): earlier cells read '../test/ibm-nn.png' - confirm which location is correct.
image_path = "../ibm-nn.png"  # Replace with your image file path
image = cv2.imread(image_path)

# cv2.imread does not raise when the file is missing or unreadable: it returns
# None, which would otherwise only show up as an obscure cvtColor error below.
if image is None:
    raise FileNotFoundError(f"Could not read image at '{image_path}'")

# Convert BGR (OpenCV default) to RGB
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Convert the NumPy array to a nested Python list [rows][columns][RGB]
pixels_3d = image.tolist()

# First pixel's RGB values
print(pixels_3d[0][0])


In [None]:
# A cell only displays its last expression, so the three sizes are returned
# together as one tuple: (height, width, number of channels).
len(pixels_3d), len(pixels_3d[0]), len(pixels_3d[0][0])

In [None]:
import requests
from pathlib import Path
import gunzip

# Target directory for the downloaded archive. parents=True creates the
# intermediate 'data' directory too, so the cell does not depend on another
# cell having created it first.
path=Path('data/test')
path.mkdir(parents=True, exist_ok=True)

url='https://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz'
filename=url.split('/')[-1]
filepath=path/filename

# Download only when the archive is not already on disk.
if filepath.exists():
    print(f'{filepath} already exists')
else:
    print(f'downloading {filename}')
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    r=requests.get(url, timeout=30)
    if r.status_code == 200:
        filepath.write_bytes(r.content)
    else:
        # Never store an error page under a .gz name: the "already exists"
        # check above would then skip the download on every later run.
        print(f'Failed to download file. Status code: {r.status_code}')

In [None]:
import requests

# Source archive and the local file name it is saved under (current directory).
url = "https://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"
output_file = "t10k-labels-idx1-ubyte.gz"

# Send a browser-like User-Agent header with the request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    )
}

try:
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to download file. Status code: {response.status_code}")
    else:
        # Persist the raw response body, then report success.
        with open(output_file, "wb") as out:
            out.write(response.content)
        print(f"File downloaded successfully as '{output_file}'.")
except Exception as err:
    # Request and file-writing errors alike are reported instead of raised.
    print(f"An error occurred: {err}")


In [None]:
import os

# Keep the location both as a plain string and as a Path object.
p = '../test/ibm-nn.png'
P = Path(p)

# Ask the os module whether that path exists.
os.path.exists(P)

# os.mkdir()

In [None]:


# Decompression uses the standard-library gzip module; it is imported here so
# the cell does not depend on an import made by another cell.
import gzip

# Decompress the downloaded archive unless that was already done.
# The target file name is the archive name without the .gz extension.
filename_no_gz=filename.replace('.gz','')
filepath_no_gz=path/filename_no_gz
if filepath_no_gz.exists():
    print(f'{filepath_no_gz} already exists')
else:
    # Read the whole archive and close it before the output is created.
    with open(filepath, 'rb') as f:
        file_content = f.read()
    gunzip_content = gzip.decompress(file_content)
    with open(filepath_no_gz, 'wb') as f:
        f.write(gunzip_content)

In [None]:
import gzip
import shutil

def decompress_gz(filepath):
    """
    Decompress a .gz file next to itself, dropping the '.gz' extension.

    Args:
        filepath (str | os.PathLike): Path to the .gz compressed file.

    Raises:
        ValueError: If the path does not end with '.gz' (the output path would
            otherwise equal the input path and overwrite the archive).
    """
    # Accept pathlib.Path objects as well as plain strings.
    source = str(filepath)
    if not source.endswith('.gz'):
        raise ValueError("Input file must have a .gz extension.")

    # Remove exactly the trailing '.gz'. str.rstrip('.gz') would strip any run
    # of the characters '.', 'g' and 'z' (e.g. 'dataz.gz' -> 'data').
    filepath_no_gz = source[:-len('.gz')]

    with gzip.open(source, 'rb') as f_in:
        with open(filepath_no_gz, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

# Example usage
decompress_gz(filepath)

In [None]:
import gzip
import shutil
import os

def decompress_gz(input_gz_path):
    """
    Decompresses a .gz file to a regular file with the same name minus the .gz extension.

    Args:
        input_gz_path (str | os.PathLike): Path to the .gz compressed file.

    Returns:
        str | None: Path to the decompressed file, or None when decompression
        failed (the failure is printed rather than raised).

    Raises:
        ValueError: If the path does not end with '.gz'.
    """
    # Normalise pathlib.Path (or any os.PathLike) to str so the suffix check
    # and the slicing below work for both kinds of argument.
    input_gz_path = os.fspath(input_gz_path)

    # Derive the output file path by removing the .gz extension
    if not input_gz_path.endswith('.gz'):
        raise ValueError("Input file must have a .gz extension.")

    output_file_path = input_gz_path[:-3]

    try:
        with gzip.open(input_gz_path, 'rb') as gz_file:
            with open(output_file_path, 'wb') as out_file:
                shutil.copyfileobj(gz_file, out_file)
    except FileNotFoundError:
        print(f"Error: File '{input_gz_path}' not found.")
        return None
    except Exception as e:
        # The output file was already created when decompression failed; remove
        # the partial result so it is not mistaken for a finished file.
        if os.path.exists(output_file_path):
            os.remove(output_file_path)
        print(f"An error occurred: {e}")
        return None

    print(f"Decompressed '{input_gz_path}' to '{output_file_path}' successfully.")
    return output_file_path

# Example usage:
# decompress_gz('example.gz')

decompress_gz(filepath)

In [None]:
# NOTE(review): the download/decompress cells that follow use the standard-library
# `gzip` module; confirm whether this third-party install is still needed.
%pip install gunzip

In [None]:
import requests
from pathlib import Path
import gzip

# Target directory for the download. parents=True also creates the
# intermediate 'data' directory, so the cell works even if nothing created it.
path=Path('data/test')
path.mkdir(parents=True, exist_ok=True)

# Send a browser-like User-Agent header with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
url='https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz'
filename=url.split('/')[-1]
filepath=path/filename

try:
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code == 200:
        # Only a successful response is written to disk.
        with open(filepath, "wb") as f:
            f.write(response.content)
        print(f"File downloaded successfully as '{filepath}'.")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")
except Exception as e:
    # Request and file-writing errors alike are reported instead of raised.
    print(f"An error occurred: {e}")

In [14]:
# Name and location of the decompressed file: the archive name with '.gz' removed.
filename_no_gz = filename.replace('.gz', '')
filepath_no_gz = path / filename_no_gz

if not filepath_no_gz.exists():
    # Read the archive, inflate it in memory, then write the result next to it.
    file_content = filepath.read_bytes()
    gunzip_content = gzip.decompress(file_content)
    filepath_no_gz.write_bytes(gunzip_content)
else:
    print(f'{filepath_no_gz} already exists')