In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

from skimage import io, transform

import matplotlib.pyplot as plt # for plotting
import numpy as np
import pandas as pd
import glob
import os
from tqdm import tqdm

from IPython.display import Image
import cv2

In [1]:
import pandas as pd

In [2]:
# Load the public test csv. It has no header row, so pandas labels the columns 0, 1, 2, ...
data = pd.read_csv(
    "A2.2-Data/public_test.csv",
    header=None,
)

In [3]:
# Remove the leading column (the labels, per the ImageDataset docstring further down)
# and save a pixel-only copy of the public test set.
# The column is dropped by its label (0, assigned because the csv was read with
# header=None) rather than by position: once it is gone, errors="ignore" turns the
# drop into a no-op, so re-running this cell cannot strip further pixel columns the
# way `data = data.iloc[:, 1:]` would.
data = data.drop(columns=0, errors="ignore")
data.to_csv("A2.2-Data/public_test_data.csv", index=False, header=False)

In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the remaining columns.
data.describe()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,3063,3064,3065,3066,3067,3068,3069,3070,3071,3072
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,129.174,128.68,129.70825,130.631,131.489,131.87875,132.45325,132.951,133.9895,134.67025,...,113.931,114.12975,114.76575,114.60725,114.442,114.9475,115.1775,115.231,115.141,115.74
std,73.535349,72.529737,72.471064,71.911158,71.555195,71.408318,71.363583,71.379184,71.317624,71.555004,...,63.296341,63.221497,63.57414,63.599105,63.78168,63.822943,64.205242,64.397728,64.599368,65.228963
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,68.0,69.0,72.0,74.0,75.0,75.0,76.0,75.0,77.0,76.0,...,65.0,66.0,67.0,67.0,67.0,66.0,66.0,66.0,65.0,66.0
50%,124.0,124.0,126.0,126.0,127.0,128.0,130.0,130.0,132.0,133.0,...,107.0,106.0,107.0,106.0,106.0,106.0,107.0,107.0,107.0,107.0
75%,187.0,186.0,188.0,189.0,188.0,188.0,190.0,190.0,191.0,192.0,...,157.0,157.0,158.0,158.0,158.0,158.0,158.0,159.0,158.0,160.0
max,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,...,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0


In [None]:
# Dataset class (wrap it in a torch DataLoader to get batches)
# if BATCH_SIZE = N, the dataloader returns an images tensor of size [N, C, H, W] and labels of size [N]
class ImageDataset(Dataset):
    """
    Dataset of flattened images stored one per row in a headerless csv file.

    Every row holds the pixel values of one image (H*W*C numbers, 3072 for the
    default 32x32x3 shape), optionally preceded by an integer label in the
    first column.
    """

    def __init__(self, data_csv, train = True , img_transform=None, img_shape=(32, 32, 3)):
        """
        Dataset init function

        INPUT:
        data_csv: Path to a headerless csv file holding one image per row.
        train:
            True: if the csv file has [labels,data] (Train data and Public Test Data)
            False: if the csv file has only [data] and labels are not present.
        img_transform: Callable applied to every (H, W, C) uint8 array in __getitem__.
            If None, the array is returned untransformed.
        img_shape: (H, W, C) shape each csv row is reshaped to. Defaults to (32, 32, 3).
        """
        self.data_csv = data_csv
        self.img_transform = img_transform
        self.is_train = train
        self.img_shape = tuple(img_shape)

        data = pd.read_csv(data_csv, header=None)
        if self.is_train:
            images = data.iloc[:, 1:].to_numpy()
            labels = data.iloc[:, 0].astype(int).to_numpy()
        else:
            # Convert to an ndarray here as well: indexing a DataFrame with an
            # integer selects a *column* by label, so __getitem__ would return
            # one pixel position across all images instead of one image row.
            images = data.to_numpy()
            labels = None

        self.images = images
        self.labels = labels
        print("Total Images: {}, Data Shape = {}".format(len(self.images), images.shape))

    def __len__(self):
        """Returns total number of samples in the dataset"""
        return len(self.images)

    def __getitem__(self, idx):
        """
        Loads image of the given index and performs preprocessing.

        INPUT:
        idx: index of the image to be loaded.

        OUTPUT:
        sample: dictionary with keys
            images: result of img_transform on the (H, W, C) uint8 array
                    (the raw array when no transform was given).
            labels: label of the image, or -1 when the dataset has no labels.
        """
        image = np.asarray(self.images[idx]).astype(np.uint8).reshape(self.img_shape)

        label = self.labels[idx] if self.is_train else -1

        # img_transform defaults to None, so only call it when one was supplied.
        if self.img_transform is not None:
            image = self.img_transform(image)

        return {"images": image, "labels": label}

In [None]:
# Data Loader Usage

BATCH_SIZE = 200 # Batch Size. Adjust accordingly
NUM_WORKERS = 20 # Number of worker processes used for data loading. Adjust accordingly.
DATA_DIR = "/mnt/scratch1/siy197580/COL341/cifar" # Directory holding the csv files. Adjust accordingly.

# ToPILImage -> ToTensor turns each (H, W, C) uint8 array into a float tensor of shape [C, H, W] in [0, 1].
img_transforms = transforms.Compose([transforms.ToPILImage(),transforms.ToTensor()])

# Train DataLoader
train_data = os.path.join(DATA_DIR, "train_data.csv") # Path to train csv file
train_dataset = ImageDataset(data_csv = train_data, train=True, img_transform=img_transforms)
# NOTE(review): shuffle=False serves the samples in file order on every epoch;
# consider shuffle=True when this loader is used for training.
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=False, num_workers = NUM_WORKERS)

# Test DataLoader
test_data = os.path.join(DATA_DIR, "public_test.csv") # Path to test csv file
# train=True because the public test csv is read as [labels, data], like the train csv.
test_dataset = ImageDataset(data_csv = test_data, train=True, img_transform=img_transforms)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle=False, num_workers = NUM_WORKERS)

In [None]:
# One full pass (a single epoch) over the training loader.
# Each batch arrives as a dict; unpack its image and label tensors.
for batch_idx, sample in enumerate(train_loader):
    images, labels = sample['images'], sample['labels']