# Data Loading and Processing (pytorch)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset

In [104]:
# import images
import skimage import io, transform

In [92]:
import pathlib as plib
import pdb

In [24]:
a = np.array([1,2,3,4,5,6]); print(a.shape)

(6,)


In [25]:
a.reshape(-1,2)

array([[1, 2],
       [3, 4],
       [5, 6]])

In [26]:
ls data

breast-cancer-wisconsin.data.csv  names_train.csv.gz
diabetes.csv.gz                   [1m[34mprocessed[m[m/
[1m[34mdogscats[m[m/                         [1m[34mraw[m[m/
dogscats.zip                      shakespeare.txt.gz
names_test.csv.gz


In [10]:
Dataset.__init__

[0;31mInit signature:[0m [0mDataset[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
An abstract class representing a Dataset.

All other datasets should subclass it. All subclasses should override
``__len__``, that provides the size of the dataset, and ``__getitem__``,
supporting integer indexing in range from 0 to len(self) exclusive.
[0;31mFile:[0m           ~/anaconda3/envs/fastai-cpu/lib/python3.6/site-packages/torch/utils/data/dataset.py
[0;31mType:[0m           type


In [86]:
class BreastCancerDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        """
        - csv_file (string): path to the csv file
        - root_dir (string): directory with all the images
        - transform (callable, optional): optional transform to be applied to a sample
        """
        all_data = pd.read_csv(csv_file, header=None)
        self.data = np.array(all_data.iloc[:, 1:-1]) #ignore first column (patient ID)
        self.target = np.array(all_data.iloc[:, -1].map({2:0, 4:1}))
        self.root_dir = root_dir
        self.transform = transform
    
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        "returns a sample of format (feature 1,...,featureD, class)"
        return (self.data[idx, :], self.target[idx])
    

In [87]:
fpath = './data/breast-cancer-wisconsin.data.csv'
dataset = BreastCancerDataset(fpath, './data')
a = dataset[10]

In [88]:
a

(array([1, 1, 1, 1, 1, '1', 3, 1, 1], dtype=object), 0)

In [84]:
type(dataset.data.iloc[10,:])

pandas.core.series.Series

In [89]:
    dataset[40]

In [99]:
def count_files(dirpath, fmt=None):
    """
    Args
    - dirpath (Path object): directory to inspect
    - fmt (string): file format including '.', eg: '.png', '.jpg', '.csv'
    Returns a dictionary of counts for each filetype in dirpath"""
    import collections
    counts =  collections.Counter(f.suffix for f in dirpath.iterdir())
    if fmt is None:
        return counts
    else:
        return counts[fmt]

In [None]:
class DogCatDataset(Dataset):
    def __init__(self, dirpath, animalType):
        """
        Args
        - dirpath (Path object): path to a folder that contains 
        'dogs' and 'cats' subfolers. For instance, `data/dogscats/train` or 
        `data/dogscats/test`
        - animalType (string): 'dog' or 'cat'
        
        Sample of this dataset will be a dictionary of {"image": img, "class": 0 or 1}
        """
        self.animalType = animalType
        self.animalClass = 0 if animalType == 'dog' else 1
        self.full_dirpath = dirpath / (animalType + "s")
        self.num_data = count_files(self.full_dirpath, fmt='.jpg')
    def __len__(self):
        return self.num_data
    def __getitem__(self, idx):
        img_name = self.full_dirpath / f'{self.animalType}.{idx}.jpg'
        
    
        
        

## Dataset and DataLoader Rhythm
1. Define custom dataset
  - __init__(self): download, read data, etc
  - __getitem__(self, idx): return one item on the index
  - __len__(self): return the data length
  
  
2. Creat a new dataloader instance by passing the custom dataset and batch size, etc

### Template custom Dataset class: [resource](https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel.html)
Two python dictionary, `partition` and `labels`  
- `partition['train']` = a list of string ids for training data points    
  `partition['dev']` = a list of string ids for dev data points  
  `partition['test']` = a list of string ids for test points  

- labels:  `labels[somdID]` = label of the `someID` data point for someID in allIDs



In [109]:
# 1. custom dataset
class OldDiabetesDataset(Dataset):
    def __init__(self):
        xy = np.loadtxt('./data/diabetes.csv.gz', delimiter=',', dtype=np.float32)
        self.x_data = xy[:,:-1]
        self.y_data = xy[:, -1]
        self.len = xy.shape[0]
        
    def __getitem__(self, idx):
        "a sample is [x_data[idx], y_data[idx]]"
        return self.x_data[idx], self.y_data[idx]
    
    def __len__(self):
        return self.len

In [119]:
class DiabetesDataset(Dataset):
    def __init__(self, ids):
        # Ideally we wouldn't load all data here.
        # Rather, read each file when __getitem__ is called
        xy = np.loadtxt('./data/diabetes.csv.gz', delimiter=',', dtype=np.float32)
        self.x_data = xy[:,:-1]
        self.y_data = xy[:, -1]
        self.len = xy.shape[0]
        
        self.ids = ids
        
    def __getitem__(self, index):
        "Returns a sample in the format of [x_data[idx], y_data[idx]]"
        ID = self.ids[index]
        return self.x_data[ID], self.y_data[ID]
    
    def __len__(self):
        return self.len
    

In [114]:
# Partition the datapoint indices into train, dev, test groups.
xy = np.loadtxt('./data/diabetes.csv.gz', delimiter=',', dtype=np.float32)
labels = xy[:,-1]
RSEED = 11
from sklearn.model_selection import train_test_split
tr_dev_ids, test_ids = train_test_split(labels, test_size=0.33, random_state=RSEED) #stratified 
train_ids, dev_ids = train_test_split(tr_dev_ids, test_size=0.5, random_state=RSEED)

In [120]:
partition = {'train': train_ids, 'dev': dev_ids, 'test': test_ids}
train_dataset = DiabetesDataset(partition['train'])
dev_dataset = DiabetesDataset(partition['dev'])
test_dataset = DiabetesDataset(partition['test'])

In [None]:
#2. new dataloader instance
data_params = {'batch_size': 32,
              'shuffle': True,
              'num_workers':2}
train_loader = DataLoader(dataset=train_dataset, **data_params)
