# Modules and Packages

In [47]:
import pickle,csv
from dataloader import get_data

* [Python Standard Library](https://docs.python.org/3/library/) - Python Runtime Services, Generic Operating System Services, Debugging
* Numpy, Matplotlib
* PyTorch, TensorFlow

# Data Sources  and Common data store formats

* Python objects - pkl 
* Numeric data - npz 
* Multi-data - csv 
* Plain text - txt 
* Large Datasets - HDF5 

In [48]:
import pickle
# A plain Python object to persist.
obj = {'age': 23, 'hobbies': ['photography', 'running', 'travelling']}

# Write inside a context manager so the file is flushed and closed
# before it is read back.
with open('store.pkl', 'wb') as store_file:
    pickle.dump(obj, store_file)

# NOTE: only unpickle files you created yourself; pickle.load can execute
# arbitrary code when given untrusted data.
with open('store.pkl', 'rb') as store_file:
    obj2 = pickle.load(store_file)
obj2

{'age': 23, 'hobbies': ['photography', 'running', 'travelling']}

In [49]:
import csv
import pprint
# newline='' hands newline handling to the csv module, as its documentation
# recommends; without it, newlines embedded in quoted fields may be misread.
with open('data/iris.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    # Each row comes back as a list of strings; the first row is the header.
    for row in reader:
        print(row)

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
['5.1', '3.5', '1.4', '0.2', 'setosa']
['4.9', '3', '1.4', '0.2', 'setosa']
['4.7', '3.2', '1.3', '0.2', 'setosa']
['4.6', '3.1', '1.5', '0.2', 'setosa']
['5', '3.6', '1.4', '0.2', 'setosa']
['5.4', '3.9', '1.7', '0.4', 'setosa']
['4.6', '3.4', '1.4', '0.3', 'setosa']
['5', '3.4', '1.5', '0.2', 'setosa']
['4.4', '2.9', '1.4', '0.2', 'setosa']
['4.9', '3.1', '1.5', '0.1', 'setosa']
['5.4', '3.7', '1.5', '0.2', 'setosa']
['4.8', '3.4', '1.6', '0.2', 'setosa']
['4.8', '3', '1.4', '0.1', 'setosa']
['4.3', '3', '1.1', '0.1', 'setosa']
['5.8', '4', '1.2', '0.2', 'setosa']
['5.7', '4.4', '1.5', '0.4', 'setosa']
['5.4', '3.9', '1.3', '0.4', 'setosa']
['5.1', '3.5', '1.4', '0.3', 'setosa']
['5.7', '3.8', '1.7', '0.3', 'setosa']
['5.1', '3.8', '1.5', '0.3', 'setosa']
['5.4', '3.4', '1.7', '0.2', 'setosa']
['5.1', '3.7', '1.5', '0.4', 'setosa']
['4.6', '3.6', '1', '0.2', 'setosa']
['5.1', '3.3', '1.7', '0.5', 'setosa']
['4.

# Data Containers

Python offers a variety of containers, each dedicated to a different purpose and constrained so that it can harness certain optimisations:
* lists - generic container , numeric indexing
* tuples - immutable lists 
* dictionaries - key-value organisation 
* sets - collection of unique elements

## Lists
Pay attention: these techniques are used for data pre-processing and manipulation in the batch-loading phase.

In [50]:
# A homogeneous list: every element is an int.
homo_list = [12, 45, 900, 78, 34, 66, 17, 85]

# A heterogeneous list: one container can mix int, str and float.
hetero_list = [10, 'foo', 1.3]
print(hetero_list[0])

# A list of tuples: each tuple groups an int, a str and a float.
tuple_list = [
    (1, 'Erebor', 800.45),
    (2, 'Rivendell', 500.67),
    (3, 'Shire', 900.12),
    (4, 'Mordor', 1112.30),
]

10


**Note**: Lists in batched data processing are typically lists of tuples, e.g.  
`batch_instance = (utter, utterance_length, transcript, transcript_lens)`

### Operations

In [51]:
# Multiplying a list by an integer repeats its contents:
# l3 is homo_list followed by a second copy of homo_list.
l3 = homo_list * 2
l3

[12, 45, 900, 78, 34, 66, 17, 85, 12, 45, 900, 78, 34, 66, 17, 85]

This is different from the result you'd get with NumPy arrays, where `*` and `+` operate element-wise instead of repeating or concatenating.

In [52]:
# `+` on two lists concatenates them into a new list; neither operand is modified.
homo_list + hetero_list

[12, 45, 900, 78, 34, 66, 17, 85, 10, 'foo', 1.3]

`sorted`, `sum`,`max`,`min`

In [53]:
# sorted() returns a new list in ascending order; homo_list itself is left untouched.
print(sorted(homo_list))
# sum() adds up all the elements of the list.
print(sum(homo_list))

[12, 17, 34, 45, 66, 78, 85, 900]
1237


###  Conditional operations - filtering:
There are two ways to filter lists:
* Index based - Slicing and Dicing
* Condition based - List comprehension

#### Slicing and Dicing
`sliced_list = original_list[start_idx : end_idx+1 : step]` (the stop bound is exclusive, hence `end_idx+1`)

In [54]:
# The whole list, for reference.
print(homo_list)
# Omitted start means "from index 0"; stop index 5 is excluded, so this is the first five elements.
print(homo_list[:5])
# Negative step walks backwards: start at the last element (-1), take every second one, stop before index 0.
print(homo_list[-1:0:-2])

[12, 45, 900, 78, 34, 66, 17, 85]
[12, 45, 900, 78, 34]
[85, 66, 78, 45]


#### List comprehension

`result = [ transform   iteration   filter ]`
~~~~
res = [ manipulation(instance[2]) for instance in sorted_dataset ]
~~~~

In [55]:
# Condition-based filtering: keep only the values greater than 50,
# in their original order.
res = [value for value in homo_list if value > 50]
res

[900, 78, 66, 85]

In [56]:
%%timeit  
# Benchmark: build a 10,000-element list with a list comprehension.
res = [i for i in range(10000)]

1.28 ms ± 29.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [57]:
%%timeit
# Benchmark: build a 10,000-element list with an explicit loop and append(),
# for comparison against the list-comprehension timing.
res = []
for i in range(10000):
    res.append(i)

2.23 ms ± 88.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Usecase: Data Preprocessing and Loading


In [58]:
# Load the batch through get_data, the helper imported from the local
# `dataloader` module at the top of the notebook.
batch_dataset = get_data()
# Each element of the batch is a tuple (see the printed type).
print(type(batch_dataset[0]))

# Layout of one element:
# (utterance, utterance_size, transcripts, transcripts_size)
batch_dataset[0]

<class 'tuple'>


(array([[ -2.7760592 , -10.653754  ,  -9.3995695 , ...,   0.2363553 ,
          -0.5805931 ,  -0.8171587 ],
        [ -2.2426343 ,  -9.265765  ,  -9.315787  , ...,  -0.26111507,
          -0.46208572,  -0.9445448 ],
        [ -2.7435112 ,  -6.7105646 , -11.795384  , ...,  -0.6318717 ,
          -0.56550837,  -1.3585529 ],
        ...,
        [ -6.937312  , -19.204508  , -24.954329  , ...,  -1.6914577 ,
          -1.5678849 ,  -1.5754833 ],
        [ -6.4351797 , -18.217642  , -20.86373   , ...,  -2.0262208 ,
          -1.7305894 ,  -1.2664866 ],
        [ -5.921312  , -16.417336  , -19.451906  , ...,  -2.7078733 ,
          -2.3750868 ,  -2.1822453 ]], dtype=float32),
 402,
 array([25, 13, 10,  1, 11, 10, 18,  6, 17, 10,  1, 21, 23, 20,  9, 26,  8,
        10, 24,  1,  6,  1, 17, 14, 25, 25, 10, 23,  1, 20, 11,  1, 25, 28,
        20,  1, 25, 20,  1, 11, 20, 26, 23,  1, 30, 20, 26, 19, 12,  1, 14,
        19,  1, 19, 20, 27, 10, 18,  7, 10, 23,  1,  6, 19,  9,  1,  9, 10,
         8, 

In [59]:
# Sorting: copy the batch and order it by tuple field 1 (utterance size),
# smallest first. The sort is stable, so ties keep their original order.
sorted_dataset = list(batch_dataset)
sorted_dataset.sort(key=lambda instance: instance[1])

# Max: pick the instance with the largest field 3 (transcript size)
# and read that field back.
max_transcript_len = max(batch_dataset, key=lambda instance: instance[3])[3]

# List comprehension for extraction: keep only fields 2 and 3
# (transcript, transcript size) of every instance, in sorted order.
transcripts = [(instance[2], instance[3]) for instance in sorted_dataset]

#list comprehension for manipulation 
def manipulation(data):
    """Return the transpose of a matrix.

    Args:
        data: Any object exposing a ``.T`` attribute, e.g. a NumPy array.

    Returns:
        ``data.T``. For a 1-D NumPy array this is the array unchanged.
    """
    return data.T

# Apply manipulation() to field 2 (the transcript) of every instance.
# Despite its name, pad_len holds the transformed transcripts, not lengths.
pad_len = [manipulation(entry[2]) for entry in sorted_dataset]
pad_len[0]

array([25, 13, 10,  1, 21, 20, 21, 26, 17,  6, 25, 14, 20, 19,  1, 17, 14,
       27, 10, 24,  1,  7, 30,  1, 13, 10, 23,  9, 14, 19, 12,  1, 12, 20,
        6, 25, 24,  1,  6, 19,  9,  1, 24, 13, 10, 10, 21,  1, 20, 23,  1,
        7, 30,  1, 25, 23,  6,  9, 14, 19, 12])

### Classes

Specifically useful for datasets that are supposed to be 'iterable'

### Iterable and Iterators

In [60]:
class IterableADT:
    """Bundle two data sources and one label source behind a single index.

    The three sources must have the same length. Indexing the object
    returns the matching items of all three as one tuple. Because it
    defines ``__len__`` and ``__getitem__``, it can also be iterated with
    a plain ``for`` loop when the sources raise ``IndexError`` past their end.
    """

    def __init__(self, train_data_src, train_data_src2, train_label_src):
        """Store the sources and check that their lengths agree.

        Raises:
            AssertionError: if the three sources differ in length.
        """
        self.x = train_data_src
        self.x2 = train_data_src2
        self.y = train_label_src
        # Chained comparison: x vs x2 first, then x2 vs y.
        assert len(self.x) == len(self.x2) == len(self.y)

    def __len__(self):
        """Return the number of items, taken from the first data source."""
        return len(self.x)

    def __getitem__(self, key):
        """Return ``(x[key], x2[key], y[key])`` as a tuple."""
        first_item = self.x[key]
        second_item = self.x2[key]
        label = self.y[key]
        return (first_item, second_item, label)
    

### Generators
Instead of creating classes for iterators, you can use a generator.
Generators relieve the developer of recording the state of the iteration.
Put simply, generators are functions that use the `yield` statement instead of `return`.


In [61]:
def pairwise_generator(input_data):
    """Yield consecutive, non-overlapping pairs from a sequence.

    Args:
        input_data: An indexable sequence that supports ``len()``, laid out
            as ``[a0, b0, a1, b1, ...]``.

    Yields:
        Tuples ``(input_data[i], input_data[i + 1])`` for ``i = 0, 2, 4, ...``.
        If the sequence has an odd number of items, the trailing unpaired
        item is skipped instead of raising ``IndexError``.
    """
    # Stopping at len - 1 guarantees that index i + 1 always exists.
    for i in range(0, len(input_data) - 1, 2):
        yield (input_data[i], input_data[i + 1])


data = [1, 'one', 2, 'two', 3, 'three', 4, 'four', 5, 'five']
# Calling the function only creates the generator; its body runs lazily,
# one pair per loop iteration.
generator = pairwise_generator(data)
for elt in generator:
    print(elt)

(1, 'one')
(2, 'two')
(3, 'three')
(4, 'four')
(5, 'five')


# Debugging - Pdb

In [62]:
import pdb  # standard-library interactive debugger
def pairwise_generator(input_data):  # the pairing generator again, instrumented for a pdb walkthrough
    pdb.set_trace()  # breakpoint: pauses here and waits for (Pdb) input (n = next line, c = continue), so it blocks an unattended Run All
    for i in range(0,len(input_data),2):        # visit indices 0, 2, 4, ...
        yield (input_data[i],input_data[i+1])  # emit one (item, following item) pair
        
data = [1,'one',2,'two',3,'three',4,'four']        
generator = pairwise_generator(data)  # creating the generator runs none of its body yet
for elt in generator:  # the first iteration enters the body and hits the breakpoint
    print(elt)

> <ipython-input-62-327e211f3ec6>(4)pairwise_generator()
-> for i in range(0,len(data),2):
(Pdb) n
> <ipython-input-62-327e211f3ec6>(5)pairwise_generator()
-> yield (input_data[i],input_data[i+1])
(Pdb) n
(1, 'one')
> <ipython-input-62-327e211f3ec6>(4)pairwise_generator()
-> for i in range(0,len(data),2):
(Pdb) elt
(1, 'one')
(Pdb) input_data[i]
1
(Pdb) n
> <ipython-input-62-327e211f3ec6>(5)pairwise_generator()
-> yield (input_data[i],input_data[i+1])
(Pdb) input_data[i]
2
(Pdb) n
(2, 'two')
> <ipython-input-62-327e211f3ec6>(4)pairwise_generator()
-> for i in range(0,len(data),2):
(Pdb) elt
(2, 'two')
(Pdb) n
> <ipython-input-62-327e211f3ec6>(5)pairwise_generator()
-> yield (input_data[i],input_data[i+1])
(Pdb) n
(3, 'three')
> <ipython-input-62-327e211f3ec6>(4)pairwise_generator()
-> for i in range(0,len(data),2):
(Pdb) n
> <ipython-input-62-327e211f3ec6>(5)pairwise_generator()
-> yield (input_data[i],input_data[i+1])
(Pdb) c
(4, 'four')
