# Set Environment

In [1]:
import numpy as np
from numpy import random

import copy
from collections import Counter

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline      import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.base          import BaseEstimator, TransformerMixin

import matplotlib.pyplot as plt

### Initializing a test data

array([[0, 1, 2, 3, 4, 5, 6],
       [0, 1, 2, 3, 4, 5, 6]])

In [78]:
N_SAMPLE = 2
N_MARKER = 3
N_ROW    = [2, 5]

MATRIX = np.array([random.randn(nrow, N_MARKER) for nrow in N_ROW])
LABEL  = np.array([0, 1])
COORD  = np.array([
    np.arange(np.sum(N_ROW)) 
    for _ in range(N_SAMPLE)]).T

print("# Labels: ", LABEL.shape)
print("# Samples:", MATRIX.shape)
print("===========================")
print("Sample 01:", MATRIX[0].shape)
print("Sample 02:", MATRIX[1].shape)
print("===========================")
print("labels:", LABEL)
print("===========================")
print("Samples:\n"); print(MATRIX)

# Labels:  (2,)
# Samples: (2,)
Sample 01: (2, 3)
Sample 02: (5, 3)
labels: [0 1]
Samples:

[array([[ 1.35783517, -1.14786721, -0.68554454],
       [ 0.32674818,  0.61081051,  0.82888398]])
 array([[ 0.04449691, -0.5109881 ,  0.71315925],
       [ 0.912753  ,  0.53297061, -1.73570579],
       [ 0.32142442, -0.85487087,  1.84871604],
       [ 0.06598704,  0.73085306,  2.93833743],
       [-0.31872737,  0.20856326, -0.85830712]])]


In [69]:
class Data_Wrapper:
    """ A wrapper with data matrix (N, M_i, p) and data labels
    N   = number of samples
    M_i = number of observations / data points in the ith sample
    p   = number of variables for all sample
    Args:
        matrix (N, M_i, p): data values
        label  (N,):        labels
        coord  (Sum(M_i)):  
    """
    def __init__(self, matrix, label, coord = None):
        
        # index of each sample
        idx = [x.shape[0] for x in matrix]
        idx = np.r_[0, idx]
        idx = np.cumsum(idx)
        self.sample_idx = idx
        
        # matrix and label
        self.matrix = np.vstack(matrix)
        self.label  = np.array(label)
        
        # coordinate
        self.coord = np.array(coord)
        assert self.coord.shape[0] == self.matrix.shape[0], "Dimension of matrix and coordinate does not fit"
        

    def __repr__(self):
        string = ""
        string += ("Number of samples:            " + str(self.get_num_sample())   + "\n")
        string += ("Number of variables:          " + str(self.get_num_variable()) + "\n")
        string += ("Number of total observations: " + str(self.matrix.shape[0])    + "\n")
        
        tmp = Counter(self.label)
        for k, v in tmp.items():
            string += ("Label " + str(k) + " : " + str(v) + "\n")
    
        return string
    
    def get_num_sample(self):
        return self.label.shape[0]
    
    def get_num_variable(self):
        return self.matrix.shape[1]
    
    def get_num_total_obs(self):
        return self.matrix.shape[0]
    
    def get_sample_size(self):
        return self.sample_idx[1:] - self.sample_idx[:-1]
    
    def get_sample(self, k):
        return self.matrix[self.sample_idx[k] : self.sample_idx[k + 1]] 
    
    def get_label(self, k):
        return self.label[k]
    
    def get_coord(self, k):
        return self.coord[self.sample_idx[k] : self.sample_idx[k + 1]] 
        
    def set_matrix(self, matrix):
        assert self.matrix.shape == matrix.shape
        self.matrix = matrix
    
    def set_coord(self, coord):
        # Check to make sure coord has the same number of samples
        assert self.matrix.shape[0] == coord.shape[0]
        self.coord = coord

In [79]:
flow_data = Data_Wrapper(MATRIX, LABEL, COORD)
print(flow_data)
print(flow_data.get_sample_size())

Number of samples:            2
Number of variables:          3
Number of total observations: 7
Label 0 : 1
Label 1 : 1

[2 5]


# Subsampling

subsample without replacement

In [14]:
num_subsample      = 2
num_subsample_size = 3
num_sample_sizes   = [4, 100]

np.vstack([
    random.permutation(num_sample_size)[:num_subsample_size]
    for num_sample_size in num_sample_sizes  # outter for loop: foreach sample
    for _ in range(num_subsample)])  # inner  for loop: create subsamples

array([[ 0,  1,  2],
       [ 0,  2,  3],
       [ 8, 84, 39],
       [61,  3, 90]])

subsample with replacement

In [15]:
num_subsample      = 2
num_subsample_size = 3
num_sample_sizes   = [2, 100]

np.vstack([
    # generate indices of a subsample
    random.randint(
        low = 0,
        high = num_sample_size, 
        size = num_subsample_size)
    for num_sample_size in num_sample_sizes  # outter for loop: foreach sample
    for _ in range(num_subsample)])          # inner  for loop: create subsamples

array([[ 0,  0,  0],
       [ 1,  0,  1],
       [93, 29,  9],
       [24, 44, 62]])

design transformer

In [85]:
class Transform_subsampling(BaseEstimator, TransformerMixin):
    def __init__(self, num_subsample = None, num_subsample_size = None, random_state = 0):
        self.indices = None
        self.num_subsample = num_subsample
        self.num_subsample_size = num_subsample_size
        
    def fit(self, dat, y = None):
        num_sample_sizes = dat.get_sample_size()     # array contain nrows of each sample
        num_subsample = self.num_subsample           # number of subsample created
        num_subsample_size = self.num_subsample_size # the size of each subsample
        
        self.indices = np.vstack([
            
            # generate indices of a subsample
            random.randint(
                low = 0,
                high = num_sample_size, 
                size = num_subsample_size)
            for num_sample_size in num_sample_sizes  # outter for loop: foreach sample
            for _ in range(num_subsample)])          # inner  for loop: create subsamples
        
        return self

    def transform(self, dat):
        # initialization
        lst_label  = []
        lst_matrix = []
        lst_coord  = []

        for idx_sample in range(flow_data.get_num_sample()):
            label  = flow_data.get_label(idx_sample)
            sample = flow_data.get_sample(idx_sample)
            coord  = flow_data.get_coord(idx_sample)
        
            for idx_subsample in range(num_subsample):
                # get
                idx = idx_sample + idx_subsample
                mat1 = sample[indices[idx]]
                mat2 = coord[indices[idx]]
            
                # store
                lst_label.append(label)
                lst_matrix.append(mat1)
                lst_coord.append(mat2)
        flow_data_new = Data_Wrapper(lst_matrix, lst_label, lst_coord)
        return flow_data_new

In [87]:
pipe = Pipeline([
    ("Scaler", Transform_subsampling(num_subsample = 3, num_subsample_size = 4))
]) # end Pipeline

In [88]:
pipe.fit(flow_data)
flow_data_new = pipe.transform(flow_data)

In [90]:
flow_data

Number of samples:            2
Number of variables:          3
Number of total observations: 7
Label 0 : 1
Label 1 : 1

In [89]:
flow_data_new

Number of samples:            6
Number of variables:          3
Number of total observations: 24
Label 0 : 3
Label 1 : 3

### Try do it manually

In [83]:
lst_label  = []
lst_matrix = []
lst_coord  = []

for k in range(flow_data.get_num_sample()):
    label  = flow_data.get_label(k)
    sample = flow_data.get_sample(k)
    coord  = flow_data.get_coord(k)
    for i in range(num_subsample):
        idx = k + i
        mat1 = sample[indices[idx]]
        mat2 = coord[indices[idx]]
        lst_label.append(label)
        lst_matrix.append(mat1)
        lst_coord.append(mat2)
print(lst_label)
print("=================")
for mat in lst_matrix:
    print(mat)
print("=================")
for mat in lst_coord:
    print(mat)

[0, 0, 0, 1, 1, 1]
[[ 0.32674818  0.61081051  0.82888398]
 [ 0.32674818  0.61081051  0.82888398]
 [ 0.32674818  0.61081051  0.82888398]
 [ 1.35783517 -1.14786721 -0.68554454]]
[[ 0.32674818  0.61081051  0.82888398]
 [ 1.35783517 -1.14786721 -0.68554454]
 [ 1.35783517 -1.14786721 -0.68554454]
 [ 0.32674818  0.61081051  0.82888398]]
[[ 1.35783517 -1.14786721 -0.68554454]
 [ 0.32674818  0.61081051  0.82888398]
 [ 0.32674818  0.61081051  0.82888398]
 [ 0.32674818  0.61081051  0.82888398]]
[[ 0.912753    0.53297061 -1.73570579]
 [ 0.04449691 -0.5109881   0.71315925]
 [ 0.04449691 -0.5109881   0.71315925]
 [ 0.912753    0.53297061 -1.73570579]]
[[ 0.04449691 -0.5109881   0.71315925]
 [ 0.912753    0.53297061 -1.73570579]
 [ 0.912753    0.53297061 -1.73570579]
 [ 0.912753    0.53297061 -1.73570579]]
[[ 0.32142442 -0.85487087  1.84871604]
 [ 0.32142442 -0.85487087  1.84871604]
 [ 0.912753    0.53297061 -1.73570579]
 [-0.31872737  0.20856326 -0.85830712]]
[[1 1]
 [1 1]
 [1 1]
 [0 0]]
[[1 1]
 [0

In [84]:
Data_Wrapper(lst_matrix, lst_label, lst_coord)

Number of samples:            6
Number of variables:          3
Number of total observations: 24
Label 0 : 3
Label 1 : 3