# Set Environment

In [120]:
import numpy as np
from numpy import random

import copy
from toolz import curry

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline      import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.base          import BaseEstimator, TransformerMixin

import matplotlib.pyplot as plt

### Initialize test data

In [210]:
N_SAMPLE = 2
N_MARKER = 3
N_ROW    = [4, 5]   # number of observations (rows) in each sample

# One row count is needed per sample.
assert len(N_ROW) == N_SAMPLE

# The samples have different numbers of rows, so they cannot form a regular
# 3-D array.  Calling np.array() on such a ragged list raises a ValueError on
# recent NumPy releases, so allocate a 1-D object array and fill it instead.
# Filling element by element also keeps the shape (N_SAMPLE,) even if all
# samples happen to have the same number of rows.
matrix = np.empty(len(N_ROW), dtype = object)
for i, nrow in enumerate(N_ROW):
    matrix[i] = random.randn(nrow, N_MARKER)
label  = random.randint(low = 0, high = 2, size = N_SAMPLE)

print("# Labels: ", label.shape)
print("# Samples:", matrix.shape)
print("===========================")
print("Sample 01:", matrix[0].shape)
print("Sample 02:", matrix[1].shape)
print("===========================")
print("Samples:\n"); print(matrix)

# Labels:  (2,)
# Samples: (2,)
Sample 01: (4, 3)
Sample 02: (5, 3)
Samples:

[array([[ 1.72486301, -1.74161635, -1.60443276],
       [ 0.47096821,  1.07675167, -0.58467308],
       [-0.85483657, -0.57482064, -0.60647885],
       [ 1.24821705, -1.66500948, -1.0301062 ]])
 array([[ 1.24460359,  1.29236521, -0.07777386],
       [ 1.94157103,  0.53071106,  2.0030456 ],
       [ 0.136027  ,  1.72157823,  0.33852869],
       [ 1.19233839,  0.64058474,  0.33085767],
       [-0.32593714, -0.38823698, -0.47930752]])]


In [217]:
class Data_Wrapper:
    """Container for a set of samples that have different numbers of rows.

    The per-sample matrices are stacked row-wise into a single 2-D array and
    the row offset of every sample is stored, so that each sample can be
    sliced back out of the stacked array.

    N   = number of samples
    M_i = number of observations / data points in the ith sample
    p   = number of variables for all samples

    Args:
        matrix: sequence of N arrays, the ith one of shape (M_i, p).
        label:  array-like of length N; stored as a numpy array.
        coord:  optional array with Sum(M_i) rows, one row per stacked
                observation.  Stored as given (not validated here); it
                stays None until a value is provided.

    Attributes:
        sample_idx: integer offsets of length N + 1; sample k occupies rows
                    sample_idx[k] : sample_idx[k + 1] of ``matrix``.
        matrix:     stacked data of shape (Sum(M_i), p).
        label:      labels as a numpy array.
        coord:      per-observation coordinates or None.
    """
    def __init__(self, matrix, label, coord = None):
        # Row counts of each sample -> cumulative offsets starting at 0.
        n_rows = [x.shape[0] for x in matrix]
        self.sample_idx = np.cumsum(np.r_[0, n_rows])

        self.matrix = np.vstack(matrix)
        # np.asarray leaves an ndarray untouched and lets plain lists work
        # with get_num_sample(), which relies on ``.shape``.
        self.label  = np.asarray(label)
        self.coord  = coord

    def get_num_sample(self):
        """Return the number of samples (length of the label array)."""
        return self.label.shape[0]

    def get_num_variable(self):
        """Return the number of variables (columns of the stacked matrix)."""
        return self.matrix.shape[1]

    def get_sample(self, k):
        """Return the rows of the stacked matrix that belong to sample k."""
        return self.matrix[self.sample_idx[k] : self.sample_idx[k + 1]]

    def get_label(self, k):
        """Return the label of sample k."""
        return self.label[k]

    def get_coord(self, k):
        """Return the rows of ``coord`` that belong to sample k.

        Raises:
            ValueError: if no coordinates have been set yet.
        """
        if self.coord is None:
            raise ValueError("coord has not been set; call set_coord() first")
        return self.coord[self.sample_idx[k] : self.sample_idx[k + 1]]

    def set_matrix(self, matrix):
        """Replace the stacked matrix with one of exactly the same shape."""
        # The sample offsets stay valid only if the shape is unchanged.
        assert self.matrix.shape == matrix.shape
        self.matrix = matrix

    def set_coord(self, coord):
        """Attach per-observation coordinates (one row per stacked row)."""
        # Check that coord has one row per row of the stacked matrix.
        assert self.matrix.shape[0] == coord.shape[0]
        self.coord = coord

In [218]:
# Wrap the test samples and their labels in a single Data_Wrapper instance.
flow_data = Data_Wrapper(matrix, label)

# Design transformer

In [219]:
class Transform_MinMaxScaler(BaseEstimator, TransformerMixin):
    """Min-max scale all observations using the range of one reference sample.

    ``fit`` records the per-variable minimum and maximum of sample ``k``;
    ``transform`` applies ``(x - x_min) / (x_max - x_min)`` to every row of
    the stacked matrix.  Rows from other samples may therefore fall outside
    [0, 1].

    Args:
        k: index of the sample whose min / max define the scaling.

    Attributes:
        x_min: per-variable minimum of the reference sample (0 before fit).
        x_max: per-variable maximum of the reference sample (1 before fit).
    """
    def __init__(self, k = 0):
        # Defaults make transform() an identity mapping before fit().
        self.x_min = 0
        self.x_max = 1
        self.k     = k

    def fit(self, dat, y = None):
        """Record column-wise min and max of sample ``k`` of ``dat``.

        Args:
            dat: object exposing ``get_sample(k)`` returning a 2-D array.
            y:   ignored; present for pipeline compatibility.

        Returns:
            self
        """
        x_selected = dat.get_sample(self.k)
        self.x_min = np.min(x_selected, axis = 0)
        self.x_max = np.max(x_selected, axis = 0)
        return self

    def transform(self, dat):
        """Return a deep copy of ``dat`` whose matrix has been rescaled.

        Args:
            dat: object exposing ``matrix`` and ``set_matrix``.

        Returns:
            A deep copy of ``dat`` with the scaled matrix; ``dat`` itself is
            not modified.
        """
        x_range = np.asarray(self.x_max - self.x_min, dtype = float)
        # A variable that is constant in the reference sample has zero range;
        # dividing by it would yield NaN / inf.  Use a unit range instead so
        # such variables are only shifted.
        x_range = np.where(x_range == 0, 1.0, x_range)

        dat_copy = copy.deepcopy(dat)
        dat_copy.set_matrix((dat.matrix - self.x_min) / x_range)
        return dat_copy

Try the transformation manually

In [220]:
# Reference computation: scale the whole stacked matrix with the column-wise
# min / max of the first sample, without going through the transformer.
flow_data  = Data_Wrapper(matrix, label)
x_selected = flow_data.get_sample(0)

print(flow_data.matrix)
print("======================================")
print(x_selected)
print("======================================")

# Column-wise extrema of the reference sample.
x_min = x_selected.min(axis = 0)
x_max = x_selected.max(axis = 0)

print((flow_data.matrix - x_min) / (x_max - x_min))

[[ 1.72486301 -1.74161635 -1.60443276]
 [ 0.47096821  1.07675167 -0.58467308]
 [-0.85483657 -0.57482064 -0.60647885]
 [ 1.24821705 -1.66500948 -1.0301062 ]
 [ 1.24460359  1.29236521 -0.07777386]
 [ 1.94157103  0.53071106  2.0030456 ]
 [ 0.136027    1.72157823  0.33852869]
 [ 1.19233839  0.64058474  0.33085767]
 [-0.32593714 -0.38823698 -0.47930752]]
[[ 1.72486301 -1.74161635 -1.60443276]
 [ 0.47096821  1.07675167 -0.58467308]
 [-0.85483657 -0.57482064 -0.60647885]
 [ 1.24821705 -1.66500948 -1.0301062 ]]
[[1.         0.         0.        ]
 [0.51393767 1.         1.        ]
 [0.         0.41399693 0.97861675]
 [0.81523199 0.02718129 0.56319795]
 [0.81383126 1.07650297 1.49707713]
 [1.08400514 0.80625646 3.53757698]
 [0.38410037 1.22879431 1.90531308]
 [0.79357107 0.84524131 1.89779069]
 [0.20502365 0.48019966 1.10332391]]


Try the transformation using the transformer

In [221]:
# Pipeline with the custom min-max scaler as its only step.
pipe = Pipeline(steps = [("Scaler", Transform_MinMaxScaler())])

# Pipeline.fit returns the pipeline itself, so fit and transform are chained.
tmp = pipe.fit(flow_data).transform(flow_data)

In [222]:
# Print the input matrix followed by the matrix of the object returned by
# the scaling pipeline.
print(flow_data.matrix)
print("======================================")
print(tmp.matrix)

[[ 1.72486301 -1.74161635 -1.60443276]
 [ 0.47096821  1.07675167 -0.58467308]
 [-0.85483657 -0.57482064 -0.60647885]
 [ 1.24821705 -1.66500948 -1.0301062 ]
 [ 1.24460359  1.29236521 -0.07777386]
 [ 1.94157103  0.53071106  2.0030456 ]
 [ 0.136027    1.72157823  0.33852869]
 [ 1.19233839  0.64058474  0.33085767]
 [-0.32593714 -0.38823698 -0.47930752]]
[[1.         0.         0.        ]
 [0.51393767 1.         1.        ]
 [0.         0.41399693 0.97861675]
 [0.81523199 0.02718129 0.56319795]
 [0.81383126 1.07650297 1.49707713]
 [1.08400514 0.80625646 3.53757698]
 [0.38410037 1.22879431 1.90531308]
 [0.79357107 0.84524131 1.89779069]
 [0.20502365 0.48019966 1.10332391]]


-----

# Dimensionality reduction

In [223]:
class Transform_PCA(BaseEstimator, TransformerMixin):
    """Project the stacked data matrix with PCA and store the result as coord.

    Args:
        n_components: number of principal components to keep (default 2).

    Attributes:
        pca: the fitted ``PCA`` instance, available after ``fit``.
    """
    def __init__(self, n_components = 2):
        self.n_components = n_components

    def fit(self, dat, y = None):
        """Fit a PCA model on ``dat.matrix``.

        Args:
            dat: object exposing a 2-D ``matrix`` attribute.
            y:   ignored; present for pipeline compatibility.

        Returns:
            self
        """
        self.pca = PCA(n_components = self.n_components)
        self.pca.fit(dat.matrix)
        return self

    def transform(self, dat):
        """Return a deep copy of ``dat`` with PCA coordinates set as coord.

        Args:
            dat: object exposing ``matrix`` and ``set_coord``.

        Returns:
            A deep copy of ``dat`` whose ``coord`` holds the projection of
            ``dat.matrix``; ``dat`` itself is not modified.
        """
        dat_copy = copy.deepcopy(dat)
        # Use the model fitted by this transformer, not a module-level ``pca``
        # that may be undefined or fitted on different data.
        dat_copy.set_coord(self.pca.transform(dat.matrix))
        return dat_copy

### transform manually

In [224]:
# Reference computation: fit a 2-component PCA on the stacked matrix
# (PCA.fit returns the estimator) and display the projected coordinates.
pca = PCA(n_components = 2).fit(flow_data.matrix)
pca.transform(flow_data.matrix)

array([[ 2.29070887,  0.91217267],
       [-0.54835753, -0.65795579],
       [ 0.83746131, -1.39978021],
       [ 1.90462973,  0.65434963],
       [-1.0441931 ,  0.14849587],
       [-1.67616529,  1.6851859 ],
       [-1.60301413, -0.82916529],
       [-0.75718735,  0.41957691],
       [ 0.59611749, -0.93287969]])

### try transformer

In [227]:
# Pipeline with the custom PCA transformer as its only step.
pipe_DimRed = Pipeline(steps = [("PCA", Transform_PCA())])

# Pipeline.fit returns the pipeline itself, so fit and transform are chained.
tmp = pipe_DimRed.fit(flow_data).transform(flow_data)

In [229]:
# Print the input matrix followed by the coord attribute of the object
# returned by the PCA pipeline.
print(flow_data.matrix)
print("======================================")
print(tmp.coord)

[[ 1.72486301 -1.74161635 -1.60443276]
 [ 0.47096821  1.07675167 -0.58467308]
 [-0.85483657 -0.57482064 -0.60647885]
 [ 1.24821705 -1.66500948 -1.0301062 ]
 [ 1.24460359  1.29236521 -0.07777386]
 [ 1.94157103  0.53071106  2.0030456 ]
 [ 0.136027    1.72157823  0.33852869]
 [ 1.19233839  0.64058474  0.33085767]
 [-0.32593714 -0.38823698 -0.47930752]]
[[ 2.29070887  0.91217267]
 [-0.54835753 -0.65795579]
 [ 0.83746131 -1.39978021]
 [ 1.90462973  0.65434963]
 [-1.0441931   0.14849587]
 [-1.67616529  1.6851859 ]
 [-1.60301413 -0.82916529]
 [-0.75718735  0.41957691]
 [ 0.59611749 -0.93287969]]


# Combine both

### Transform and reduce manually

In [230]:
# Reference computation for the combined workflow: scale with the extrema of
# the first sample, then reduce the scaled matrix to two principal components.
flow_data  = Data_Wrapper(matrix, label)
x_selected = flow_data.get_sample(0)

print("data matrix")
print(flow_data.matrix)
print("======================================")

print("select one sample")
print(x_selected)
print("======================================")

print("scaled data matrix")
# Column-wise extrema of the reference sample.
x_min = x_selected.min(axis = 0)
x_max = x_selected.max(axis = 0)
x_scaled = (flow_data.matrix - x_min) / (x_max - x_min)
print(x_scaled)
print("======================================")

print("Dimension reduction")
pca = PCA(n_components = 2)
print(pca.fit_transform(x_scaled))

data matrix
[[ 1.72486301 -1.74161635 -1.60443276]
 [ 0.47096821  1.07675167 -0.58467308]
 [-0.85483657 -0.57482064 -0.60647885]
 [ 1.24821705 -1.66500948 -1.0301062 ]
 [ 1.24460359  1.29236521 -0.07777386]
 [ 1.94157103  0.53071106  2.0030456 ]
 [ 0.136027    1.72157823  0.33852869]
 [ 1.19233839  0.64058474  0.33085767]
 [-0.32593714 -0.38823698 -0.47930752]]
select one sample
[[ 1.72486301 -1.74161635 -1.60443276]
 [ 0.47096821  1.07675167 -0.58467308]
 [-0.85483657 -0.57482064 -0.60647885]
 [ 1.24821705 -1.66500948 -1.0301062 ]]
scaled data matrix
[[1.         0.         0.        ]
 [0.51393767 1.         1.        ]
 [0.         0.41399693 0.97861675]
 [0.81523199 0.02718129 0.56319795]
 [0.81383126 1.07650297 1.49707713]
 [1.08400514 0.80625646 3.53757698]
 [0.38410037 1.22879431 1.90531308]
 [0.79357107 0.84524131 1.89779069]
 [0.20502365 0.48019966 1.10332391]]
Dimension reduction
[[-1.48046602  0.52700593]
 [-0.27866885 -0.35311761]
 [-0.5093928  -0.37322502]
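 [-0.95026639  0.44047915]
 [ 0.24222947 -0.11094225]
 [ 2.13413552  0.52487341]
 [ 0.64042898 -0.48303633]
 [ 0.55659479  0.07170336]
 [-0.3545947  -0.24374064]]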

### Transform using pipeline

In [232]:
# Two-step pipeline: min-max scaling followed by PCA.
pipe = Pipeline(steps = [
    ("Scaler", Transform_MinMaxScaler()),
    ("PCA",    Transform_PCA()),
])

# Pipeline.fit returns the pipeline itself, so fit and transform are chained.
tmp = pipe.fit(flow_data).transform(flow_data)

<__main__.Data_Wrapper object at 0x7f32a810c518>


In [233]:
# Print the input matrix, then the matrix and coord attributes of the object
# returned by the combined pipeline.
print(flow_data.matrix)
print("======================================")
print(tmp.matrix)
print("======================================")
print(tmp.coord)

[[ 1.72486301 -1.74161635 -1.60443276]
 [ 0.47096821  1.07675167 -0.58467308]
 [-0.85483657 -0.57482064 -0.60647885]
 [ 1.24821705 -1.66500948 -1.0301062 ]
 [ 1.24460359  1.29236521 -0.07777386]
 [ 1.94157103  0.53071106  2.0030456 ]
 [ 0.136027    1.72157823  0.33852869]
 [ 1.19233839  0.64058474  0.33085767]
 [-0.32593714 -0.38823698 -0.47930752]]
[[1.         0.         0.        ]
 [0.51393767 1.         1.        ]
 [0.         0.41399693 0.97861675]
 [0.81523199 0.02718129 0.56319795]
 [0.81383126 1.07650297 1.49707713]
 [1.08400514 0.80625646 3.53757698]
 [0.38410037 1.22879431 1.90531308]
 [0.79357107 0.84524131 1.89779069]
 [0.20502365 0.48019966 1.10332391]]
[[-1.48046602  0.52700593]
 [-0.27866885 -0.35311761]
 [-0.5093928  -0.37322502]
 [-0.95026639  0.44047915]
 [ 0.24222947 -0.11094225]
 [ 2.13413552  0.52487341]
 [ 0.64042898 -0.48303633]
 [ 0.55659479  0.07170336]
 [-0.3545947  -0.24374064]]
