In [120]:
import numpy as np
from numpy import random

import copy
from toolz import curry

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline      import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.base          import BaseEstimator, TransformerMixin

import matplotlib.pyplot as plt

In [155]:
N_SAMPLE = 2
N_MARKER = 3
N_ROW    = [4, 5]

# Each sample needs exactly one row count so samples and labels line up.
assert len(N_ROW) == N_SAMPLE, "N_ROW needs one entry per sample"

# The samples have different numbers of rows, so they cannot form a single
# rectangular array. Fill a 1-D object array element by element instead of
# calling np.array() on the ragged list, which raises ValueError on
# NumPy >= 1.24.
matrix = np.empty(len(N_ROW), dtype = object)
for i, nrow in enumerate(N_ROW):
    matrix[i] = random.randn(nrow, N_MARKER)

# One binary label (0 or 1) per sample
label  = random.randint(low = 0, high = 2, size = N_SAMPLE)

print("# Labels: ", label.shape)
print("# Samples:", matrix.shape)
print("===========================")
print("Sample 01:", matrix[0].shape)
print("Sample 02:", matrix[1].shape)
print("===========================")
print("Samples:\n"); print(matrix)

# Labels:  (2,)
# Samples: (2,)
Sample 01: (4, 3)
Sample 02: (5, 3)
Samples:

[array([[-0.3635574 ,  0.42167195, -0.41792284],
       [-0.5740922 , -0.57360284, -0.39873352],
       [ 0.29392975, -0.3097114 , -0.37262199],
       [ 0.24550633,  1.53278596, -0.19830045]])
 array([[ 0.84823766, -1.11282407,  1.11572724],
       [ 1.67877408,  0.66214016, -1.56027171],
       [-1.73480624, -0.83291109, -1.65652   ],
       [-0.11422291, -1.05040769,  0.7165132 ],
       [-1.4656596 , -0.32292487,  0.65447603]])]


In [138]:
class Data_Wrapper:
    """ A wrapper holding the data of several samples plus one label per sample.

    The N input samples (the ith one of shape (M_i, p)) are stacked row-wise
    into a single matrix of shape (sum of M_i, p). The row offsets of every
    sample are remembered so that each sample can be sliced back out.

    N   = number of samples
    M_i = number of observations / data points in the ith sample
    p   = number of variables, the same for all samples

    Attributes:
    matrix     -- stacked observations of all samples
    label      -- array with one label per sample
    coord      -- optional array with one row per row of matrix, or None
    sample_idx -- cumulative row offsets; sample k occupies rows
                  sample_idx[k] : sample_idx[k + 1]
    """
    def __init__(self, matrix, label, coord = None):
        """Stack the samples in `matrix` and store `label` and `coord`."""
        n_rows = [x.shape[0] for x in matrix]

        # Prepend 0 so that consecutive entries delimit each sample
        self.sample_idx = np.cumsum(np.r_[0, n_rows])
        self.matrix = np.vstack(matrix)
        # asarray leaves an ndarray untouched and lets plain lists / tuples
        # of labels work with get_num_sample as well
        self.label  = np.asarray(label)
        self.coord  = coord

    def get_num_sample(self):
        """Return the number of samples (taken from the number of labels)."""
        return self.label.shape[0]

    def get_num_variable(self):
        """Return the number of variables (columns of the stacked matrix)."""
        return self.matrix.shape[1]

    def get_sample(self, k):
        """Return the rows of the stacked matrix that belong to the kth sample."""
        return self.matrix[self.sample_idx[k] : self.sample_idx[k + 1]]

    def get_label(self, k):
        """Return the label of the kth sample."""
        return self.label[k]

    def get_coord(self, k):
        """Return the rows of coord that belong to the kth sample.

        Raises TypeError when no coord has been set.
        """
        if self.coord is None:
            # Same exception type that indexing None would raise, but with a
            # message that says what to do about it
            raise TypeError("coord has not been set; call set_coord() first")
        return self.coord[self.sample_idx[k] : self.sample_idx[k + 1]]

    def set_matrix(self, matrix):
        """Replace the stacked matrix; the new one must have the same shape."""
        assert self.matrix.shape == matrix.shape, \
            "new matrix must have the same shape as the current one"
        self.matrix = matrix

    def set_coord(self, coord):
        """Store coord; it must have one row per row of the stacked matrix."""
        # Row counts must match so that sample_idx slices coord correctly
        assert self.matrix.shape[0] == coord.shape[0], \
            "coord must have as many rows as the stacked matrix"
        self.coord = coord

In [158]:
# Scratch check: shape[0] of a 1-D array is its number of elements
np.arange(10).shape[0]

10

In [130]:
# Wrap the generated samples and their labels in a single Data_Wrapper
flow_data = Data_Wrapper(matrix, label)

In [137]:
# Scratch check: tuples compare by value (set_matrix compares shape tuples with ==)
(1, 2) == (1, 2)

True

# Design transformer

In [145]:
class Transform_MinMaxScaler(BaseEstimator, TransformerMixin):
    """Min-max scale all rows using the column ranges of one reference sample.

    fit() records the per-column minimum and maximum of sample `k`;
    transform() applies (x - min) / (max - min) to the whole stacked matrix,
    so rows outside the reference sample may fall outside [0, 1].

    Parameters:
    k -- index of the reference sample passed to dat.get_sample (default 0)
    """
    def __init__(self, k = 0):
        # Identity scaling until fit() is called
        self.x_min = 0
        self.x_max = 1
        self.k     = k

    def fit(self, dat, y = None):
        """Record column-wise min and max of the kth sample of `dat`; returns self."""
        x_selected = dat.get_sample(self.k)
        self.x_min = x_selected.min(axis = 0)
        self.x_max = x_selected.max(axis = 0)
        return self

    def transform(self, dat):
        """Return a deep copy of `dat` whose matrix is scaled; `dat` is left unchanged."""
        x_range = np.asarray(self.x_max - self.x_min, dtype = float)
        # A column that is constant in the reference sample has zero range;
        # divide by 1 there instead of producing inf / nan
        x_range = np.where(x_range == 0, 1.0, x_range)

        dat_copy = copy.deepcopy(dat)
        dat_copy.set_matrix((dat.matrix - self.x_min) / x_range)
        return dat_copy

Try the transformation manually

In [163]:
# Scale by hand: column ranges come from sample 0 and are applied to all rows
flow_data  = Data_Wrapper(matrix, label)
x_selected = flow_data.get_sample(0)
x_min = x_selected.min(axis = 0)
x_max = x_selected.max(axis = 0)

print(flow_data.matrix)
print("======================================")
print(x_selected)
print("======================================")
print((flow_data.matrix - x_min) / (x_max - x_min))

[[-0.3635574   0.42167195 -0.41792284]
 [-0.5740922  -0.57360284 -0.39873352]
 [ 0.29392975 -0.3097114  -0.37262199]
 [ 0.24550633  1.53278596 -0.19830045]
 [ 0.84823766 -1.11282407  1.11572724]
 [ 1.67877408  0.66214016 -1.56027171]
 [-1.73480624 -0.83291109 -1.65652   ]
 [-0.11422291 -1.05040769  0.7165132 ]
 [-1.4656596  -0.32292487  0.65447603]]
[[-0.3635574   0.42167195 -0.41792284]
 [-0.5740922  -0.57360284 -0.39873352]
 [ 0.29392975 -0.3097114  -0.37262199]
 [ 0.24550633  1.53278596 -0.19830045]]
[[ 0.24254548  0.47250289  0.        ]
 [ 0.          0.          0.08737418]
 [ 1.          0.12528145  0.20626698]
 [ 0.94421406  1.          1.        ]
 [ 1.63858745 -0.25599321  6.98312252]
 [ 2.59540244  0.58666425 -5.20142255]
 [-1.33719436 -0.12310559 -5.63966693]
 [ 0.52978993 -0.22636127  5.16539331]
 [-1.02712542  0.1190084   4.88292134]]


Try the transformation using the transformer

In [149]:
# Single-step pipeline that wraps the custom min-max scaler
pipe = Pipeline(steps = [("Scaler", Transform_MinMaxScaler())])

# Pipeline.fit returns the pipeline itself, so fit and transform can be chained
tmp = pipe.fit(flow_data).transform(flow_data)

In [164]:
# Original matrix, a separator line, then the scaled matrix
print(
    flow_data.matrix,
    "======================================",
    tmp.matrix,
    sep = "\n",
)

[[-0.3635574   0.42167195 -0.41792284]
 [-0.5740922  -0.57360284 -0.39873352]
 [ 0.29392975 -0.3097114  -0.37262199]
 [ 0.24550633  1.53278596 -0.19830045]
 [ 0.84823766 -1.11282407  1.11572724]
 [ 1.67877408  0.66214016 -1.56027171]
 [-1.73480624 -0.83291109 -1.65652   ]
 [-0.11422291 -1.05040769  0.7165132 ]
 [-1.4656596  -0.32292487  0.65447603]]
[[ 0.          1.          0.69591158]
 [ 0.4406594   0.          0.        ]
 [ 1.          0.96257408  0.30737942]
 [ 0.32450718  0.58541438  1.        ]
 [ 0.09921739  0.59040545 -0.04112325]
 [ 0.78066093  0.06979367  0.33845644]
 [ 0.67998849 -0.26704964 -0.5599621 ]
 [ 0.38211138  0.12318218 -0.6056701 ]
 [ 1.00530039  0.61730762  0.15311992]]


-----

# Dimensional reduction

### transform manually

In [200]:
# PCA.fit returns the fitted estimator, so construction and fitting are chained
pca = PCA(n_components = 2).fit(flow_data.matrix)
pca.transform(flow_data.matrix)

array([[ 0.22871517,  0.42414595],
       [-0.49750018,  0.27718279],
       [ 0.2906834 , -0.16380167],
       [ 1.23421408,  0.18831937],
       [-0.27124459, -1.85063388],
       [ 2.2906546 ,  0.24098829],
       [-1.06953362,  1.85533047],
       [-0.81471208, -0.98037242],
       [-1.39127678,  0.0088411 ]])

### Transform using the transformer

In [204]:
class Transform_PCA(BaseEstimator, TransformerMixin):
    """PCA on the stacked matrix of a data wrapper.

    Parameters:
    n_components -- forwarded to sklearn's PCA (default 2)

    Note that transform() returns the plain projected array, not a wrapper.
    """
    def __init__(self, n_components = 2):
        # Stored under the parameter's own name, as BaseEstimator.get_params expects
        self.n_components = n_components

    def fit(self, dat, y = None):
        """Fit a fresh PCA on dat.matrix; returns self."""
        self.pca = PCA(n_components = self.n_components)
        self.pca.fit(dat.matrix)
        return self

    def transform(self, dat):
        """Project dat.matrix onto the fitted components and return the array."""
        return self.pca.transform(dat.matrix)

In [205]:
# PCA-only pipeline: the scaling step is commented out so that only PCA runs
pipe_DimRed = Pipeline([
    #("Scaler", Transform_MinMaxScaler()),
    ("PCA",    Transform_PCA())
]) # end Pipeline

In [206]:
# Pipeline.fit returns the pipeline itself, so fit and transform can be chained
tmp = pipe_DimRed.fit(flow_data).transform(flow_data)
print(tmp)

[[ 0.22871517  0.42414595]
 [-0.49750018  0.27718279]
 [ 0.2906834  -0.16380167]
 [ 1.23421408  0.18831937]
 [-0.27124459 -1.85063388]
 [ 2.2906546   0.24098829]
 [-1.06953362  1.85533047]
 [-0.81471208 -0.98037242]
 [-1.39127678  0.0088411 ]]


# Combine both

### Transform manually

In [209]:
# Manual reference for the combined pipeline: scale with the column ranges of
# sample 0, then run a 2-component PCA on the scaled rows
flow_data  = Data_Wrapper(matrix, label)
x_selected = flow_data.get_sample(0)
x_min = x_selected.min(axis = 0)
x_max = x_selected.max(axis = 0)
x_scaled = (flow_data.matrix - x_min) / (x_max - x_min)
pca = PCA(n_components = 2)

print(flow_data.matrix)
print("======================================")
print(x_selected)
print("======================================")
print(x_scaled)
print("======================================")
print(pca.fit_transform(x_scaled))

[[-0.3635574   0.42167195 -0.41792284]
 [-0.5740922  -0.57360284 -0.39873352]
 [ 0.29392975 -0.3097114  -0.37262199]
 [ 0.24550633  1.53278596 -0.19830045]
 [ 0.84823766 -1.11282407  1.11572724]
 [ 1.67877408  0.66214016 -1.56027171]
 [-1.73480624 -0.83291109 -1.65652   ]
 [-0.11422291 -1.05040769  0.7165132 ]
 [-1.4656596  -0.32292487  0.65447603]]
[[-0.3635574   0.42167195 -0.41792284]
 [-0.5740922  -0.57360284 -0.39873352]
 [ 0.29392975 -0.3097114  -0.37262199]
 [ 0.24550633  1.53278596 -0.19830045]]
[[ 0.24254548  0.47250289  0.        ]
 [ 0.          0.          0.08737418]
 [ 1.          0.12528145  0.20626698]
 [ 0.94421406  1.          1.        ]
 [ 1.63858745 -0.25599321  6.98312252]
 [ 2.59540244  0.58666425 -5.20142255]
 [-1.33719436 -0.12310559 -5.63966693]
 [ 0.52978993 -0.22636127  5.16539331]
 [-1.02712542  0.1190084   4.88292134]]
[[ 0.84157851 -0.23093253]
 [ 0.73919201 -0.52924395]
 [ 0.62106682  0.47864328]
 [-0.14254927  0.53044978]
 [-6.16699239  1.07012341]
 [ 6

### Transform using pipeline

In [207]:
# Two-step pipeline: custom min-max scaling followed by the PCA wrapper
pipe_DimRed = Pipeline(steps = [
    ("Scaler", Transform_MinMaxScaler()),
    ("PCA",    Transform_PCA()),
])

In [208]:
# Pipeline.fit returns the pipeline itself, so fit and transform can be chained
tmp = pipe_DimRed.fit(flow_data).transform(flow_data)
print(tmp)

[[ 0.84157851 -0.23093253]
 [ 0.73919201 -0.52924395]
 [ 0.62106682  0.47864328]
 [-0.14254927  0.53044978]
 [-6.16699239  1.07012341]
 [ 6.03557873  2.11508552]
 [ 6.4635085  -1.87498853]
 [-4.34540409 -0.02789831]
 [-4.04597883 -1.53123867]]


In [31]:
@curry
def subsample(x, nrow = 6, random_state = None):
    """Return up to `nrow` randomly chosen rows of `x`, without replacement.

    Only `x` itself is subsampled; any labels or coordinates kept elsewhere
    are not touched by this function.

    Parameters:
    x            -- array indexed along its first axis
    nrow         -- maximum number of rows to keep (default 6); when x has
                    fewer rows, all rows are returned in random order
    random_state -- None to draw from numpy's global random state (the
                    original behaviour), an int seed for a reproducible draw,
                    or an existing numpy RandomState instance
    """
    if random_state is None:
        rng = random
    elif isinstance(random_state, random.RandomState):
        rng = random_state
    else:
        rng = random.RandomState(random_state)

    n_sample = x.shape[0]
    # Shuffle all row indices and keep the first nrow of them
    idx = rng.permutation(n_sample)[:nrow]
    return x[idx]

In [32]:
# subsample(nrow = 2) is a curried call: it still waits for the data argument,
# which FunctionTransformer supplies when the pipeline runs
pipe_subsample = Pipeline(steps = [
    ("subsample", FunctionTransformer(func = subsample(nrow = 2))),
])

In [33]:
# NOTE(review): X is not defined in any visible cell; it presumably comes from
# an earlier kernel session. Define it explicitly so Restart & Run All works.
print(X)

[[[ 0.17238212  0.39151234 -0.54390176 -1.0096922   0.8740321 ]
  [-0.58338941  1.1658701  -0.07259138 -2.20195907  0.9870215 ]
  [ 0.39534672  0.45950065  2.39209856  1.1825191  -0.7163661 ]
  [ 0.61646448 -0.34695849 -0.15823722 -0.22296149 -0.7697378 ]
  [-0.46238923  0.25289381  0.40916255  1.05299205 -0.03077782]]

 [[ 0.10658001 -0.17137064 -0.99267222 -0.54834638 -0.63653958]
  [-1.46728337 -0.38473923  0.03043906  0.76245502  0.19391931]
  [-1.19811674  0.50462747  0.86662192  1.69378961 -1.21865115]
  [-0.75676848 -0.27360749  0.40084885 -0.70385175  0.89916968]
  [-1.64055038  0.30439335  0.77566228 -1.71747428  0.11914506]]]


In [34]:
# Subsample the first entry of X down to 2 rows (nrow = 2 was fixed when the pipeline was built).
# NOTE(review): X is not defined in any visible cell — confirm where it comes from.
pipe_subsample.fit_transform(X[0])

array([[ 0.39534672,  0.45950065,  2.39209856,  1.1825191 , -0.7163661 ],
       [ 0.17238212,  0.39151234, -0.54390176, -1.0096922 ,  0.8740321 ]])