In [1]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Feature matrix used for model fitting, read from a NumPy binary file.
X = np.load('data/X_train.npy')#[0:load_n]
# Regression targets, parsed with newline as the field delimiter
# (presumably one value per line in the CSV — TODO confirm against the file).
y = np.genfromtxt('data/y_1.csv', delimiter='\n')#[0:load_n]
# The test set is not loaded in this notebook; uncomment to load it.
#data_test = np.load('data/X_test.npy')


In [12]:
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.base import BaseEstimator, TransformerMixin
import random
# Seed the stdlib RNG: ClusteredHistExtraction.fit draws its rows with random.sample.
random.seed(99)

class GivenBinHistExtraction(BaseEstimator, TransformerMixin):
    """Pass-through transformer: fitting learns nothing and transform returns X as-is."""

    def __init__(self):
        """No hyper-parameters to store."""

    def fit(self, X, y=None):
        """Nothing to learn; return the estimator itself so calls can be chained."""
        return self

    def transform(self, X, y=None):
        """Return the very same object that was passed in (no copy is made)."""
        return X

class ClusteredHistExtraction(BaseEstimator, TransformerMixin):
    """Turn every row of X into a histogram whose bin edges are learned with k-means.

    ``fit`` draws ``n_samples`` rows of X, clusters the retained values of each
    drawn row separately into ``n_clusters`` one-dimensional clusters, pools and
    sorts all cluster centres, and stores the midpoints between neighbouring
    centres (preceded by a fixed first edge of 1) in ``self.edges``.
    ``transform`` then counts, per row, how many retained values fall into each
    of those bins.

    Only values strictly between ``VALUE_MIN`` and ``VALUE_MAX`` are retained.
    Because ``np.histogram`` ignores values outside ``[edges[0], edges[-1]]``,
    retained values below 1 or above the last midpoint are not counted.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of k-means clusters fitted on each drawn row.
    n_samples : int, default=3
        Number of rows drawn from X in ``fit``; must not exceed ``len(X)``.
    images_x_from, images_x_to : int or False, default=False
        When neither is ``False``, every row is split into ``N_SPLITS`` equal
        sections and only sections ``[images_x_from:images_x_to]`` are kept.
    random_state : int or None, default=None
        Seed for drawing the rows in ``fit``.  ``None`` keeps the historical
        behaviour of drawing from the global state of the ``random`` module.

    Attributes
    ----------
    kmeans : KMeans
        The clustering object, left in the state of the last drawn row.
    edges : list
        Histogram bin edges computed by ``fit``.
    """

    # Number of equal sections a row is split into before cropping.
    N_SPLITS = 176
    # Exclusive bounds of the values that take part in clustering and counting.
    VALUE_MIN = 0
    VALUE_MAX = 1800

    def __init__(self, n_clusters=10, n_samples=3, images_x_from=False, images_x_to=False,
                 random_state=None):
        self.n_clusters = n_clusters
        self.n_samples = n_samples
        self.images_x_from = images_x_from
        self.images_x_to = images_x_to
        self.random_state = random_state

    def cutImage(self, x):
        """Crop a flat row to sections ``[images_x_from:images_x_to]``.

        The row is returned unchanged unless both bounds are set.  ``np.split``
        raises ``ValueError`` if the row length is not divisible by ``N_SPLITS``.
        """
        if self.images_x_from is False or self.images_x_to is False:
            return x
        # images = np.split(row, 176)[50:130] was noted as "pretty optimal already"
        sections = np.split(x, self.N_SPLITS)[self.images_x_from:self.images_x_to]
        return np.array(sections).flatten()

    def _select_values(self, x):
        """Crop a row and keep only the values strictly inside (VALUE_MIN, VALUE_MAX)."""
        x = self.cutImage(x)
        return x[(x > self.VALUE_MIN) & (x < self.VALUE_MAX)]

    def fit(self, X, y=None):
        """Learn the histogram bin edges from ``n_samples`` rows of X.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``n_samples`` is larger than the number of rows in X.
        """
        # A private generator makes the draw reproducible regardless of what
        # else touched the global ``random`` state (e.g. in worker processes).
        rng = random if self.random_state is None else random.Random(self.random_state)
        samples = rng.sample(list(X), self.n_samples)

        # ``n_jobs`` is no longer accepted by current scikit-learn, so it is not
        # passed; ``n_init=10`` pins the value that used to be the default.
        self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10, random_state=42)

        centers = []
        for sample in samples:
            values = self._select_values(sample)
            # k-means expects a 2-D array: one column, one value per line.
            self.kmeans.fit(values.reshape(-1, 1))
            centers.append(self.kmeans.cluster_centers_.ravel())

        # Pool the centres of all drawn rows (alternative not used here:
        # average the sorted centres across rows).
        values = np.sort(np.concatenate(centers))

        # First edge is fixed at 1 (leaves out 0); the others are the midpoints
        # between neighbouring centres.
        self.edges = [1] + [.5 * (low + high) for low, high in zip(values[:-1], values[1:])]

        print('n edges: ' + str(len(self.edges)))
        return self

    def transform(self, X, y=None):
        """Return, for each row of X, the counts of retained values per learned bin.

        Returns
        -------
        list of numpy.ndarray
            One array of ``len(self.edges) - 1`` counts per row.
        """
        return [np.histogram(self._select_values(x), bins=self.edges)[0] for x in X]
    
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.kernel_ridge import KernelRidge

# Regression pipeline: learned-bin histograms -> standardisation ->
# removal of constant features -> polynomial kernel ridge regression.
pipe = Pipeline(steps=[
    (
        'ClusteredHistExtraction',
        ClusteredHistExtraction(n_clusters=8, n_samples=3),
    ),
    ('scaler', StandardScaler()),
    ('vct', VarianceThreshold()),
    ('kernelRidge', KernelRidge(kernel='polynomial')),
])

In [13]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid; with every entry commented out the pipeline is
# cross-validated once with the parameters it was built with.
parameters = {
 #   'ClusteredHistExtraction__n_clusters': [8, 10],
 #   'ClusteredHistExtraction__n_samples': [3, 4]
}
grid = GridSearchCV(pipe, parameters, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
grid.fit(X, y)

# Report mean and std of the test score for the best candidate.  Averaging
# over all candidates would blend the scores of different parameter settings
# as soon as the grid holds more than one combination; for a single candidate
# this prints the same value.
print(
    str(grid.cv_results_['mean_test_score'][grid.best_index_])
    + ', '
    + str(grid.cv_results_['std_test_score'][grid.best_index_])
)

# -68.8529277579, 5.72349220142
# -68.5395679288, 6.39975328753

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


0 done
0 done
0 done
0 done
1 done
1 done
1 done
1 done
2 done
n edges: 24
2 done
n edges: 24
2 done
n edges: 24
2 done
3 done
n edges: 32


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


0 done
0 done
0 done


  **self._backend_args)


1 done
1 done
1 done
2 done
0 done
2 done
3 done
n edges: 32
2 done
n edges: 30
3 done
n edges: 32
1 done
2 done
n edges: 30


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


0 done
0 done
0 done


  **self._backend_args)


1 done
1 done
1 done
0 done
2 done
2 done
n edges: 30
2 done
1 done
3 done
n edges: 40
3 done
n edges: 40
2 done
3 done
n edges: 40
0 done
1 done
2 done
n edges: 24
-68.5395679288, 6.39975328753


In [17]:
# Display the full cross-validation results of the fitted grid search.
grid.cv_results_
# Alternative: show only the best parameter combination.
#grid.best_params_

{'ClusteredHistExtraction__n_clusters': 8,
 'ClusteredHistExtraction__n_samples': 3}

In [20]:
# Search bin subdivision so that there is highest variance
# 1. Divide hist into bins over some rows
# 2. Compute std -> divide bins where it's high, join/drop those where it's low
# 3. Repeat / change rows
# Derive candidate bin edges from the first five rows of X: cluster the
# retained values of each row, pool the centres, take midpoints.
# ``n_jobs`` is no longer accepted by current scikit-learn, so it is not
# passed; ``n_init=10`` pins the value that used to be the default.
kmeans = KMeans(n_clusters=8, n_init=10, random_state=42)

samples = X[0: 5]
centers = []
for sample in samples:
    # keep only values strictly between 0 and 1800
    sample = sample[(sample > 0) & (sample < 1800)]
    # k-means expects a 2-D array: one column, one value per line
    kmeans.fit(sample.reshape(-1, 1))
    centers.append(np.sort(kmeans.cluster_centers_.ravel()))

# use the centres of all rows (alternative not used: mean of the centres per position)
values = np.sort(np.concatenate(centers))

# first edge fixed at 1 (leave out 0), then midpoints between neighbouring centres
edges = [1] + [.5 * (center_1 + center_2) for center_1, center_2 in zip(values[:-1], values[1:])]

print('n edges: ' + str(len(edges)))
edges

n edges: 40


[1,
 268.59153353677925,
 281.00824805433393,
 296.51672876785585,
 312.41929109857142,
 385.23831822211014,
 458.12983660320458,
 472.33076634277745,
 493.82267048266544,
 510.43647791541645,
 576.57399025095651,
 642.540938664245,
 648.54789979548752,
 655.08349269468147,
 666.54241844275282,
 724.33201508885827,
 779.89739389945362,
 783.20843221048995,
 785.82438478516588,
 807.58109023044335,
 869.72060524581752,
 913.23145834367801,
 915.91949110352323,
 919.63925603546056,
 950.7720301599602,
 1021.6951866283912,
 1069.5507893466106,
 1076.2284155454829,
 1078.4194846165499,
 1107.9840749772698,
 1178.3353557602009,
 1228.4902326692686,
 1237.3030234970834,
 1243.0805123468485,
 1277.8951157600625,
 1339.3753602258789,
 1372.8570616285097,
 1379.1746465921135,
 1388.0591113102987,
 1433.3192436369491]

In [78]:
from sklearn.utils import resample

# Hand-tuned bin edges (50 edges -> 49 bins): the k-means midpoints with some
# edges removed and extra ones inserted by hand.
edges = [
    1, 180, 200, 210, 220, 240,
    260.59153353677925, 281.00824805433393, 296.51672876785585, 312.41929109857142,
    345,
    385.23831822211014, 458.12983660320458, 510.43647791541645, 576.57399025095651,
    642.540938664245, 655.08349269468147, 666.54241844275282,
    680, 685, 690,
    720.33201508885827,
    740,
    770.89739389945362, 785.82438478516588,
    798,
    807.58109023044335, 879.72060524581752, 913.23145834367801, 919.63925603546056,
    950.7720301599602, 1021.6951866283912, 1069.5507893466106, 1178.3353557602009,
    1228.4902326692686, 1277.8951157600625,
    1300,
    1339.3753602258789, 1372.8570616285097,
    1400,
    1433.3192436369491,
    1442, 1455, 1480, 1500, 1515, 1530, 1550, 1600, 1900,
]

# One histogram per row for the first 150 rows
# (a random subset via resample(X, n_samples=100) was the alternative).
hists = [np.histogram(x, bins=edges)[0] for x in X[0:150]]

# Spread of all counts pooled together; used to normalise the per-bin spread.
std_all = np.std(np.array(hists).flatten())

# Print, for every bin, the std of its count across rows relative to std_all.
for bin_i in range(len(hists[0])):
    arr = [hist[bin_i] for hist in hists]
    relative_std = np.std(arr) / std_all
    print("bin " + str(bin_i) + ": " + str(relative_std))

bin 0: 0.172722206437
bin 1: 0.23312530879
bin 2: 0.150905081133
bin 3: 0.160753959364
bin 4: 0.313312563315
bin 5: 0.280834634592
bin 6: 0.226567436498
bin 7: 0.137058635949
bin 8: 0.129096060692
bin 9: 0.220507130653
bin 10: 0.234812990625
bin 11: 0.327948669144
bin 12: 0.169871067886
bin 13: 0.152224813638
bin 14: 0.346048687099
bin 15: 0.122551778655
bin 16: 0.121748725652
bin 17: 0.165047581807
bin 18: 0.0693534887643
bin 19: 0.0727251173441
bin 20: 0.514021815953
bin 21: 0.351443520218
bin 22: 0.575374638965
bin 23: 0.259015271678
bin 24: 0.190281774556
bin 25: 0.145408985362
bin 26: 0.650786826204
bin 27: 0.183632898274
bin 28: 0.0321617041621
bin 29: 0.167939157798
bin 30: 0.372691279269
bin 31: 0.201402221332
bin 32: 0.361354803437
bin 33: 0.267460054896
bin 34: 0.41137711023
bin 35: 0.258482363057
bin 36: 0.545991040779
bin 37: 0.420282773395
bin 38: 0.328274113124
bin 39: 0.488794438849
bin 40: 0.127566184829
bin 41: 0.208742353766
bin 42: 0.387528427912
bin 43: 0.2822515520

In [79]:
from IPython.display import display
# Work on a copy: a plain assignment would alias `edges`, so any refinement of
# `new_edges` would silently modify the original list as well.
new_edges = list(edges) # attention 49 bins = 50 edges
for i in range(1):
    # One histogram per row for the first 100 rows
    # (a random subset via resample(X, n_samples=100) was the alternative).
    hists = [np.histogram(x, bins=new_edges)[0] for x in X[0:100]]

    # Per-bin std of the counts across rows, normalised by `std_all`.
    # NOTE(review): `std_all` is taken from the previous cell (computed there on
    # 150 rows) — confirm that this is the intended normaliser.
    stds = [
        np.std([h[bin_i] for h in hists]) / std_all
        for bin_i in range(len(hists[0]))
    ]

    # TODO: add edges where std higher than threshold
    display(new_edges)

50


[1,
 180,
 200,
 210,
 220,
 240,
 260.59153353677925,
 281.00824805433393,
 296.51672876785585,
 312.4192910985714,
 345,
 385.23831822211014,
 458.1298366032046,
 510.43647791541645,
 576.5739902509565,
 642.540938664245,
 655.0834926946815,
 666.5424184427528,
 680,
 685,
 690,
 720.3320150888583,
 740,
 770.8973938994536,
 785.8243847851659,
 798,
 807.5810902304434,
 879.7206052458175,
 913.231458343678,
 919.6392560354606,
 950.7720301599602,
 1021.6951866283912,
 1069.5507893466106,
 1178.3353557602009,
 1228.4902326692686,
 1277.8951157600625,
 1300,
 1339.375360225879,
 1372.8570616285097,
 1400,
 1433.319243636949,
 1442,
 1455,
 1480,
 1500,
 1515,
 1530,
 1550,
 1600,
 1900]