In [1]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Load the training inputs from a NumPy binary file in the local data/ directory.
# NOTE(review): each row of X is presumably one flattened image — confirm against the data set.
X = np.load('data/X_train.npy')#[0:load_n]
# Load the regression targets; newline is passed as the delimiter, so the CSV is read line by line.
y = np.genfromtxt('data/y_1.csv', delimiter='\n')#[0:load_n]
#data_test = np.load('data/X_test.npy')


In [12]:
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.base import BaseEstimator, TransformerMixin
import random
random.seed(99)

class GivenBinHistExtraction(BaseEstimator, TransformerMixin):
    """Pass-through transformer: learns nothing and returns its input as-is."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No fitting is performed; the estimator itself is returned."""
        return self

    def transform(self, X, y=None):
        """Return the very same object that was passed in (no copy is made)."""
        return X

class ClusteredHistExtraction(BaseEstimator, TransformerMixin):
    """Histogram features whose bin edges are learned with 1-D K-means.

    ``fit`` draws ``n_samples`` rows from ``X``, clusters the retained values
    of each drawn row into ``n_clusters`` groups, pools the sorted cluster
    centres of all rows and puts a bin edge halfway between every pair of
    neighbouring centres.  ``transform`` counts, for every row, how many
    retained values fall into each of those bins.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of K-means clusters fitted per sampled row.
    n_samples : int, default=3
        Number of rows drawn (without replacement) from ``X`` in ``fit``.
    images_x_from, images_x_to : int or False, default=False
        When both are set, every row is split into 176 equal pieces and only
        pieces ``[images_x_from:images_x_to]`` are kept.  ``False`` (the
        default for both) disables the cropping.

    Attributes set by ``fit``
    -------------------------
    kmeans : the KMeans estimator, left fitted on the last sampled row.
    edges : list of bin edges; the first edge is always 1.
    """

    def __init__(self, n_clusters=10, n_samples=3, images_x_from=False, images_x_to=False):
        self.n_clusters = n_clusters
        self.n_samples = n_samples
        self.images_x_from = images_x_from
        self.images_x_to = images_x_to

    def cutImage(self, x):
        """Crop a row to the configured range of pieces.

        The row is split into 176 equal pieces (``np.split`` raises if its
        length is not divisible by 176), pieces
        ``[images_x_from:images_x_to]`` are kept and flattened back into one
        array.  If either bound is ``False`` the row is returned unchanged.
        """
        if self.images_x_from is not False and self.images_x_to is not False:
            side_images = np.split(x, 176)[self.images_x_from : self.images_x_to]
            x = np.array(side_images).flatten()
        return x

    def _select_values(self, x):
        """Crop ``x`` and keep only values strictly between 0 and 1800."""
        x = self.cutImage(x)
        return x[(x > 0) & (x < 1800)]

    def fit(self, X, y=None):
        """Learn the histogram bin edges from a random subset of rows.

        Rows are drawn with the module-level ``random`` generator, so the
        draw depends on that generator's current state.  Prints the number
        of edges and returns ``self``.
        """
        samples = random.sample(list(X), self.n_samples)
        # `n_jobs` is deliberately not passed: KMeans deprecated it in
        # scikit-learn 0.23 and removed it in 1.0 (passing it raises
        # TypeError there), and it only nested a second level of
        # parallelism when this transformer runs inside a parallel search.
        # The clustering result does not depend on it.
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42)

        centers = []
        for sample in samples:
            values = self._select_values(sample)
            # K-means expects a 2-D input: one column, one value per row.
            self.kmeans.fit(values.reshape(-1, 1))
            centers.append(np.sort(self.kmeans.cluster_centers_.flatten()))

        # Pool the centres of all sampled rows into one sorted sequence.
        values = np.sort(np.array(centers).flatten())

        self.edges = [1] # leave out 0
        for center_1, center_2 in zip(values[:-1], values[1:]):
            self.edges.append(.5 * (center_1 + center_2))

        print('n edges: ' + str(len(self.edges)))
        return self

    def transform(self, X, y=None):
        """Return, for every row of ``X``, the per-bin counts over ``self.edges``.

        Values outside the range spanned by the edges are not counted by
        ``np.histogram``.  The result is a list with one count array per row.
        """
        X_new = []
        for x in X:
            counts, _ = np.histogram(self._select_values(x), bins=self.edges)
            X_new.append(counts)

        return X_new
    
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.kernel_ridge import KernelRidge

# Regression pipeline: learned-bin histogram features, standardisation,
# variance-based feature filter, then kernel ridge with a polynomial kernel.
pipe = Pipeline(steps=[
    ('ClusteredHistExtraction', ClusteredHistExtraction(n_samples=3, n_clusters=8)),
    ('scaler', StandardScaler()),
    ('vct', VarianceThreshold()),
    ('kernelRidge', KernelRidge(kernel='polynomial')),
])

In [13]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the search. With every entry commented out the
# grid is empty and the pipeline is evaluated with its current settings only.
parameters = {
 #   'ClusteredHistExtraction__n_clusters': [8, 10],
 #   'ClusteredHistExtraction__n_samples': [3, 4]
}
grid = GridSearchCV(pipe, parameters, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
grid.fit(X, y)
# Report the best candidate's mean CV score and the std of that same
# candidate. Averaging these columns over all candidates (as was done before)
# mixes different hyper-parameter settings as soon as the grid has more than
# one entry; for a single candidate both forms print the same numbers.
print(str(grid.best_score_) + ', ' + str(grid.cv_results_['std_test_score'][grid.best_index_]))

# -68.8529277579, 5.72349220142
# -68.5395679288, 6.39975328753

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


0 done
0 done
0 done
0 done
1 done
1 done
1 done
1 done
2 done
n edges: 24
2 done
n edges: 24
2 done
n edges: 24
2 done
3 done
n edges: 32


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


0 done
0 done
0 done


  **self._backend_args)


1 done
1 done
1 done
2 done
0 done
2 done
3 done
n edges: 32
2 done
n edges: 30
3 done
n edges: 32
1 done
2 done
n edges: 30


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


0 done
0 done
0 done


  **self._backend_args)


1 done
1 done
1 done
0 done
2 done
2 done
n edges: 30
2 done
1 done
3 done
n edges: 40
3 done
n edges: 40
2 done
3 done
n edges: 40
0 done
1 done
2 done
n edges: 24
-68.5395679288, 6.39975328753


In [17]:
# Display the raw cross-validation results of the fitted search.
# NOTE(review): the saved output under this cell is a plain parameter dict, which
# looks like it was produced by the commented-out `grid.best_params_` line
# rather than by `cv_results_` — re-run the cell to refresh the output.
grid.cv_results_
#grid.best_params_

{'ClusteredHistExtraction__n_clusters': 8,
 'ClusteredHistExtraction__n_samples': 3}

In [20]:
# Search bin subdivision so that there is highest variance
# 1. Divide hist into bins over some rows
# 2. Compute std -> divide bins where it's high, join/drop those where it's low
# 3. Repeat / change rows

# `n_jobs` is not passed to KMeans: the parameter was deprecated in
# scikit-learn 0.23 and removed in 1.0, and it does not affect the result.
kmeans = KMeans(n_clusters=8, random_state=42)

# Step 1: derive initial edges from the first five rows of X.
samples = X[0: 5]
centers = []
for sample in samples:
    # Keep only values strictly between 0 and 1800.
    sample = sample[(sample > 0) & (sample < 1800)]
    # K-means expects a 2-D input: one column, one value per row.
    kmeans.fit(sample.reshape(-1, 1))
    centers.append(np.sort(kmeans.cluster_centers_.flatten()))

# Pool the cluster centres of all rows into one sorted sequence.
values = np.sort(np.array(centers).flatten())

# Edges sit halfway between neighbouring centres; the first edge is fixed at 1.
edges = [1] # leave out 0
for center_1, center_2 in zip(values[:-1], values[1:]):
    edges.append(.5 * (center_1 + center_2))

print('n edges: ' + str(len(edges)))
edges

n edges: 40


[1,
 268.59153353677925,
 281.00824805433393,
 296.51672876785585,
 312.41929109857142,
 385.23831822211014,
 458.12983660320458,
 472.33076634277745,
 493.82267048266544,
 510.43647791541645,
 576.57399025095651,
 642.540938664245,
 648.54789979548752,
 655.08349269468147,
 666.54241844275282,
 724.33201508885827,
 779.89739389945362,
 783.20843221048995,
 785.82438478516588,
 807.58109023044335,
 869.72060524581752,
 913.23145834367801,
 915.91949110352323,
 919.63925603546056,
 950.7720301599602,
 1021.6951866283912,
 1069.5507893466106,
 1076.2284155454829,
 1078.4194846165499,
 1107.9840749772698,
 1178.3353557602009,
 1228.4902326692686,
 1237.3030234970834,
 1243.0805123468485,
 1277.8951157600625,
 1339.3753602258789,
 1372.8570616285097,
 1379.1746465921135,
 1388.0591113102987,
 1433.3192436369491]

In [111]:
from sklearn.utils import resample

# Histogram bin edges used by the cells below: 102 edges, i.e. 101 bins,
# spanning 1 to 1700.
# Several entries (e.g. 281.00824805433393, 296.51672876785585) equal the
# K-means midpoints printed by the earlier edge-exploration cell; the round
# values in between (180, 190, 200, ...) were presumably inserted by hand
# while refining the bins — NOTE(review): confirm the provenance.
edges = [1,
 180,
 190.0,
 200,
 210,
 220,
 230.0,
 240,
 250.29576676838963,
 260.59153353677925,
 270.7998907955566,
 281.00824805433393,
 296.51672876785585,
 312.4192910985714,
 328.7096455492857,
 345,
 365.11915911105507,
 385.23831822211014,
 421.68407741265736,
 458.1298366032046,
 510.43647791541645,
 576.5739902509565,
 609.5574644576008,
 626.0492015609229,
 642.540938664245,
 655.0834926946815,
 666.5424184427528,
 680,
 688.7915018861073,
 697.5830037722146,
 705.1660075444291,
 712.7490113166436,
 720.3320150888583,
 725.2490113166436,
 730.1660075444291,
 740,
 747.7243484748634,
 755.4486969497268,
 763.1730454245902,
 770.8973938994536,
 778.3608893423097,
 785.8243847851659,
 791.9121923925829,
 798,
 807.5810902304434,
 816.5985296073651,
 825.6159689842868,
 834.6334083612087,
 843.6508477381304,
 861.685726491974,
 879.7206052458175,
 896.4760317947478,
 913.231458343678,
 932.0017442518191,
 950.7720301599602,
 968.5028192770679,
 986.2336083941757,
 1021.6951866283912,
 1045.622987987501,
 1069.5507893466106,
 1123.9430725534057,
 1151.1392141568033,
 1178.3353557602009,
 1203.4127942147347,
 1228.4902326692686,
 1240.8414534419671,
 1253.1926742146657,
 1265.5438949873642,
 1277.8951157600625,
 1288.9475578800311,
 1300,
 1309.8438400564696,
 1319.6876801129395,
 1329.5315201694093,
 1339.375360225879,
 1347.7457855765365,
 1356.1162109271943,
 1364.486636277852,
 1372.8570616285097,
 1386.4285308142548,
 1400,
 1408.3298109092373,
 1416.6596218184745,
 1424.9894327277118,
 1433.319243636949,
 1442,
 1448.5,
 1455,
 1461.25,
 1467.5,
 1480,
 1490.0,
 1500,
 1507.5,
 1518.75,
 1530,
 1540.0,
 1550,
 1562.5,
 1581.25,
 1600,
 1700.0]
# One count vector per row of X, binned over `edges`.
hists = [np.histogram(x, bins=edges)[0] for x in X]#resample(X, n_samples=100)

# Standard deviation of all counts pooled together; used as the normaliser.
std_all = np.std(np.array(hists).flatten())

# zip(*hists) regroups the counts bin by bin (one tuple per bin, one entry per row).
for bin_i, bin_counts in zip(range(len(hists[0])), zip(*hists)):
    relative_std = np.std(bin_counts) / std_all
    print("bin " + str(bin_i) + ": " + str(relative_std))

bin 0: 0.432416827304
bin 1: 0.249416486019
bin 2: 0.31463743285
bin 3: 0.366658647262
bin 4: 0.39503619799
bin 5: 0.401361403221
bin 6: 0.389141122756
bin 7: 0.39806967435
bin 8: 0.330676242495
bin 9: 0.299774958406
bin 10: 0.298434488639
bin 11: 0.362260792968
bin 12: 0.343216562371
bin 13: 0.308164375208
bin 14: 0.28113782381
bin 15: 0.337882936238
bin 16: 0.295491869878
bin 17: 0.476272940267
bin 18: 0.416666366306
bin 19: 0.462663522257
bin 20: 0.408123630071
bin 21: 0.329913329401
bin 22: 0.262019639979
bin 23: 0.321467068133
bin 24: 0.32276716912
bin 25: 0.320869284076
bin 26: 0.434892082432
bin 27: 0.336236718272
bin 28: 0.364994550459
bin 29: 0.345562031504
bin 30: 0.314977524139
bin 31: 0.374574090713
bin 32: 0.239785822359
bin 33: 0.244273622663
bin 34: 0.444547203866
bin 35: 0.398475964687
bin 36: 0.397899319829
bin 37: 0.390488504746
bin 38: 0.333516727571
bin 39: 0.370108395392
bin 40: 0.310941590569
bin 41: 0.256224975377
bin 42: 0.243806240822
bin 43: 0.380531974248
bin

In [110]:
from IPython.display import display
# Refine the bin edges by variance: split bins whose relative std is high and
# merge bins whose relative std is low into their right-hand neighbour.
# `std_all` is the pooled std computed in the preceding cell.
new_edges = edges # n bins are described by n + 1 edges
for i in range(1):
    # compute hists for samples
    hists = [np.histogram(x, bins=new_edges)[0] for x in X]

    n_bins = len(hists[0])
    # Start from the current first edge so the lower bound is preserved.
    ed = [new_edges[0]]
    for bin_i in range(n_bins):
        # compute std for bin i, relative to the pooled std
        arr = [h[bin_i] for h in hists]
        std = np.std(arr) / std_all

        if std > 0.45: # if std over threshold, split bin at its midpoint
            ed.append(0.5 * (new_edges[bin_i] + new_edges[bin_i+1]))

        # if std too low, join with next bin by omitting the shared edge.
        # The last bin has no right-hand neighbour: skipping its right edge
        # would drop the final edge and shrink the covered range, so it is
        # always kept.
        if std < 0.2 and bin_i != n_bins - 1:
            continue

        ed.append(new_edges[bin_i+1])

    new_edges = ed
    if len(new_edges)>=120: break

print(len(new_edges))
display(new_edges)

102


[1,
 180,
 190.0,
 200,
 210,
 220,
 230.0,
 240,
 250.29576676838963,
 260.59153353677925,
 270.7998907955566,
 281.00824805433393,
 296.51672876785585,
 312.4192910985714,
 328.7096455492857,
 345,
 365.11915911105507,
 385.23831822211014,
 421.68407741265736,
 458.1298366032046,
 510.43647791541645,
 576.5739902509565,
 609.5574644576008,
 626.0492015609229,
 642.540938664245,
 655.0834926946815,
 666.5424184427528,
 680,
 688.7915018861073,
 697.5830037722146,
 705.1660075444291,
 712.7490113166436,
 720.3320150888583,
 725.2490113166436,
 730.1660075444291,
 740,
 747.7243484748634,
 755.4486969497268,
 763.1730454245902,
 770.8973938994536,
 778.3608893423097,
 785.8243847851659,
 791.9121923925829,
 798,
 807.5810902304434,
 816.5985296073651,
 825.6159689842868,
 834.6334083612087,
 843.6508477381304,
 861.685726491974,
 879.7206052458175,
 896.4760317947478,
 913.231458343678,
 932.0017442518191,
 950.7720301599602,
 968.5028192770679,
 986.2336083941757,
 1021.6951866283912,
