In [1]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Training features come from a binary .npy file; the targets are read from a
# text file with one value per line.  The held-out test features
# (data/X_test.npy) are intentionally not loaded in this notebook.
X = np.load(file='data/X_train.npy')
y = np.genfromtxt(fname='data/y_1.csv', delimiter='\n')


In [6]:
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.base import BaseEstimator, TransformerMixin
import random
random.seed(42)

class ClusteredHistExtraction(BaseEstimator, TransformerMixin):
    """Histogram features whose bin edges are learned with 1-D k-means.

    ``fit`` draws ``n_samples`` rows from ``X``, clusters the values of each
    drawn row separately, pools the sorted cluster centres of all rows and
    uses the midpoints between neighbouring centres as histogram bin edges.
    ``transform`` then counts, for every row, how many values fall into each
    of those bins.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of k-means clusters fitted per sampled row.
    n_samples : int, default=2
        Number of rows of ``X`` drawn without replacement during ``fit``.
    images_x_from, images_x_to : int or False, default=False
        When both are set, every row is split into 176 equal parts and only
        parts ``[images_x_from:images_x_to]`` are kept.  ``False`` (not ``0``)
        is the "disabled" sentinel, so ``0`` is a valid start index.
    random_state : int or None, default=None
        Seed for the row sampling in ``fit``.  ``None`` keeps the historical
        behaviour of drawing from the global ``random`` module state; pass an
        int to get the same rows regardless of the process the fit runs in.

    Attributes
    ----------
    kmeans : KMeans
        Estimator used during ``fit``; afterwards it holds the clustering of
        the last sampled row only.
    edges : list
        Histogram bin edges, starting at 1.
    """

    # Number of equal parts a flattened row is split into before cropping.
    # NOTE(review): presumably the length of the first image axis - confirm.
    _N_PARTS = 176

    def __init__(self, n_clusters=10, n_samples=2, images_x_from=False,
                 images_x_to=False, random_state=None):
        self.n_clusters = n_clusters
        self.n_samples = n_samples
        self.images_x_from = images_x_from
        self.images_x_to = images_x_to
        self.random_state = random_state

    def cutImage(self, x):
        """Return ``x`` cropped to the configured range of parts.

        ``x`` is returned unchanged unless both ``images_x_from`` and
        ``images_x_to`` have been set.
        """
        if self.images_x_from is False or self.images_x_to is False:
            return x
        parts = np.split(x, self._N_PARTS)[self.images_x_from:self.images_x_to]
        return np.array(parts).flatten()

    def fit(self, X, y=None):
        """Learn the histogram bin edges from a random subset of rows of ``X``.

        Returns ``self``.  Raises ``ValueError`` (from ``random.sample``) when
        ``n_samples`` exceeds the number of rows.
        """
        # A dedicated generator makes the draw independent of global state
        # when a seed is given; otherwise fall back to the module functions.
        rng = random if self.random_state is None else random.Random(self.random_state)
        samples = rng.sample(list(X), self.n_samples)

        # ``n_jobs`` is no longer accepted by KMeans in current scikit-learn
        # releases, so it is not passed; the fitted centres do not depend on it.
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42)

        centers = []
        for i, sample in enumerate(samples):
            sample = self.cutImage(sample)
            # Only values strictly between 0 and 1800 take part in clustering.
            values = sample[(sample > 0) & (sample < 1800)]
            # KMeans expects a 2-D (n_points, n_features) array.
            self.kmeans.fit(values.reshape(-1, 1))
            centers.append(np.sort(self.kmeans.cluster_centers_.flatten()))
            print(str(i) + ' done')

        # Pool the centres of all sampled rows into one sorted sequence.
        pooled = np.sort(np.array(centers).flatten())

        # The first edge is 1 so that zeros are left out of every bin; the
        # remaining edges are the midpoints between neighbouring centres.
        # NOTE(review): np.histogram needs monotonically increasing edges,
        # which assumes every midpoint is >= 1 - confirm for the data used.
        self.edges = [1]
        self.edges.extend(0.5 * (pooled[:-1] + pooled[1:]))

        print('n edges: ' + str(len(self.edges)))
        return self

    def transform(self, X, y=None):
        """Return, for each row of ``X``, the value counts per learned bin.

        The result is a list with one count array per row; values outside
        the range spanned by ``edges`` are not counted.
        """
        return [np.histogram(self.cutImage(x), bins=self.edges)[0] for x in X]
    
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.kernel_ridge import KernelRidge

# Model pipeline: k-means histogram features -> standardisation ->
# variance-threshold feature selection -> polynomial kernel ridge regression.
pipe = Pipeline(steps=[
    ('ClusteredHistExtraction', ClusteredHistExtraction(n_samples=3, n_clusters=10)),
    ('scaler', StandardScaler()),
    ('vct', VarianceThreshold()),
    ('kernelRidge', KernelRidge(kernel='polynomial')),
])

In [None]:
from sklearn.model_selection import GridSearchCV

# No hyper-parameters are searched yet: an empty grid yields a single
# candidate, so this is a plain 3-fold cross-validation of `pipe`.
parameters = {}
grid = GridSearchCV(pipe, parameters, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
grid.fit(X, y)

# Report the best candidate's mean score and its fold-to-fold standard
# deviation.  Averaging over every row of cv_results_ would blend different
# parameter settings as soon as `parameters` is filled in; for the current
# single-candidate grid both approaches give the same numbers.
# The score is the negated MSE, so values closer to zero are better.
print(grid.best_score_)
print(grid.cv_results_['std_test_score'][grid.best_index_])
