In [2]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Training inputs; later cells iterate over X row by row and slice it in
# parallel with y (e.g. X[0:150] / y[0:150]).
X = np.load('data/X_train.npy')#[0:load_n]
# Regression targets, parsed from a text file using a newline delimiter.
y = np.genfromtxt('data/y_1.csv', delimiter='\n')#[0:load_n]
# Test inputs are not loaded in this notebook version.
#data_test = np.load('data/X_test.npy')


In [125]:
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.base import BaseEstimator, TransformerMixin
import random
random.seed(99)

class BinsExtraction(BaseEstimator, TransformerMixin):
    """Replace every row by the histogram counts of its consecutive chunks.

    For each row of ``X`` the transformer

    1. optionally crops it: the row is cut into ``N_IMAGES`` equal parts and
       only the parts ``[images_x_from:images_x_to]`` are kept;
    2. cuts the (cropped) row into ``int(len(row) / splits)`` nearly equal
       chunks (at least one);
    3. computes ``np.histogram(chunk, bins=hist_bins)`` for every chunk and
       concatenates all counts into one flat feature vector.

    Parameters
    ----------
    bin_length : int, default=3
        Not read by ``transform``. It is stored so that ``get_params``,
        ``set_params`` and ``clone`` (and therefore pipelines / grid searches
        that pass ``BinsExtraction__bin_length``) work.
    splits : int, default=1000
        Approximate number of values per chunk; the number of chunks is
        ``int(len(row) / splits)``.
    hist_bins : sequence of float or None, default=None
        Bin edges handed to ``np.histogram``. ``None`` selects
        ``DEFAULT_HIST_BINS``.
    images_x_from, images_x_to : int or False, default=False
        Crop range over the ``N_IMAGES`` parts of a row. Cropping happens only
        when neither value is ``False``.
    images_y_from, images_y_to : int or False, default=False
        Stored but not used by ``transform``.
    """

    # Edges used when ``hist_bins`` is None.
    DEFAULT_HIST_BINS = (
        1, 316.06548606, 516.04062473, 672.03912953, 788.4017923,
        912.68082233, 1063.06793716, 1221.11834124, 1370.37759073, 1700,
    )
    # Number of equal parts a row is cut into before cropping; np.split
    # requires the row length to be divisible by this value.
    N_IMAGES = 176

    def __init__(self, bin_length=3, splits=1000, hist_bins=None,
        images_x_from=False, images_x_to=False,
        images_y_from=False, images_y_to=False):
        # Every constructor argument is stored unmodified under its own name;
        # BaseEstimator.get_params() looks each of them up via getattr.
        self.bin_length = bin_length
        self.splits = splits
        self.hist_bins = hist_bins

        self.images_x_from = images_x_from
        self.images_x_to = images_x_to
        self.images_y_from = images_y_from
        self.images_y_to = images_y_to

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a list with one flat array of histogram counts per row of ``X``."""
        # Resolve the default into a local so the constructor parameter stays
        # exactly as the caller set it (cloning / grid search depend on that).
        bin_edges = self.DEFAULT_HIST_BINS if self.hist_bins is None else self.hist_bins
        # use only without "RemoveEmptyValues"
        # This is feature selection actually
        crop = self.images_x_from is not False and self.images_x_to is not False

        X_new = []
        for row in X:
            if crop:
                images = np.split(row, self.N_IMAGES)[self.images_x_from : self.images_x_to]
                row = np.array(images).flatten()

            # At least one chunk: a row shorter than `splits` would otherwise
            # request 0 sections, which np.array_split rejects.
            n_chunks = max(1, int(len(row) / self.splits))
            features = [np.histogram(chunk, bins=bin_edges)[0]
                        for chunk in np.array_split(row, n_chunks)]

            X_new.append(np.array(features).flatten())
            if len(X_new) == 1:
                # Report the feature count once per call, for the first row.
                print("features: " + str(len(X_new[0])))

        return X_new
    
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR

# Regression pipeline: histogram features -> standardisation -> removal of
# low-variance columns -> linear support vector regression.
pipe = Pipeline([
    # Rows are cropped to parts 50..129 of their 176 equal slices before the
    # histograms are computed. `bin_length` is not read by
    # BinsExtraction.transform, so the value passed here has no effect on the
    # features.
    ('BinsExtraction', BinsExtraction(
        bin_length=50, images_x_from=50, images_x_to=130)),
    ('scaler', StandardScaler()),
    ('vct', VarianceThreshold(threshold=0.1)),
    # Alternative final estimators that were tried (see the result log in the
    # evaluation cell):
    #('linreg', LinearRegression(n_jobs=-1))
    #('kernelRidge', KernelRidge(kernel='polynomial'))
    ('linearSVR', LinearSVR(C=10.0))
])

In [126]:
# The grid search below is disabled by keeping it inside a string literal;
# nothing in it is executed, so `grid` is not defined by this cell.
"""
from sklearn.model_selection import GridSearchCV
parameters = {
   'BinsExtraction__bin_length': [50],
}
grid = GridSearchCV(pipe, parameters, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
grid.fit(X, y)
print(str(np.mean(grid.cv_results_['mean_test_score'])) + ', ' + str(np.mean(grid.cv_results_['std_test_score'])))
"""
from sklearn.metrics import mean_squared_error
# Fixed hold-out evaluation: fit on rows 0..149, score on rows 151..227.
# NOTE(review): row 150 is in neither slice - confirm whether the gap is
# intended before changing it, since the logged scores below depend on it.
pipe.fit(X[0:150], y[0:150])
y_pr = pipe.predict(X[151:228])
print(mean_squared_error(y[151:228], y_pr))

# Manual log of earlier runs: hold-out MSE followed by a short description of
# the configuration (bin edges, chunk size "/N", feature count, estimator).
# 52.5900323473 var0.05 bins=10
# 51.6259478991 better bins 41k
# 51.3320775803 added 1-edges, 47k, /500
# 51.6 /300, 78k
# 52.2 /220, 106k
# 55.5 /64, 366k
# 51.1 linearsvr, /500, 53k
# 51.6 /510
# 51.5 /490
# 90.9 /250
# 51.7 /1000
# 52.0 add random edge, 58k
# 51.7 remove last edge, 47k
# 52.1 /125, 187k
# 52.8160909792 more better bins 99k
# 67.7           kernelridge
# 51.5455457395 much more bins 94k
# 51.5211591462 less bins 31k
# 51.6344647532 more bins 67k
# 56            more better bins 87k features
# 55.5 other splits2, 41k
# 52.6655238717 other splits, 28k features
# 56.237862326  14k features
# 52.1237605257  56k features

# Audible "done" signal via a shell command.
# NOTE(review): `say` is presumably the macOS text-to-speech tool; on other
# systems this call will only return a non-zero status. Because it is the last
# expression, its return value is shown as the cell output.
import os
os.system('say "i have finished!"')

features: 26352
features: 26352
51.7321020515


0

In [20]:
# Shows the cross-validation results of `grid`.
# NOTE(review): in the visible notebook `grid` is only created by the
# GridSearchCV snippet that is currently disabled inside a string literal, so
# on a fresh kernel this cell raises NameError unless that snippet is run.
grid.cv_results_
#grid.best_params_

{'mean_fit_time': array([ 147.92655778]),
 'mean_score_time': array([ 21.32155935]),
 'mean_test_score': array([-96.58154818]),
 'mean_train_score': array([ -1.85495607e-26]),
 'param_BinsExtraction__bin_length': masked_array(data = [50],
              mask = [False],
        fill_value = ?),
 'params': [{'BinsExtraction__bin_length': 50}],
 'rank_test_score': array([1], dtype=int32),
 'split0_test_score': array([-90.6712437]),
 'split0_train_score': array([ -1.73421817e-26]),
 'split1_test_score': array([-104.79753217]),
 'split1_train_score': array([ -1.48858479e-26]),
 'split2_test_score': array([-94.25080693]),
 'split2_train_score': array([ -2.34206526e-26]),
 'std_fit_time': array([ 2.88726258]),
 'std_score_time': array([ 0.10417926]),
 'std_test_score': array([ 6.00543734]),
 'std_train_score': array([  3.58738956e-27])}

In [4]:
# Search bin subdivision so that there is highest variance
# 1. Divide hist into bins over some rows
# 2. Compute std -> divide bins where it's high, join/drop those where it's low
# 3. Repeat / change rows
#
# This cell produces the starting edges for that search: the values of a few
# rows are clustered in 1-D and the midpoints between neighbouring cluster
# centers become the histogram bin edges.

# `n_jobs` is deliberately not passed: the argument was deprecated in
# scikit-learn 0.23 and removed in 1.0, where passing it raises TypeError.
kmeans = KMeans(n_clusters=8, random_state=42)

samples = X[0: 5]
centers = []
for sample in samples:
    # sample = sample[1672390 : -786303]

    # Cluster only values strictly between 0 and 1800.
    sample = sample[(sample > 0) & (sample < 1800)]
    # KMeans expects a 2-D (n_values, 1) array.
    kmeans.fit(np.array([sample]).T)
    centers.append(np.sort(np.array(kmeans.cluster_centers_).flatten()))

# True: pool the centers of all rows (8 per row) into one sorted sequence.
# False: average the k-th smallest center across rows (8 values in total).
USE_ALL_CENTERS = True
if USE_ALL_CENTERS:
    values = np.sort(np.array(centers).flatten())
else:
    values = np.mean(centers, axis=0)

edges = [1] # leave out 0
# Each further edge is the midpoint between two neighbouring centers.
edges.extend(.5 * (center_1 + center_2)
             for center_1, center_2 in zip(values[:-1], values[1:]))

print('n edges: ' + str(len(edges)))
edges

n edges: 40


[array([  316.06548606,   516.04062473,   672.03912953,   788.4017923 ,
          912.68082233,  1063.06793716,  1221.11834124,  1370.37759073]),
 array([  308.77309614,   504.8323311 ,   661.04570736,   783.16988715,
          921.22162422,  1080.41577968,  1247.41710179,  1393.10546197]),
 array([  259.42693236,   454.41115039,   637.10735577,   783.24697727,
          918.05688785,  1076.03364154,  1238.7439229 ,  1383.01276065]),
 array([  284.2603614 ,   461.84852282,   647.97452156,   826.76038816,
          980.3224361 ,  1135.55237028,  1308.37312973,  1473.53302531]),
 array([  277.75613471,   482.81300987,   649.12127803,   776.62490065,
          913.78209436,  1076.42318956,  1235.8621241 ,  1375.33653253])]

In [None]:
from sklearn.utils import resample

# Candidate histogram bin edges: 97 ascending values from 1 to 1700. They are
# passed as `bins` to np.histogram below and are the starting point
# (`new_edges = edges`) of the refinement cell that follows.
edges = [1,
 175,
 190.0,
 200,
 210,
 220,
 230.0,
 240,
 250.29576676838963,
 260.59153353677925,
 270.7998907955566,
 281.00824805433393,
 296.51672876785585,
 312.4192910985714,
 328.7096455492857,
 345,
 365.11915911105507,
 385.23831822211014,
 421.68407741265736,
 458.1298366032046,
 517.3519134270805,
 576.5739902509565,
 609.5574644576008,
 626.0492015609229,
 642.540938664245,
 655.0834926946815,
 666.5424184427528,
 680,
 688.7915018861073,
 697.5830037722146,
 705.1660075444291,
 712.7490113166436,
 720.3320150888583,
 730.1660075444291,
 740,
 747.7243484748634,
 755.4486969497268,
 763.1730454245902,
 770.8973938994536,
 778.3608893423097,
 785.8243847851659,
 792.7855130483395,
 799.7466413115131,
 807.5810902304434,
 816.5985296073651,
 825.6159689842868,
 836.8877682054391,
 848.1595674265914,
 863.9400863362044,
 879.7206052458175,
 905.8611747488183,
 932.0017442518191,
 950.2522817644435,
 968.5028192770679,
 995.0990029527295,
 1021.6951866283912,
 1045.622987987501,
 1084.7830302704533,
 1123.9430725534057,
 1178.3353557602009,
 1203.4127942147347,
 1228.4902326692686,
 1240.8414534419671,
 1253.1926742146657,
 1265.5438949873642,
 1277.8951157600625,
 1288.9475578800311,
 1300,
 1309.8438400564696,
 1319.6876801129395,
 1329.5315201694093,
 1339.375360225879,
 1347.7457855765365,
 1356.1162109271943,
 1365.7868972366239,
 1375.4575835460535,
 1386.4285308142548,
 1400,
 1408.3298109092373,
 1416.6596218184745,
 1424.9894327277118,
 1433.319243636949,
 1442,
 1448.5,
 1461.25,
 1470.625,
 1480,
 1490.0,
 1500,
 1509.375,
 1518.75,
 1530,
 1546.25,
 1562.5,
 1581.25,
 1610,
 1700.0]
# One histogram (counts per bin of `edges`) for every row of X.
hists = []
for x in X:#resample(X, n_samples=100):
    hists.append(np.histogram(x, bins=edges)[0])

# Reference scale: standard deviation over all counts of all rows and bins.
# `std_all` is reused by the refinement cell below.
std_all = np.std(np.array(hists).flatten())
# For every bin, print how strongly its count varies across rows, relative to
# the overall spread.
for bin_i in range(len(hists[0])):
    arr = [h[bin_i] for h in hists]
    print("bin "+str(bin_i)+": "+str(np.std(arr) / std_all))

In [None]:
from IPython.display import display
# Refine the bin edges: bins whose count varies strongly across rows are split
# in half, bins that barely vary are merged into their right-hand neighbour.
# Relies on `edges` and `std_all` from the previous cell; `std_all` is not
# recomputed for the refined edges.
new_edges = edges # attention: 49 bins = 50 edges
for i in range(1):  # number of refinement passes (currently a single pass)
    # compute hists for samples
    hists = []
    for x in X:
        hists.append(np.histogram(x, bins=new_edges)[0])

    # The refined list always starts at 1, regardless of new_edges[0].
    ed = [1]
    for bin_i in range(len(hists[0])):
        # compute std for bin i
        arr = [h[bin_i] for h in hists]
        std = np.std(arr) / std_all

        if std > 0.35: # if std over threshold, split bin
            ed.append(0.5 * (new_edges[bin_i] + new_edges[bin_i+1]))

        # Skipping the upper edge merges this bin with the next one.
        # NOTE(review): if the last bin is below the threshold, the final edge
        # is dropped and the covered value range shrinks - confirm intended.
        if std < 0.2: # if std too low, join with next bin
            continue

        ed.append(new_edges[bin_i+1])

    new_edges = ed
    # Stop refining once the edge list has reached 120 entries.
    if len(new_edges)>=120: break

print(len(new_edges))
display(new_edges)

In [131]:
import random
def run(splits, hist_bins):
    """Fit the histogram-feature pipeline on a fixed hold-out split and report its MSE.

    The pipeline is: per-chunk histogram counts -> StandardScaler ->
    VarianceThreshold(0.1) -> LinearSVR(C=10.0). It is fitted on rows 0..149
    of the notebook-level ``X`` / ``y`` and scored on rows 151..227 (row 150
    is in neither slice, matching the other evaluation cell of this notebook).

    Parameters
    ----------
    splits : int
        Approximate number of values per histogram chunk.
    hist_bins : sequence of float or None
        Bin edges for ``np.histogram``; ``None`` selects the built-in default.

    Returns
    -------
    float
        The hold-out mean squared error. The same value is printed together
        with ``splits`` and, on a second line, ``hist_bins``.
    """
    # Function-scope imports: the function then works regardless of which
    # other notebook cells have been executed before it.
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.feature_selection import VarianceThreshold
    from sklearn.metrics import mean_squared_error
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import LinearSVR

    default_hist_bins = (
        1, 316.06548606, 516.04062473, 672.03912953, 788.4017923,
        912.68082233, 1063.06793716, 1221.11834124, 1370.37759073, 1700,
    )

    class BinsExtraction(BaseEstimator, TransformerMixin):
        """Silent variant of the notebook-level BinsExtraction (no feature-count print).

        Each row is cropped to parts ``[images_x_from:images_x_to]`` of its 176
        equal slices (when both bounds are set), cut into
        ``int(len(row) / splits)`` chunks, and every chunk is replaced by its
        histogram counts over ``hist_bins``.
        """
        def __init__(self, bin_length=3, splits=1000, hist_bins=None,
            images_x_from=False, images_x_to=False,
            images_y_from=False, images_y_to=False):
            # All constructor arguments are stored unmodified so that
            # get_params() / clone() work; bin_length and the images_y_*
            # bounds are not read by transform().
            self.bin_length = bin_length
            self.splits = splits
            self.hist_bins = hist_bins

            self.images_x_from = images_x_from
            self.images_x_to = images_x_to
            self.images_y_from = images_y_from
            self.images_y_to = images_y_to

        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            # Resolve the default locally instead of overwriting the
            # constructor parameter.
            bin_edges = default_hist_bins if self.hist_bins is None else self.hist_bins
            # use only without "RemoveEmptyValues"
            # This is feature selection actually
            crop = self.images_x_from is not False and self.images_x_to is not False

            X_new = []
            for row in X:
                if crop:
                    images = np.split(row, 176)[self.images_x_from : self.images_x_to]
                    row = np.array(images).flatten()

                # At least one chunk; 0 sections would make np.array_split raise.
                n_chunks = max(1, int(len(row) / self.splits))
                features = [np.histogram(chunk, bins=bin_edges)[0]
                            for chunk in np.array_split(row, n_chunks)]
                X_new.append(np.array(features).flatten())

            return X_new

    pipe = Pipeline([
        ('BinsExtraction', BinsExtraction(splits=splits, hist_bins=hist_bins,
            bin_length=50, images_x_from=50, images_x_to=130)),
        ('scaler', StandardScaler()),
        ('vct', VarianceThreshold(threshold=0.1)),
        # Alternatives tried as final step: LinearRegression(n_jobs=-1),
        # KernelRidge(kernel='polynomial').
        ('linearSVR', LinearSVR(C=10.0))
    ])

    pipe.fit(X[0:150], y[0:150])
    y_pr = pipe.predict(X[151:228])
    mse = mean_squared_error(y[151:228], y_pr)
    print("%s\t%s" % (mse, splits))
    print("\t", hist_bins)
    return mse

In [133]:
# Baseline bin edges (the same values BinsExtraction uses as its default).
some_bins = [1,  316.06548606,   516.04062473,   672.03912953,   788.4017923 ,
              912.68082233,  1063.06793716,  1221.11834124,  1370.37759073, 1700]
# Sweep 1: vary the chunk size from 300 to 1990 in steps of 10 with the
# baseline edges; run() prints one MSE line per setting.
for i in range(300, 2000, 10):
    run(i, some_bins)

# Sweep 2: 30 trials with a fixed chunk size of 1000 and randomly jittered
# edges. The first edge stays at 1; every other edge is moved by up to +/-50.
# Neighbouring baseline edges are more than 100 apart, so the jittered edges
# remain in ascending order.
for i in range(0, 30):
    new_bins = [1]
    for j in range(1, len(some_bins)):
        b = some_bins[j]
        new_bins.append(random.uniform(b - 50, b + 50))
    #print(new_bins)
    run(1000, new_bins)

51.7321020515	400
	 [1, 316.06548606, 516.04062473, 672.03912953, 788.4017923, 912.68082233, 1063.06793716, 1221.11834124, 1370.37759073, 1700]
51.7321020515	410
	 [1, 316.06548606, 516.04062473, 672.03912953, 788.4017923, 912.68082233, 1063.06793716, 1221.11834124, 1370.37759073, 1700]
51.7321020515	420
	 [1, 316.06548606, 516.04062473, 672.03912953, 788.4017923, 912.68082233, 1063.06793716, 1221.11834124, 1370.37759073, 1700]
51.7321020515	430
	 [1, 316.06548606, 516.04062473, 672.03912953, 788.4017923, 912.68082233, 1063.06793716, 1221.11834124, 1370.37759073, 1700]
51.7321020515	440
	 [1, 316.06548606, 516.04062473, 672.03912953, 788.4017923, 912.68082233, 1063.06793716, 1221.11834124, 1370.37759073, 1700]
51.7321020515	450
	 [1, 316.06548606, 516.04062473, 672.03912953, 788.4017923, 912.68082233, 1063.06793716, 1221.11834124, 1370.37759073, 1700]
51.7321020515	460
	 [1, 316.06548606, 516.04062473, 672.03912953, 788.4017923, 912.68082233, 1063.06793716, 1221.11834124, 1370.37759073

	 [1, 316.06548606, 516.04062473, 672.03912953, 788.4017923, 912.68082233, 1063.06793716, 1221.11834124, 1370.37759073, 1700]
51.7321020515	1900
	 [1, 316.06548606, 516.04062473, 672.03912953, 788.4017923, 912.68082233, 1063.06793716, 1221.11834124, 1370.37759073, 1700]
51.7321020515	1910
	 [1, 316.06548606, 516.04062473, 672.03912953, 788.4017923, 912.68082233, 1063.06793716, 1221.11834124, 1370.37759073, 1700]
51.7321020515	1920
	 [1, 316.06548606, 516.04062473, 672.03912953, 788.4017923, 912.68082233, 1063.06793716, 1221.11834124, 1370.37759073, 1700]
51.7321020515	1930
	 [1, 316.06548606, 516.04062473, 672.03912953, 788.4017923, 912.68082233, 1063.06793716, 1221.11834124, 1370.37759073, 1700]
51.7321020515	1940
	 [1, 316.06548606, 516.04062473, 672.03912953, 788.4017923, 912.68082233, 1063.06793716, 1221.11834124, 1370.37759073, 1700]
51.7321020515	1950
	 [1, 316.06548606, 516.04062473, 672.03912953, 788.4017923, 912.68082233, 1063.06793716, 1221.11834124, 1370.37759073, 1700]
51.7