In [1]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Training features, one entry per sample (the inspection cell below reports 290 entries).
X = np.load('data/X_train.npy')
# Training labels. The newline delimiter makes every line of the file a single
# field -- presumably one label per line; TODO confirm against the CSV.
y = np.genfromtxt('data/train_labels_new.csv', delimiter='\n')
# Loading of the test features is currently disabled.
#data_test = np.load('data/X_test.npy')

In [9]:
# Number of training samples (length of the first axis of X).
len(X)
# Disabled check: length of one sample divided by 610 -- presumably the chunk
# size that results from RandomBinsExtraction's default `splits`; TODO confirm.
#len(X[0])/610

290

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
import random
# Seed both global random number generators. scikit-learn estimators whose
# `random_state` is left at None draw from NumPy's global RNG, which
# `random.seed` alone does not touch.
random.seed(99)
np.random.seed(99)

class RandomBinsExtraction(BaseEstimator, TransformerMixin):
    """Describe every sample by histogram counts of its contiguous chunks.

    Each row of ``X`` is optionally cropped, then cut into ``splits``
    contiguous chunks with ``np.array_split``. Every chunk is reduced to the
    per-bin counts returned by ``np.histogram`` and the counts of all chunks
    are concatenated into the row's feature vector.

    Parameters
    ----------
    splits : int, default=610
        Number of chunks each (cropped) row is divided into.
    hist_bins : list of sequences of float, default=None
        One or more sets of histogram bin edges. Chunk ``j`` out of ``n``
        uses edge set ``j * len(hist_bins) // n``, so consecutive runs of
        chunks share one edge set. ``None`` selects the built-in default of
        five sets with ten edges each.
    images_x_from, images_x_to : int or False, default=False
        When both are given (not ``False``), the row is split into 176 equal
        parts and only parts ``[images_x_from:images_x_to]`` are kept.
    images_y_from, images_y_to : int or False, default=False
        Only consulted when the x range is given. When both are given, every
        kept part is split into 208 equal pieces and only pieces
        ``[images_y_from:images_y_to]`` are kept.

    Notes
    -----
    The transformer is stateless: ``fit`` learns nothing and ``transform``
    never modifies the constructor parameters, so ``get_params`` and
    ``sklearn.base.clone`` behave the same before and after use.
    """

    # Number of equal parts a row / a part is cut into when cropping.
    # NOTE(review): presumably the image dimensions of the data set -- confirm.
    _N_X_PARTS = 176
    _N_Y_PARTS = 208

    # Bin edges used when ``hist_bins`` is None. Values outside
    # [first edge, last edge] of a set are not counted by np.histogram.
    _DEFAULT_HIST_BINS = [
        [1.0, 232.65961378075005, 565.81853397045609, 611.01554100226133, 751.8506948050117, 1052.1025371443798, 1059.0148160033814, 1170.4154624315881, 1436.3987799127171, 1697.5334609065319],
        [1.0, 186.72353571884148, 508.50609978611561, 624.19256420970828, 730.19039613570453, 998.58889623300081, 1056.5030972543479, 1258.4466689014348, 1369.7502123351373, 1758.2213785228221],
        [1.0, 221.68156101446493, 586.98614392870741, 614.50991456293559, 761.96240623309268, 980.92869221152398, 1058.6177837244034, 1173.9334424281788, 1395.8783306073485, 1802.5528211949884],
        [1.0, 186.44657897736735, 524.53480338126076, 680.06149110022682, 825.71120465575359, 1002.6096738879295, 1073.5680922216211, 1177.7070287135298, 1385.8007474622702, 1755.4559466479525],
        [1.0, 278.30678764984856, 496.3651807461319, 616.33484624047469, 741.23947591665205, 1036.8689696848187, 1122.3469092586117, 1154.0519784850339, 1336.4426643909171, 1704.2387442838437],
    ]

    def __init__(self, splits=610, hist_bins=None,
        images_x_from=False, images_x_to=False,
        images_y_from=False, images_y_to=False):

        self.splits = splits
        self.hist_bins = hist_bins

        self.images_x_from = images_x_from
        self.images_x_to = images_x_to
        self.images_y_from = images_y_from
        self.images_y_to = images_y_to

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no fitted state."""
        return self

    def _crop_row(self, row, crop_y):
        """Keep only the configured x (and optionally y) range of one row, flattened."""
        images = np.split(row, self._N_X_PARTS)[self.images_x_from : self.images_x_to]
        if crop_y:
            images = np.array([
                np.split(image, self._N_Y_PARTS)[self.images_y_from : self.images_y_to]
                for image in images
            ])
        return np.array(images).flatten()

    def transform(self, X, y=None):
        """Return one flat array of histogram counts per row of ``X``.

        Parameters
        ----------
        X : iterable of 1-D arrays
            Samples to transform.
        y : ignored

        Returns
        -------
        list of numpy.ndarray
            For every row, the concatenated bin counts of all its chunks.
        """
        # Resolve the edges into a local instead of assigning to
        # self.hist_bins: constructor parameters must stay untouched.
        hist_bins = self._DEFAULT_HIST_BINS if self.hist_bins is None else self.hist_bins
        n_bin_sets = len(hist_bins)
        n_splits = int(self.splits)

        # False is the "not set" sentinel, so 0 is a valid bound.
        crop_x = self.images_x_from is not False and self.images_x_to is not False
        crop_y = self.images_y_from is not False and self.images_y_to is not False

        X_new = []
        for row in X:
            # Optional feature selection; the y range only applies together with x.
            if crop_x:
                row = self._crop_row(row, crop_y)

            chunks = np.array_split(row, n_splits)
            # Integer arithmetic maps chunk j to its edge set without the
            # rounding errors a float ratio can introduce at set boundaries.
            features = [
                np.histogram(chunk, bins=hist_bins[j * n_bin_sets // n_splits])[0]
                for j, chunk in enumerate(chunks)
            ]
            X_new.append(np.array(features).flatten())

        return X_new
    
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingRegressor
from sklearn.decomposition import TruncatedSVD

# Seed handed to every stochastic step. An explicit random_state keeps results
# repeatable regardless of global RNG state or the order cells were run in.
RANDOM_STATE = 99

# Histogram features -> standardisation -> dimensionality reduction -> linear classifier.
pipe = Pipeline([
    ('BinsExtraction', RandomBinsExtraction(splits=10500,
        images_x_from=False, images_x_to=False, images_y_from=False, images_y_to=False)),
    # Tried earlier and disabled: ('vct', VarianceThreshold(threshold=10.0)).
    ('scaler', StandardScaler()),
    # NOTE(review): n_components=10000 is far above the 190 rows the pipeline
    # is fitted on; the data cannot carry more informative components than it
    # has training rows, so a much smaller value is probably intended.
    ('svd', TruncatedSVD(n_components=10000, random_state=RANDOM_STATE)),
    # Tried earlier and disabled: ('linreg', LinearRegression(n_jobs=-1)).
    # The step keeps the name 'linearSVR' although it holds a LinearSVC
    # classifier; renaming it would change the `<step>__<param>` keys.
    ('linearSVR', LinearSVC(C=1.0, max_iter=1000, random_state=RANDOM_STATE))
])

In [None]:
from sklearn.metrics import f1_score, roc_auc_score
# Hold-out evaluation: fit on the first N_TRAIN samples, score on the rest.
N_TRAIN = 190
X_fit, y_fit = X[:N_TRAIN], y[:N_TRAIN]
X_val, y_val = X[N_TRAIN:], y[N_TRAIN:]

pipe.fit(X_fit, y_fit)
print('fitted')

# F1 is computed on hard class predictions.
y_pr = pipe.predict(X_val)
# ROC AUC ranks samples, so it needs the continuous decision scores; feeding it
# hard predictions would collapse the curve to a single operating point.
y_score = pipe.decision_function(X_val)
print(str(f1_score(y_val, y_pr)) + " " + str(roc_auc_score(y_val, y_score)))