In [20]:
import matplotlib.pyplot as plt # plotting
import pandas as pd # data manipulation and analysis
import numpy as np # numerical computation
import pickle

import scipy
from scipy.interpolate import spline
from scipy.ndimage.filters import gaussian_filter1d
from statsmodels.nonparametric.smoothers_lowess import lowess
import random
import math

### load the data from pickled files

In [2]:
f = open("partitioned_features.pickle","rb")
feat_all = pickle.load(f)
f.close()

f = open("partitioned_features_defective.pickle","rb")
feat_defective = pickle.load(f)
f.close()

f = open("partitioned_features_good.pickle","rb")
feat_good = pickle.load(f)
f.close()

### quality check 1 - check if all data in good is labeled w/ 0 and all data w/ defective is labeled w/ 1

They passed the test :) 

In [3]:
for test in feat_defective:
    for pt in test:
        label = pt[-2]
        if label==0:
            print("Error")
            print(pt)
        elif label!=1:
            print("Error")

In [4]:
for test in feat_good:
    for pt in test:
        label = pt[-2]
        if label==1:
            print("Error")
            print(pt)
        elif label!=0:
            print("Error")

## Calculate all the means and statndard deviation

The methods below take as input a list of features - this is the data, normalised and chopped. The output is some feature of the time series, for instance it could be the mean, the standard deviation, the max, the min, the norm gradient mean etc. 

In [28]:
def the_means(feature_list):
    feat_means = []
    for i in feature_list:
        transposed = np.transpose(i)
        means = []
        for j in transposed:
            means.append(np.mean(j))
        feat_means.append(means)
    return feat_means

def the_stds(feature_list):
    feat_stds = []
    for i in feature_list:
        transposed = np.transpose(i)
        stds = []
        for j in transposed:
            stds.append(np.std(j))
        feat_stds.append(stds)
    return feat_stds

def the_max(feature_list):
    feat_max = []
    for i in feature_list:
        transposed = np.transpose(i)
        maxes = []
        for j in transposed:
            maxes.append(max(j))
        feat_max.append(maxes)
    return feat_max

def the_min(feature_list):
    feat_min = []
    for i in feature_list:
        transposed = np.transpose(i)
        mins = []
        for j in transposed:
            mins.append(min(j))
        feat_min.append(mins)
        
def get_gradient(feature_list):# returns the gradient of everything
    feat_grad = []
    for i in feature_list:
        transposed = np.transpose(i)
        # find the gradient at each point
        gradients = []
        for j in range(len(transposed)):
            grad_feat = []
            for k in range(len(transposed[j])):
                if k>=1:
                    grad_feat.append(transposed[j][k]-transposed[j][k-1])
            gradients.append(grad_feat)
        feat_grad.append(np.transpose(gradients))
    return feat_grad
        
def non_zero_grad_mean45(feature_list):# 2d
    feature_grad = get_gradient(feature_list)
    non_zero_grad_means = []
    for i in feature_grad:
        transposed = np.transpose(i)
        non_zero_grad = [[],[]]
        for j in [4,5]:
            for k in range(len(transposed[j])):
                if transposed[j][k]!=0:
                    non_zero_grad[j-4].append(transposed[j][k])
        non_zero_grad_means.append([np.mean(map(abs,non_zero_grad[0])),np.mean(map(abs,non_zero_grad[1]))])
    return non_zero_grad_means

def gradient_norm_mean(feature_list):
    feat_grad = get_gradient(feature_list)
        
    # we want a gradient (absolute value) mean as a feature
    feat_grad = map(abs,feat_grad)
    gradient_mean = the_means(feat_grad)
    
    return gradient_mean

def double_derivative_norm_mean(feature_list):# 10 D
    double_derivative = get_gradient(get_gradient(feature_list))
    
    # we want the absolute value mean of the double derivative
    double_derivative = map(abs,double_derivative)
    double_derivative_mean = the_means(double_derivative)
    
    return double_derivative_mean

# returns the input smoothed
#def get_smoothed(feature_list):


# something we noticed is that features 4, 5 and 6 are often 50 for many data points in a row, in other
# words the gradient is 0 quite alot. We therefore add two new features which are 
# 1) the number of data points where the gradient is 0 divided by the number of times the gradient is zero
# we do this for numbers 4, 5 and 6
def grad_new_features(feature_list):
    feature_grad = get_gradient(feature_list)
    new_features_list = []
    for i in feature_grad:
        transposed = np.transpose(i)
        new_feature = []
        for j in [4,5,6]:
            count=0
            for k in range(len(transposed[j])):
                if transposed[j][k]==0: count+=1
            try: new_feature.append(count/len(transposed[j]))
            except ZeroDivisionError: new_feature.append(count/1)
        new_features_list.append(new_feature)
    return new_features_list
        



#feat_all_grad_means = gradient_mean(feat_all)

#feat_all_means = the_means(feat_all)

new_features = grad_new_features(feat_all)
dd = double_derivative_norm_mean(feat_all)

"""
feat_all_means = the_means(feat_all)
feat_defective_means = the_means(feat_defective)
feat_good_means = the_means(feat_good)

feat_all_stds = the_stds(feat_all)
feat_defective_stds = the_means(feat_defective)
feat_good_stds = the_means(feat_good)

feat_all_max = the_max(feat_all)
feat_defective_max = the_max(feat_defective)
feat_good_max = the_max(feat_good)

feat_all_min = the_min(feat_all)
feat_defective_min = the_min(feat_defective)
feat_good_min = the_min(feat_defective)"""

'\nfeat_all_means = the_means(feat_all)\nfeat_defective_means = the_means(feat_defective)\nfeat_good_means = the_means(feat_good)\n\nfeat_all_stds = the_stds(feat_all)\nfeat_defective_stds = the_means(feat_defective)\nfeat_good_stds = the_means(feat_good)\n\nfeat_all_max = the_max(feat_all)\nfeat_defective_max = the_max(feat_defective)\nfeat_good_max = the_max(feat_good)\n\nfeat_all_min = the_min(feat_all)\nfeat_defective_min = the_min(feat_defective)\nfeat_good_min = the_min(feat_defective)'

In [12]:
new_features[400]

[0.6387987012987013, 0.48538961038961037, 0.24918831168831168]

#### function takes viet's normalised time series list and returns a feature matrix

In [13]:
def get_features_map(time_series):
    # means
    feat_means = the_means(time_series)# 10 d
    # stds
    feat_stds = the_stds(time_series)# 10 d
    # maxes
    feat_max = the_max(time_series)# 10 d
    # min
    feat_min = the_min(time_series)# 10 d
    # gradient mean
    feat_grad = gradient_mean_plus(time_series)# 10 d
    # new features from 4,5,6
    new_feat1 = grad_new_features(time_series)# 3 d
    
    # fusing the features
    



In [17]:
#print(feat_all_grad_means[100])

In [18]:
print(len(feat_all_stds))
print(len(feat_defective_stds))
print(len(feat_good_stds))
feat_all_stds[0]

470
166
304


[156.37173345474025,
 62.32149366866028,
 1.232999140272644,
 5.025814823312249,
 5.389884403292671,
 14.612273862705484,
 36.434891883195824,
 3.09237379284704,
 2.266874622624454,
 19.058937700983474,
 0.0,
 0.0]

## Find the gradient of each feature
We discretize the data, taking the data from Viet's one minuit data I will find the gradient at 15 equidistant points and use these as 15 features.

## Plot them

In [10]:
# input is a feature index

def disp_feature_mean_etc(i):
    fam = [j[i] for j in feat_all_means]
    fas = [j[i] for j in feat_all_stds]
    fdm = [j[i] for j in feat_defective_means]
    fgm = [j[i] for j in feat_good_means]
    fds = [j[i] for j in feat_defective_stds]
    fgs = [j[i] for j in feat_good_stds]
    
    plt.figure(figsize=(12,12))

    t=0.8

    plt.subplot(431)
    plt.plot(np.linspace(0,1,len(fam)), fam,"x",alpha=t)
    plt.title("feat_all_means")

    plt.subplot(432)
    plt.plot(np.linspace(0,1,len(fdm)), fdm,"x",alpha=t)
    plt.title("feat_defective_means")


    plt.subplot(433)
    plt.plot(np.linspace(0,1,len(fgm)), fgm,"x",alpha=t)
    plt.title("feat_good_means")



    plt.subplot(434)
    plt.plot(np.linspace(0,1,len(fas)), fas,"x",alpha=t)
    plt.title("feat_all_stds")


    plt.subplot(435)
    plt.plot(np.linspace(0,1,len(fds)), fds,"x",alpha=t)
    plt.title("feat_defective_stds")


    plt.subplot(436)
    plt.plot(np.linspace(0,1,len(fgs)), fgs,"x",alpha=t)
    plt.title("feat_good_stds")

    plt.show()
    
    
disp_feature_mean_etc(0)

NameError: name 'feat_all_means' is not defined