In [6]:
from __future__ import division
%matplotlib inline
import csv
import sys
import os
import traceback
import json
import numpy as np
import pandas as pd
from tabulate import tabulate
from pandas.tools.plotting import lag_plot, autocorrelation_plot
from matplotlib import pyplot as plt
from collections import defaultdict, Counter
from scipy.stats import norm, skew, kurtosis
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.ar_model import AR
import scipy.fftpack as fft

from helper_functions import getListOfFiles, getCSV, getStatistics, remap_interval

## Exploring meta-features

In [7]:
# Directory that is searched for the recorded CSV files.
csv_dir = "./data"

# File names returned by the project helper for the ".csv" extension.
files = getListOfFiles(csv_dir, ".csv")

# Report how many files were found, followed by the full list as JSON.
print(">> {} CSV Files found:\n".format(len(files)))
print(json.dumps(files))

>> 34 CSV Files found:

["_2016-11-23-18-49-13_exp1_Player.csv", "_2016-11-23-18-49-13_exp2_Player.csv", "_2016-11-23-18-49-13_exp3_Player.csv", "_2016-11-23-18-49-13_exp4_Player.csv", "_2016-11-23-18-49-13_exp5_Player.csv", "_2016-11-24-15-43-37_exp1d_Player.csv", "_2016-11-24-15-43-37_exp2d_Player.csv", "_2016-11-24-15-43-37_exp3d_Player.csv", "_2016-11-24-15-43-37_exp4d_Player.csv", "_2016-11-24-15-43-37_exp5d_Player.csv", "_2016-11-24-15-43-37_exp6d_Player.csv", "_2016-11-24-16-23-29_expa_Player.csv", "_2016-11-24-16-23-29_expb_Player.csv", "_2016-11-24-16-23-29_expc_Player.csv", "_2016-11-24-16-23-29_expd_Player.csv", "_2016-11-24-16-48-48_exp1d_Player.csv", "_2016-11-24-16-48-48_exp2d_Player.csv", "_2016-11-24-16-48-48_exp3d_Player.csv", "_2016-11-24-17-15-38_expa_Player.csv", "_2016-11-24-17-15-38_expb_Player.csv", "_2016-11-24-17-15-38_expc_Player.csv", "_2016-11-24-17-40-06_expb_Player.csv", "_2016-11-26-15-42-51_exp1d_Player.csv", "_2016-11-26-16-05-47_exp1d_Player.csv", "_20

Now we define a dictionary that remaps the names of the features. This eases visualization, since ROS feature names may be very long, and it also allows quick, decoupled modifications (just edit the variable locally). From it we also derive an ignore list containing the names we do not want to consider in the analysis: every dictionary key with an empty value is added to the ignore list. **Here we assume the feature names are consistent with the `csv` data. Otherwise, a `ValueError` exception is likely to be thrown!**

In [15]:
# Maps each feature name as it appears in the loaded data (key) to the short
# name used throughout this notebook (value). Mapping a key to the empty
# string "" puts that feature on the ignore list built below.
feature_name_map = {
  "time" : "time",
  "Control": "control",
  "High_level": "high_level",
  "Expectation": "expectation",
  "Activity": "activity",
  "/kinect_features/.ci": "ci",
  "/kinect_features/.distance": "distance",
  "/kinect_features/.proximity": "proximity",
  "robogame/imu_state.gyro.x": "gyroX",
  "robogame/imu_state.gyro.y": "gyroY",
  "robogame/imu_state.gyro.z": "gyroZ",
  "robogame/imu_state.linear_acc.x": "accX",
  "robogame/imu_state.linear_acc.y": "accY",
  "robogame/imu_state.linear_acc.z": "accZ"
}

# Names of the features that must be skipped: every key mapped to "".
# The comparison is done by value (==); `is ""` tests object identity and only
# worked as a side effect of string interning.
ignore_col_list = [k for k, v in feature_name_map.items() if v == ""]
print("List of ignored topics: {}".format(ignore_col_list))

List of ignored topics: []


## Meta-features function definition

In [19]:
def mean(data):
    """Calculates the column-wise mean of the numeric data.

    data : a pandas DataFrame, or a list of DataFrames.

    Returns the result of ``DataFrame.mean`` (missing values skipped) for a
    single frame, or a list with one such result per frame for a list input.
    """
    if isinstance(data, list):
        return [d.mean(axis=0, skipna=True, numeric_only=True) for d in data]
    # numeric_only=True matches the list branch and the sibling reducers
    # (std, max_value, min_value); without it non-numeric columns make the
    # reduction fail on current pandas versions.
    return data.mean(axis=0, skipna=True, numeric_only=True)


def std(data):
    """Standard deviation of every numeric column.

    Accepts one pandas DataFrame or a list of them; a list input yields a
    list with one result per element, otherwise a single result is returned.
    Missing values are skipped.
    """
    is_list = isinstance(data, list)
    frames = data if is_list else [data]
    results = [frame.std(axis=0, skipna=True, numeric_only=True) for frame in frames]
    return results if is_list else results[0]


def max_value(data):
    """Largest value of every numeric column.

    Accepts one pandas DataFrame or a list of them; a list input yields a
    list with one result per element. Missing values are skipped.
    """
    reduce_kwargs = dict(axis=0, skipna=True, numeric_only=True)
    if not isinstance(data, list):
        return data.max(**reduce_kwargs)
    return [window.max(**reduce_kwargs) for window in data]

    
def min_value(data):
    """Smallest value of every numeric column.

    Accepts one pandas DataFrame or a list of them; a list input yields a
    list with one result per element. Missing values are skipped.
    """
    def _column_min(frame):
        return frame.min(axis=0, skipna=True, numeric_only=True)

    if isinstance(data, list):
        return [_column_min(frame) for frame in data]
    return _column_min(data)


def mad(data):
    """Calculates the median absolute deviation of every numeric column.

    data : a pandas DataFrame, or a list of DataFrames.

    Returns a pandas Series indexed by column name for a single frame, or a
    list with one such Series per frame for a list input.
    """
    def _mad_of_frame(frame):
        deviations = {}
        for col in frame._get_numeric_data():
            values = frame[col].dropna()
            # median of the absolute distances to the column median
            deviations[col] = (values - values.median()).abs().median()
        return pd.Series(deviations)

    if isinstance(data, list):
        # Each element is processed on its own; the previous implementation
        # looked up the columns on the list itself and raised AttributeError.
        return [_mad_of_frame(d) for d in data]
    return _mad_of_frame(data)


def sma(data):
    """Computes Signal magnitude area.
    http://dsp.stackexchange.com/questions/18649/signal-magnitude-area

    data : a pandas DataFrame, or a list of DataFrames.

    For every numeric column, returns the sum of the values (missing values
    skipped) divided by the number of rows of the frame. A list input yields
    a list with one result per frame.
    """
    # NOTE(review): the referenced definition sums absolute values; this code
    # sums the signed values. Left unchanged -- confirm the intended formula.
    if isinstance(data, list):
        # Normalise by each frame's own row count (the list has no .shape).
        return [d.sum(axis=0, skipna=True, numeric_only=True) / d.shape[0] for d in data]
    return data.sum(axis=0, skipna=True, numeric_only=True) / data.shape[0]


def energy(data):
    """Energy measure: mean of the squared values.

    Accepts one pandas DataFrame or a list of them. Missing values are
    removed with ``dropna()`` before squaring. A list input yields a list
    with one result per element.
    """
    def _mean_square(frame):
        squared = frame.dropna().apply(lambda col: col ** 2)
        return squared.mean(axis=0, skipna=True, numeric_only=True)

    if isinstance(data, list):
        return [_mean_square(frame) for frame in data]
    return _mean_square(data)


def iqr(data):
    """Interquartile range (75th percentile minus 25th percentile).
    http://stackoverflow.com/questions/23228244/how-do-you-find-the-iqr-in-numpy

    A numpy array is reduced directly. Any other input is treated as a
    pandas DataFrame: the range is computed per numeric column (missing
    values dropped) and returned as a Series indexed by column name.
    """
    def _spread(values):
        upper, lower = np.percentile(values, [75, 25])
        return np.subtract(upper, lower)

    if isinstance(data, np.ndarray):
        return _spread(data)

    ranges = {}
    for col in data._get_numeric_data():
        ranges[col] = _spread(data[col].dropna())
    return pd.Series(ranges)

def entropy(data):
    """Signal entropy.

    Placeholder: not implemented yet, always returns None.
    """
    return None

def maxInds(data, n_bins=200, filterMean=True):
    """Returns the index of the frequency component with largest magnitude.

    data       : pandas DataFrame; every numeric column is processed.
    n_bins     : FFT length passed to ``fft.fft`` (the signal is truncated or
                 zero-padded to this size). Only the first n_bins/2 bins are
                 searched.
    filterMean : when True the mean of the signal is removed before the FFT,
                 otherwise the raw signal is transformed.

    Returns a pandas Series mapping column name -> bin index.
    """
    m_indexes = {}
    for k in data._get_numeric_data():
        # .values works on every pandas version (as_matrix() was removed).
        c_sig = data[k].dropna().values
        if filterMean:
            # remove mean of the signal, for better results.
            c_sig = c_sig - np.mean(c_sig)
        # With filterMean=False the raw signal is used; previously an empty
        # list was handed to the FFT in that case.
        freqsig = fft.fft(c_sig, n=n_bins)
        half_freq_domain = np.abs(freqsig[:n_bins // 2])
        # argmax returns the first bin holding the maximum magnitude
        m_indexes[k] = np.argmax(half_freq_domain)
    return pd.Series(m_indexes)

def meanFreq(data, n_bins=200, filterMean=True):
    """
    Weighted average of the frequency components to obtain a mean frequency
    http://luscinia.sourceforge.net/page26/page35/page35.html

    data       : pandas DataFrame; every numeric column is processed.
    n_bins     : FFT length passed to ``fft.fft``; only the first n_bins/2
                 bins enter the average.
    filterMean : when True the mean of the signal is removed before the FFT,
                 otherwise the raw signal is transformed.

    Returns a pandas Series mapping column name -> magnitude-weighted mean
    bin index (NaN when all magnitudes are zero).
    """
    m_freq = {}
    for k in data._get_numeric_data():
        # .values works on every pandas version (as_matrix() was removed).
        c_sig = data[k].dropna().values
        if filterMean:
            # remove mean of the signal, for better results.
            c_sig = c_sig - np.mean(c_sig)
        # With filterMean=False the raw signal is used; previously an empty
        # list was handed to the FFT in that case.
        freqsig = fft.fft(c_sig, n=n_bins)
        magnitudes = np.abs(freqsig[:n_bins // 2])
        bin_index = np.arange(len(magnitudes))
        m_freq[k] = np.sum(magnitudes * bin_index) / np.sum(magnitudes)
    return pd.Series(m_freq)

def skewness(data, n_bins=200, filterMean=True):
    """skewness of the frequency domain signal

    data       : pandas DataFrame; every numeric column is processed.
    n_bins     : FFT length passed to ``fft.fft``; the skewness is taken over
                 the magnitudes of the first n_bins/2 bins.
    filterMean : when True the mean of the signal is removed before the FFT,
                 otherwise the raw signal is transformed.

    Returns a pandas Series mapping column name -> skewness.
    """
    m_skew = {}
    for k in data._get_numeric_data():
        # .values works on every pandas version (as_matrix() was removed).
        c_sig = data[k].dropna().values
        if filterMean:
            # remove mean of the signal, for better results.
            c_sig = c_sig - np.mean(c_sig)
        freqsig = fft.fft(c_sig, n=n_bins)
        half_freq_domain = np.abs(freqsig[:n_bins // 2])
        # Measure the spectrum, as documented. The previous code computed the
        # spectrum but then took the skewness of the time-domain signal.
        m_skew[k] = skew(half_freq_domain)
    return pd.Series(m_skew)

def kurtos(data, n_bins=200, filterMean=True):
    """kurtosis of the frequency domain signal

    data       : pandas DataFrame; every numeric column is processed.
    n_bins     : FFT length passed to ``fft.fft``; the kurtosis is taken over
                 the magnitudes of the first n_bins/2 bins.
    filterMean : when True the mean of the signal is removed before the FFT,
                 otherwise the raw signal is transformed.

    Returns a pandas Series mapping column name -> kurtosis
    (``scipy.stats.kurtosis`` with its default settings).
    """
    m_kurtosis = {}
    for k in data._get_numeric_data():
        # .values works on every pandas version (as_matrix() was removed).
        c_sig = data[k].dropna().values
        if filterMean:
            # remove mean of the signal, for better results.
            c_sig = c_sig - np.mean(c_sig)
        freqsig = fft.fft(c_sig, n=n_bins)
        half_freq_domain = np.abs(freqsig[:n_bins // 2])
        # Measure the spectrum, as documented. The previous code computed the
        # spectrum but then took the kurtosis of the time-domain signal.
        m_kurtosis[k] = kurtosis(half_freq_domain)
    return pd.Series(m_kurtosis)

def bandsEnergy():
    """Energy of a frequency interval within the bins of the FFT.

    Placeholder: not implemented yet, always returns None.
    """
    return None

def angle():
    """Angle between two vectors.

    Placeholder: not implemented yet, always returns None.
    """
    return None

def arCoeff():
    """Autoregression coefficients with Burg order equal to 4.

    Placeholder: not implemented yet, always returns None.
    """
    return None

def correlation_acc(data):
    """Correlation coefficient between accY and each other IMU signal.

    data : pandas DataFrame with the columns accX, accY, accZ, gyroX, gyroY
           and gyroZ.

    Returns a dict mapping "<column>-accY" to the coefficient found in the
    accY column of ``DataFrame.corr()``; accY itself is left out.
    """
    imu_cols = ["accY", "accX", "accZ", "gyroZ", "gyroX", "gyroY"]
    cor = data[imu_cols].corr().to_dict()['accY']
    # items() works on both Python 2 and 3 (iteritems() exists only on 2).
    return dict((k + '-accY', v) for k, v in cor.items() if k != 'accY')

def correlation_kinect(data):
    """Correlation coefficient between the kinect "ci" and "proximity" signals.

    data : pandas DataFrame with the columns ci and proximity.

    Returns a dict mapping "<column>-ci" to the coefficient found in the ci
    column of ``DataFrame.corr()``; ci itself is left out.
    """
    # to_dict()["ci"] is already a flat {column: coefficient} mapping, so no
    # flattening helper is needed (flatten_dict is neither defined nor
    # imported in the visible part of this notebook).
    cor = data[["ci", "proximity"]].corr().to_dict()["ci"]
    # items() works on both Python 2 and 3 (iteritems() exists only on 2).
    return dict((k + '-ci', v) for k, v in cor.items() if k != 'ci')

In [30]:
def loadFile(file_name, verbose=False):
    """Loads one CSV file and returns its windows as pandas DataFrames.

    file_name : name of the file, resolved relative to ``csv_dir``.
    verbose   : when True, also prints a load summary obtained from
                ``getStatistics``.

    Returns a list with one DataFrame per window; columns are renamed through
    ``feature_name_map`` and columns listed in ``ignore_col_list`` are left
    out. Returns None if ``verbose`` is set and the summary raises ValueError.

    Raises ValueError when a considered column has an empty window. If
    ``getCSV`` raises ValueError the traceback is printed and ``sys.exit(-1)``
    is called.
    """
    ##NOTE: IF "TOO MANY VALUES TO UNPACK" ERROR IN THE getCSV METHOD, RESTART THE KERNEL. SOMETHING MUST BE WRONG WITH
    # THE KERNEL INITIALIZATION. MUST BE CHECKED! (LOW-PRIORITY)

    print('-- Processing: "{}"'.format(file_name))

    # load the data, abort in case of error.
    try:
        num_windows, csv_data = getCSV(os.path.join(csv_dir, file_name))
    except ValueError:
        print(traceback.format_exc())
        sys.exit(-1)

    # one dataframe per window, in window order.
    windows = []
    for win_idx in range(num_windows):
        window_columns = {}
        for raw_name in csv_data.keys():
            # columns on the ignore list are not part of the analysis.
            if raw_name in ignore_col_list:
                continue
            samples = csv_data[raw_name][win_idx]
            if samples == []:
                raise ValueError("\tFile has empty tagged windows. Skipping...")
            window_columns[feature_name_map[raw_name]] = samples

        # convert dictionary to dataframe and keep it with the other windows.
        windows.append(pd.DataFrame.from_dict(window_columns))

    print('-- Retrieved {} windows in {}'.format(num_windows, file_name))

    if not verbose:
        return windows

    overlap_reference = 50
    try:
        stats = getStatistics(csv_data, compareWith=overlap_reference)
        _, n_windows, sample_info, avg_overlap, avg_diff = stats
        print("LOAD SUMMARY:")
        summary_row = [n_windows, "{:.2f}".format(avg_overlap), "{:.2f}".format(avg_diff)]
        print(tabulate([summary_row],
                       headers=["#Win", "Avg. Overlap", "Avg. dev. from ref."]))
    except ValueError:
        print(traceback.format_exc())
        return None
    return windows

def get_metadata(data, target="control", all_targets=['control','expectation','high_level','activity']):
    """Export the metadata of a list of windows to CSV files.

    data        : the list of windows data (pandas DataFrames).
    target      : the target we want to export.
    all_targets : names of all target columns; every one except ``target``
                  is kept out of the feature file, together with 'time'.

    Writes "<target>_X.csv" (header plus one row of per-column means for each
    window) and "<target>_y.csv" (header plus the target value of the first
    sample of each window). Returns None.
    """
    # Honour the target/all_targets arguments (previously 'control' and the
    # target list were hard-coded here).
    exclude = [t for t in all_targets if t != target] + ['time']
    numeric_cols = data[0]._get_numeric_data().columns.values
    fieldnames = [k for k in numeric_cols if k not in exclude]

    # 'w' is the valid spelling of the intended mode ('wa' is rejected by
    # Python 3); the with-block closes both files even when a row fails.
    with open("{}_X.csv".format(target), 'w') as X_output_file, \
            open("{}_y.csv".format(target), 'w') as y_output_file:
        # extrasaction='ignore' drops the excluded columns ('time' and the
        # other targets) from each row instead of raising ValueError.
        Xwriter = csv.DictWriter(X_output_file, fieldnames, extrasaction='ignore')
        # write the headers
        Xwriter.writeheader()
        X_output_file.flush()
        ywriter = csv.writer(y_output_file)
        ywriter.writerow([target])
        for df in data:
            rows = mean(df).to_dict()
            Xwriter.writerow(rows)      # write content to the file
            X_output_file.flush()       # flush data.
            # first sample of the window, regardless of the index labels
            ywriter.writerow([df[target].iloc[0]])
            y_output_file.flush()
    

def getMetadataForAll(listOfFiles, target="control", all_targets=['control','expectation','high_level','activity']):
    """Export the metadata of several CSV files to a single pair of files.

    listOfFiles : names of the CSV files, each loaded with ``loadFile``.
    target      : the target we want to export.
    all_targets : names of all target columns; every one except ``target``
                  is kept out of the feature file, together with 'time'.

    Writes "<target>_X.csv" (per-window column means) and "<target>_y.csv"
    (target value of the first sample of each window). An empty line is
    written to both after each processed file. Files for which ``loadFile``
    raises ValueError, or that yield no windows, are skipped and reported at
    the end. Returns None.
    """
    # set(target) would split the string into characters and exclude nothing
    # useful; compare the names themselves instead.
    exclude = [t for t in all_targets if t != target] + ['time']
    X_output_file = None
    y_output_file = None
    Xwriter = None
    ywriter = None
    failed_files = []
    try:
        for csv_filename in listOfFiles:
            try:
                data = loadFile(csv_filename)
            except ValueError:
                print(traceback.format_exc())
                failed_files.append(csv_filename)
                continue

            if not data:
                # nothing to export (and no window to read the header from)
                failed_files.append(csv_filename)
                continue

            # Create the writers on the first file that loads, not strictly
            # on the first file of the list, which may have been skipped.
            if Xwriter is None:
                numeric_cols = data[0]._get_numeric_data().columns.values

                # 'w' is the valid spelling of the intended mode
                # ('wa' is rejected by Python 3).
                X_output_file = open("{}_X.csv".format(target), 'w')
                y_output_file = open("{}_y.csv".format(target), 'w')

                # extrasaction='ignore' drops excluded columns from the rows
                # instead of raising ValueError.
                Xwriter = csv.DictWriter(X_output_file,
                                         [k for k in numeric_cols if k not in exclude],
                                         extrasaction='ignore')
                # write the headers
                Xwriter.writeheader()
                X_output_file.flush()
                ywriter = csv.writer(y_output_file)
                ywriter.writerow([target])

            for df in data:
                # axis passed by keyword: the positional form was removed
                # from recent pandas versions.
                rows = mean(df.drop('time', axis=1)).to_dict()
                Xwriter.writerow(rows)      # write content to the file
                X_output_file.flush()       # flush data.
                # first sample of the window, regardless of the index labels
                ywriter.writerow([df[target].iloc[0]])
                y_output_file.flush()

            ### write an empty line to mark the end of the file
            Xwriter.writerow({})
            X_output_file.flush()
            ywriter.writerow([])
            y_output_file.flush()
            #############################
    finally:
        # always release the output files, even when a row fails.
        for handle in (X_output_file, y_output_file):
            if handle is not None:
                handle.close()
    print('-- List of failed files:\n{} '.format(json.dumps(failed_files, indent=4)))

In [32]:
# Export the features and labels of every listed CSV file, using the default
# target ("control"); the output goes to "control_X.csv" and "control_y.csv".
getMetadataForAll(files)

-- Processing: "_2016-11-23-18-49-13_exp1_Player.csv"
-- Retrieved 24 windows in _2016-11-23-18-49-13_exp1_Player.csv
-- Processing: "_2016-11-23-18-49-13_exp2_Player.csv"
-- Retrieved 28 windows in _2016-11-23-18-49-13_exp2_Player.csv
-- Processing: "_2016-11-23-18-49-13_exp3_Player.csv"
-- Retrieved 12 windows in _2016-11-23-18-49-13_exp3_Player.csv
-- Processing: "_2016-11-23-18-49-13_exp4_Player.csv"
-- Retrieved 23 windows in _2016-11-23-18-49-13_exp4_Player.csv
-- Processing: "_2016-11-23-18-49-13_exp5_Player.csv"
-- Retrieved 27 windows in _2016-11-23-18-49-13_exp5_Player.csv
-- Processing: "_2016-11-24-15-43-37_exp1d_Player.csv"
-- Retrieved 31 windows in _2016-11-24-15-43-37_exp1d_Player.csv
-- Processing: "_2016-11-24-15-43-37_exp2d_Player.csv"
-- Retrieved 48 windows in _2016-11-24-15-43-37_exp2d_Player.csv
-- Processing: "_2016-11-24-15-43-37_exp3d_Player.csv"
Traceback (most recent call last):
  File "<ipython-input-30-70d5bdb09872>", line 88, in getMetadataForAll
    data