In [1]:
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from keras.models import model_from_json
import os

# LOAD AND USE MODEL
# The architecture is read from a JSON file and the trained weights from a separate HDF5 file.
# The `with` block guarantees the file handle is closed even if reading fails.
with open('../src/models/LSTM-100PCA-DeepDrop_all_GM_LOSO_Social_1.model.json','r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights("../src/models/LSTM-100PCA-DeepDrop_all_GM_LOSO_Social_1.weights.hdf5")
print("Loaded model from disk")
# evaluate loaded model on test data
# IMPORTANT: compile the model again before use!
loaded_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


Using Theano backend.


Loaded model from disk


In [3]:
from sklearn import decomposition
from keras.utils.np_utils import to_categorical
import math

#Getting to load the train/test data itself
# Run configuration: target variable, data sources and the sessions used for training/testing
# For tests
# NOTE(review): this label says "_et_" while datasourcestring is 'all' and the model files
# loaded in the first cell are the "_all_" ones -- confirm which one is intended.
label='LSTM-100PCA-DeepDrop_et_GM_LOSO_Social_1'
target='Social'
datasourcestring='all'
trainstring='case1-day1-session2-teacher1,case1-day1-session3-teacher1,case1-day1-session4-teacher1,case2-day1-session1-teacher2,case2-day1-session2-teacher2,case2-day2-session1-teacher2,case2-day2-session2-teacher2,case2-day3-session1-teacher2,case2-day3-session2-teacher2,case2-day4-session1-teacher2,case2-day4-session2-teacher2'
teststring='case1-day1-session1-teacher1'



# We parse the data sources to take into account, and sessions for the train and test sets
# `features` is a list of column positions. list() is required because a Python 3 range
# object has no extend().
features = list(range(0,2)) # These are only the session and timestamp
sources = datasourcestring.split(",")
for source in sources:
    if source=='all':
        # 'all' covers every source, so restart from scratch instead of appending
        # (otherwise e.g. "et,all" would select the eyetracking columns twice)
        features = list(range(0,7557))
        break
    elif source=='et':
        features.extend(range(2,12))
    elif source=='acc':
        features.extend(range(12,152))
    elif source=='aud':
        features.extend(range(152,6557))
    elif source=='vid':
        features.extend(range(6557,7557))
    else:
        # Raise instead of sys.exit(): `sys` is not imported in the cells shown, and an
        # exception is easier to handle inside a notebook than SystemExit
        raise ValueError("Wrong data sources. Possible values: all,et,acc,aud,vid")
features.extend(range(7557,7559)) # Add activity and Social
print("Selected features: "+str(len(features)))

# Empty entries are discarded: "".split(",") yields [''], which would otherwise hide an empty specification
sessiontrain = [s for s in trainstring.split(",") if s] # Gives an array of the sessions to train in
sessiontest = [s for s in teststring.split(",") if s] # Gives an array of the sessions to test in

# `or`, not `|`: the bitwise operator binds tighter than `==`, so the original test only
# triggered when BOTH lists were empty
if len(sessiontrain)==0 or len(sessiontest)==0:
    raise ValueError("Wrong train/test sessions specification. Should be a comma-separated string with the sessions identificators")

#path = os.path.dirname(os.path.realpath(sys.argv[0]))
# READING AND PREPARING THE DATA
#processeddatadir = path
processeddatadir = "../src/models" # TODO: Change this for the actual script
datafile = os.path.join(processeddatadir,'completeDataset.csv')
gzdatafile = os.path.join(processeddatadir,'completeDataset.csv.gz')
# Prefer the plain CSV and fall back to the gzipped copy
if os.path.isfile(datafile):
    fulldata = pandas.read_csv(datafile, sep=',', quotechar='"')
elif os.path.isfile(gzdatafile):
    fulldata = pandas.read_csv(gzdatafile, compression='gzip', sep=',', quotechar='"')
else:
    # Raise instead of sys.exit(): `sys` is not imported in the cells shown, so the
    # original call would have failed with a NameError rather than this message
    raise IOError("Data not available in the script's folder")

# Drop the useless first column
fulldata = fulldata.drop(fulldata.columns[[0]],axis=1)

def cleanAct(value):
    """Recode one activity label.

    Missing values and the codes 'OFF', 'TDT' and 'TEC' are all mapped to
    'Other'; any other value is returned unchanged.
    """
    if pandas.isnull(value) or value in ('OFF', 'TDT', 'TEC'):
        return 'Other'
    return value

def cleanSoc(value):
    """Recode one social label: missing values become 'Other', anything else is returned as is."""
    return 'Other' if pandas.isnull(value) else value


# We only look for predicting 4 states of activity and 3 of social, the rest (incl.NA) we bunch in 'Other' (so in the end it is a 5- and 4-class classification problem)
# The recoded labels are appended as two new columns at the end of the frame
fulldata['Activity'] = fulldata['Activity.win'].map(cleanAct)
fulldata['Social'] = fulldata['Social.win'].map(cleanSoc)

# Drop the three columns at positions 2, 3 and 4, so that the remaining column positions
# match the layout listed below (presumably these are columns that are no longer needed
# after the recoding above -- TODO confirm which ones they are)
fulldata.drop(fulldata.columns[[2,3,4]],axis=1,inplace=True)
# Sanity check: show the first and last five column names
print(fulldata.columns.values[0:5],"...",fulldata.columns.values[-5:])
#fulldata.head(3)

# Now the column indices match what is expected in the arguments parsed above
# * [,0]: ''session id''
# * [,1]: ''timestamp'' within the session (in ms)
# * [,2:12]: ''eyetracking'' features (mean/sd pupil diameter, nr. of long fixations, avg. saccade speed, fixation duration, fixation dispersion, saccade duration, saccade amplitude, saccade length, saccade velocity)
# * [,12:152]: ''accelerometer'' features, including X, Y, Z (mean, sd, max, min, median, and 30 FFT coefficients of each of them) and jerk (mean, sd, max, min, median, and 30 FFT coefficients of each of it)
# * [,152:6557]: ''audio'' features extracted from an audio snippet of the 10s window, using openSMILE. Includes features about whether there is someone speaking (153:163), emotion recognition models (164:184), and brute-force audio spectrum features and characteristics used in various audio recognition challenges/tasks (185:6557)
# * [,6557:7557]: ''video'' features extracted from an image taken in the middle of the window (the 1000 values of the last layer when passing the image through a VGG pre-trained model)
# * [,7557:7559]: ''Activity,Social'' labels we want to predict


# SELECTING THE DATASET FEATURES (DATA SOURCES BEING TRIED)
# `features` holds column positions, so the selection is positional. `.iloc` replaces the
# deprecated `.ix` indexer, which is no longer available in recent pandas versions.
data = fulldata.iloc[:,features]

# We drop the non-needed target variable
# (non in-place, so we never modify a selection of `fulldata` behind pandas' back)
if target == 'Activity':
    data = data.drop('Social',axis=1)
elif target == 'Social':
    data = data.drop('Activity',axis=1)
else:
    # Otherwise both label columns would stay in and fail later, obscurely, in astype(float)
    raise ValueError("Wrong target. Possible values: Activity,Social")

print(data.shape)
#data.head(3)

# SPLITTING THE DATA
test = data.loc[data['session'].isin(sessiontest)]
train = data.loc[data['session'].isin(sessiontrain)]
print(test.shape)
print(train.shape)
# Removing null values (any row with at least one missing value is discarded)
test = test[test.notnull().all(axis=1)]
train = train[train.notnull().all(axis=1)]
print("Removing null and NAs...")
print(test.shape)
print(train.shape)


# Columns 0 and 1 are session/timestamp and the last column is the target label;
# everything in between is used as predictors
X_train = train.values[:,2:-1].astype(float)
Y_train = train.values[:,-1]
X_test = test.values[:,2:-1].astype(float)
Y_test = test.values[:,-1]
# Labels of ALL rows of `data`, not only those in the train/test sessions
Y_total = data.values[:,-1]
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

#######################################################
# DO OTHER DATA TRANSFORMATIONS NEEDED, e.g. PCA, SELECT K-BEST FEATURES, etc (NORMALLY, ON THE TRAIN SET ONLY, TO BE APPLIED LATER TO THE TEST SET)
print("Transforming data... ")
k = 100
outs = len(data[target].unique())

# We standardize on the basis of the training data
scaler = StandardScaler().fit(X_train)
X_train_st = scaler.transform(X_train)
X_test_st = scaler.transform(X_test)

# # Removing zero variance features
# selector = VarianceThreshold()
# selector.fit(X_train_st)
# X_train_nz = selector.transform(X_train_st)
# X_test_nz = selector.transform(X_test_st)
# idx = numpy.where(selector.variances_ > threshold)[0] # to get the indices
# TODO: Remove highly correlated ones (according to Cohen's d?)
## From http://lucystatistics.blogspot.com.ee/2016/03/dimension-reduction.html
# c = df.corr().abs()
# so1=argsort(np.array(c))
# s = c.unstack()
# so2 = s.order(kind="quicksort")


# PCA is fitted on the standardized training data only and then applied to the test data.
# With more than k features we reduce to k components and report the explained variance;
# otherwise we keep as many components as there are features (and report nothing).
reduce_dimensions = X_train.shape[1]>k
if reduce_dimensions:
    print("PCA with "+str(k)+" components")
else:
    k = X_train.shape[1]

pca = decomposition.PCA(n_components=k)
X_train_pca = pca.fit_transform(X_train_st)
X_test_pca = pca.transform(X_test_st)

if reduce_dimensions:
    # print() with a single argument behaves the same on Python 2 and 3
    print('Variance explained:')
    print(pca.explained_variance_ratio_)
    print('Total variance explained by '+str(k)+' components:')
    print(sum(pca.explained_variance_ratio_))

#######################################################


# PREPARING THE DATA FOR KERAS TRAINING
# One hot encoding of the response variable (using dummy variables)

# encode class values as integers
# The encoder is fitted on the labels of the whole dataset so that train and test share the same class -> integer mapping
encoder = LabelEncoder()
encoder.fit(Y_total)
n_classes = len(encoder.classes_)

# convert integers to dummy variables (i.e. one hot encoded)
# The number of classes is given explicitly: otherwise to_categorical sizes the matrix from
# the largest label present, and a split lacking the last class would get fewer columns than
# the other split. It is passed positionally because the keyword name differs between Keras
# versions (nb_classes / num_classes).
encoded_Y_train = encoder.transform(Y_train)
dummy_y_train = to_categorical(encoded_Y_train, n_classes)
#encoder.fit(Y_test)
encoded_Y_test = encoder.transform(Y_test)
dummy_y_test = to_categorical(encoded_Y_test, n_classes)


seed = 66
numpy.random.seed(seed)


def slice_timeseries(X,Y,length=32):
    """Cut a feature matrix and its label matrix into consecutive, non-overlapping windows.

    X and Y are 2-D arrays indexed by time step along the first axis. Rows
    that do not fill a complete window of `length` steps are discarded.
    Returns the pair (X, Y), each reshaped to
    (number of windows, length, number of columns).
    """
    # TODO: consider more randomized slicing
    n_windows = X.shape[0] // length
    used_rows = n_windows * length
    x_windows = X[:used_rows, :].reshape((n_windows, length, X.shape[1]))
    y_windows = Y[:used_rows, :].reshape((n_windows, length, Y.shape[1]))
    return x_windows, y_windows

# Cut the PCA-transformed train and test data (and their one-hot labels) into fixed-length sequences.
# NOTE(review): train uses windows of 16 steps while test uses windows of 32 -- confirm the difference is intentional.
X_train_pca_reshaped, dummy_y_train_reshaped = slice_timeseries(X_train_pca, dummy_y_train, length=16)
X_test_pca_reshaped, dummy_y_test_reshaped = slice_timeseries(X_test_pca, dummy_y_test, length=32)


Selected features: 7559
(array(['session', 'timestamp', 'value.Mean', 'value.SD', 'value.Fix'], dtype=object), '...', array(['V998', 'V999', 'V1000', 'Activity', 'Social'], dtype=object))
(5561, 7558)
(461, 7558)
(5100, 7558)
Removing null and NAs...
(455, 7558)
(5061, 7558)
((5061, 7555), (5061,), (455, 7555), (455,))
Transforming data... 
PCA with 100 components
Variance explained:
[ 0.19377148  0.08003752  0.03967616  0.02126597  0.01905889  0.01672362
  0.01431736  0.01164806  0.0111452   0.00970049  0.00875957  0.00787292
  0.00725064  0.00706045  0.00651681  0.00596103  0.00536422  0.00535148
  0.00511501  0.00460986  0.00443447  0.00423327  0.00392198  0.0034854
  0.00345562  0.00337559  0.00322898  0.00305009  0.00304195  0.00295511
  0.00283019  0.0027339   0.00266283  0.00257872  0.00254498  0.00248343
  0.00235735  0.00232665  0.0022771   0.00223077  0.00220502  0.00218941
  0.00212522  0.00208172  0.00202671  0.00195109  0.00193412  0.00189897
  0.00186267  0.00180383  0.00

In [5]:
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, f1_score, cohen_kappa_score
from sklearn.cross_validation import cross_val_score

# The whole test set is passed to the model as one single sequence (a batch of size one);
# [0,:,:] removes that batch axis again, leaving one row of class scores per time step
Y_pred = loaded_model.predict(X_test_pca.reshape((1,) + X_test_pca.shape ))[0,:,:]

# Every metric except the AUC works on class indices, so derive them only once
y_true_idx = numpy.argmax(dummy_y_test, axis=1)
y_pred_idx = numpy.argmax(Y_pred, axis=1)

# Accuracy
print('Accuracy:')
acc = accuracy_score(y_true_idx, y_pred_idx)
print(acc)


# Confusion matrix
cm = confusion_matrix(y_true_idx, y_pred_idx)
numpy.set_printoptions(precision=2)
print('Confusion matrix:')
print(cm)

# AUC
# Best effort: the AUC stays None when it cannot be computed, but the reason is
# reported instead of being silently swallowed by a bare except
roc = None
try:
    roc = roc_auc_score(dummy_y_test, Y_pred, average='macro')
except ValueError as err:
    print('AUC could not be computed: '+str(err))
print('AUC score:')
print(roc)

# F1
f1= f1_score(y_true_idx, y_pred_idx, average='macro')
print('F1 score:')
print(f1)


# KAppa?
kappa = cohen_kappa_score(y_true_idx, y_pred_idx)
print('Kappa score:')
print(kappa)


Accuracy:
0.81978021978
Confusion matrix:
[[217  34   0   2]
 [ 17 141   0   4]
 [  0  10   0   0]
 [ 14   1   0  15]]
AUC score:
0.760041693401
F1 score:
0.566211896693
Kappa score:
0.67133255226


  'precision', 'predicted', average, warn_for)


In [6]:
# Load the performance figures that were written to disk when the model was trained
perf_csv_path = '../src/models/LSTM-100PCA-DeepDrop_all_GM_LOSO_Social_1.perf.csv'
perfdata = pandas.read_csv(perf_csv_path, sep=',', quotechar='"')
perfdata

Unnamed: 0,acc,auc,cm,f1,kappa,label
0,0.789011,0.796477,"[[218, 35, 0, 0], [33, 128, 0, 1], [3, 6, 0, 1...",0.545449,0.605833,LSTM-100PCA-DeepDrop_all_GM_LOSO_Social_1


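The metrics recomputed above from the reloaded model (accuracy 0.820, AUC 0.760, F1 0.566, kappa 0.671) do not match the ones stored at training time (accuracy 0.789, AUC 0.796, F1 0.545, kappa 0.606), so we might need to recalculate all the Python performances!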