In [3]:
import subprocess
import pandas as pd
import os
import sys
import pprint
import local_models.local_models
import logging
import ml_battery.log
from Todd_eeg_utils import *

import rpy2
import numpy as np
import rpy2.robjects.numpy2ri
from rpy2.robjects.packages import importr
import matplotlib.pyplot as plt

# Module-level logger; it is handed to the TimeLogger that tracks the DTW loop below.
logger = logging.getLogger(__name__)

In [4]:
# Root directories for this analysis (absolute paths on the original machine).
# data_dir holds fileinformation.csv and the derived classification data directories.
data_dir = "/home/brown/disk2/eeg/Phasespace/Phasespace/data/eeg-text"
# transformed_data_dir holds one sub-directory per recording file name.
transformed_data_dir = "/home/brown/disk2/eeg/transformed_data"

In [5]:
# Load the recording metadata. The first CSV line is skipped and the two leading
# columns are dropped, so the positional columns used later in this notebook are:
#   0 -> file name, 1 -> "Number of Records", 4 -> "Point of Seizure".
file_info_path = os.path.join(data_dir, "fileinformation.csv")
data_info = pd.read_csv(file_info_path, skiprows=1).iloc[:, 2:]

In [6]:
# Show the metadata table to verify the column layout after the positional slice.
data_info

Unnamed: 0,Unnamed: 2,Number of Records,Time of Seizure,Unnamed: 5,Point of Seizure,250
0,DAT.F00012,3963799,14040,,3510000,3510000
1,DAT.F00013,3632699,12720,,3180000,3180000
2,DAT.F00016,4447824,15960,,3990000,3990000
3,DAT.F00017,1827224,0,,0,0
4,DAT.F00018,2985924,10020,,2505000,2505000
5,DAT.F00019,3692374,12960,,3240000,3240000
6,DAT.F00020,3270974,11280,,2820000,2820000
7,DAT.F00022,2120524,0,,0,0
8,DAT.F00024,3378499,11700,,2925000,2925000
9,DAT.F00026,3370999,0,,0,0


In [7]:
# Count recordings whose seizure-point column (position 4) is strictly positive.
# NOTE: this only tests `> 0`; the loading loops below additionally require the
# seizure point to be smaller than the record count, so the two counts can differ.
seizure_points = data_info.iloc[:, 4]
how_many_epis = int((seizure_points > 0).sum())
how_many_epis

40

In [8]:
# Load the transformed series for every recording with a usable seizure point.
# Each recording has its own directory under `transformed_data_dir`; files whose
# name contains "negative" form the negative sample, all remaining files form the
# positive sample (presumably one file per channel -- TODO confirm). Only the
# first column of each file is kept, and the files of one recording are stacked
# along axis 1.
positive_samples = []
negative_samples = []
for i in range(data_info.shape[0]):
    data_file = data_info.iloc[i, 0]
    data_len = data_info.iloc[i, 1]
    data_epipoint = data_info.iloc[i, 4]
    # Skip recordings whose seizure point is missing (<= 0) or not inside the recording.
    if not (data_len > data_epipoint > 0):
        continue

    transformed_data_file_dir = os.path.join(transformed_data_dir, data_file)
    transformed_data_files = sorted(os.listdir(transformed_data_file_dir))
    negative_data_files = [f for f in transformed_data_files if "negative" in f]
    positive_data_files = [f for f in transformed_data_files if "negative" not in f]

    # Pairing the two lists with zip() would silently drop or mis-pair files when
    # the counts differ (e.g. a stray file in the directory), so fail loudly instead.
    if len(negative_data_files) != len(positive_data_files):
        raise ValueError(
            "Mismatched file counts in {}: {} negative vs {} positive".format(
                transformed_data_file_dir,
                len(negative_data_files),
                len(positive_data_files)))

    positive_sample_all_channels = [
        np.loadtxt(os.path.join(transformed_data_file_dir, pdf))[:, 0]
        for pdf in positive_data_files]
    negative_sample_all_channels = [
        np.loadtxt(os.path.join(transformed_data_file_dir, ndf))[:, 0]
        for ndf in negative_data_files]
    positive_samples.append(np.stack(positive_sample_all_channels, axis=1))
    negative_samples.append(np.stack(negative_sample_all_channels, axis=1))


In [9]:
# Shape of the first recording's array: (rows per file, number of files stacked on axis 1).
positive_samples[0].shape

(1000, 21)

In [10]:
# Turn the per-recording lists into single arrays with a new leading recording axis.
# np.stack requires every recording to have the same shape.
positive_samples = np.stack(positive_samples, axis=0)
negative_samples = np.stack(negative_samples, axis=0)

In [11]:
# Shapes after stacking all recordings along the new leading axis.
positive_samples.shape, negative_samples.shape

((39, 1000, 21), (39, 1000, 21))

In [12]:
short_classification_data_dir = os.path.join(data_dir, "shortened_classification_data")


def _subsample_rows(raw, n_target):
    """Return exactly ``n_target`` evenly strided rows of ``raw``.

    The stride is ``raw.shape[0] // n_target``. A plain ``raw[::stride]`` yields
    more than ``n_target`` rows whenever the length is not an exact multiple of
    ``n_target``, which would later break ``np.stack``; the result is therefore
    truncated to ``n_target`` rows. For exact multiples the output is unchanged.

    Raises:
        ValueError: if ``raw`` has fewer than ``n_target`` rows.
    """
    stride = raw.shape[0] // n_target
    if stride < 1:
        raise ValueError(
            "Cannot subsample {} rows down to {} rows".format(raw.shape[0], n_target))
    return raw[::stride][:n_target]


# Load the raw (untransformed) onset / negative windows for the same recordings
# that passed the seizure-point filter when the transformed samples were loaded.
positive_raw_samples = []
negative_raw_samples = []
for i in range(data_info.shape[0]):
    data_file = data_info.iloc[i, 0]
    data_len = data_info.iloc[i, 1]
    data_epipoint = data_info.iloc[i, 4]
    if not (data_len > data_epipoint > 0):
        continue

    shortened_data_onset_file = os.path.join(
        short_classification_data_dir, "{}_onset.dat".format(data_file))
    shortened_data_negative_file = os.path.join(
        short_classification_data_dir, "{}_negative.dat".format(data_file))
    positive_raw = np.loadtxt(shortened_data_onset_file)
    negative_raw = np.loadtxt(shortened_data_negative_file)
    # The gprs are subsampled, so subsample the raw data to the same row count.
    positive_raw_samples.append(_subsample_rows(positive_raw, positive_samples.shape[1]))
    negative_raw_samples.append(_subsample_rows(negative_raw, negative_samples.shape[1]))

In [13]:
# Collapse the per-recording lists of raw windows into arrays with a leading recording axis.
positive_raw_samples = np.stack(positive_raw_samples, axis=0)
negative_raw_samples = np.stack(negative_raw_samples, axis=0)

In [14]:
# The raw arrays must match the transformed arrays on the first two axes to be concatenated.
positive_raw_samples.shape, negative_raw_samples.shape

((39, 1000, 21), (39, 1000, 21))

In [15]:
# Put transformed and raw features side by side on the last axis of the 3-D arrays.
positive_combined_samples = np.concatenate(
    (positive_samples, positive_raw_samples), axis=-1)
negative_combined_samples = np.concatenate(
    (negative_samples, negative_raw_samples), axis=-1)

In [16]:
# The last axis now holds the transformed and the raw features together.
positive_combined_samples.shape, negative_combined_samples.shape

((39, 1000, 42), (39, 1000, 42))

In [21]:
# Preview the 2-D layout used for scaling: axis 1 becomes the rows and every
# (recording, feature) pair becomes its own column.
np.transpose(positive_combined_samples,(1,0,2)).reshape((positive_combined_samples.shape[1],-1)).shape

(1000, 1638)

In [22]:
def _axis1_as_rows(samples):
    """Flatten a 3-D array to 2-D with axis 1 as rows and (axis 0, axis 2) pairs as columns."""
    n_rows = samples.shape[1]
    return samples.transpose(1, 0, 2).reshape(n_rows, -1)


positive_combined_samples_reshaped = _axis1_as_rows(positive_combined_samples)
negative_combined_samples_reshaped = _axis1_as_rows(negative_combined_samples)

In [23]:
import sklearn.preprocessing

# StandardScaler standardises each column independently. In the reshaped layout
# a column is one (recording, feature) series, so every series is standardised
# on its own.
positive_scaler = sklearn.preprocessing.StandardScaler()
positive_scaler.fit(positive_combined_samples_reshaped)

negative_scaler = sklearn.preprocessing.StandardScaler()
negative_scaler.fit(negative_combined_samples_reshaped)

In [24]:
# Apply each fitted scaler to the same 2-D array it was fitted on.
positive_combined_samples_reshaped_scaled = (
    positive_scaler.transform(positive_combined_samples_reshaped)
)
negative_combined_samples_reshaped_scaled = (
    negative_scaler.transform(negative_combined_samples_reshaped)
)

In [28]:
def _restore_axes(flat, like):
    """Undo the 2-D flattening: reshape ``flat`` back to the 3-D layout of ``like``."""
    n_first, _, n_last = like.shape
    unflattened = flat.reshape((-1, n_first, n_last))
    return unflattened.transpose((1, 0, 2))


positive_combined_samples_scaled = _restore_axes(
    positive_combined_samples_reshaped_scaled, positive_combined_samples)
negative_combined_samples_scaled = _restore_axes(
    negative_combined_samples_reshaped_scaled, negative_combined_samples)

In [29]:
# Sanity check: the scaled arrays should have the same 3-D shape as the unscaled ones.
positive_combined_samples_scaled.shape, negative_combined_samples_scaled.shape

((39, 1000, 42), (39, 1000, 42))

In [30]:
# Reproducible shuffle of the recording indices for the train/test split.
# The number of recordings is read from the data rather than hard-coded, so the
# split stays valid if the filter above keeps a different number of recordings.
np.random.seed(0)
n_recordings = positive_combined_samples_scaled.shape[0]
indices = list(range(n_recordings))
np.random.shuffle(indices)

In [61]:
# Shuffled recording order; it is sliced into the train and test sets below.
indices

[4,
 28,
 29,
 33,
 34,
 25,
 10,
 22,
 11,
 27,
 18,
 15,
 2,
 38,
 20,
 36,
 16,
 35,
 8,
 13,
 5,
 17,
 14,
 32,
 7,
 31,
 1,
 26,
 12,
 30,
 24,
 6,
 23,
 21,
 19,
 9,
 37,
 3,
 0]

In [31]:
# The first 20 shuffled recordings are used for training, the remainder for testing.
n_train_recordings = 20
train_set, test_set = indices[:n_train_recordings], indices[n_train_recordings:]

In [32]:
# Select the train / test recordings from both classes (list indexing on axis 0).
positive_train, positive_test = (
    positive_combined_samples_scaled[train_set],
    positive_combined_samples_scaled[test_set],
)
negative_train, negative_test = (
    negative_combined_samples_scaled[train_set],
    negative_combined_samples_scaled[test_set],
)

# Positives come first in both sets, so the labels are a run of ones followed by zeros.
train = np.concatenate((positive_train, negative_train), axis=0)
test = np.concatenate((positive_test, negative_test), axis=0)
train_labels = np.concatenate(
    (np.ones(len(positive_train)), np.zeros(len(negative_train))))
test_labels = np.concatenate(
    (np.ones(len(positive_test)), np.zeros(len(negative_test))))

In [33]:
# Enable rpy2's numpy conversion so numpy arrays can be passed to R functions.
rpy2.robjects.numpy2ri.activate()

# Set up our R namespaces
# `R` is the handle used below to call R functions; importing 'dtw' loads the
# R package that provides the dtw() function used in the distance loop.
R = rpy2.robjects.r
DTW = importr('dtw')

In [34]:
# Uninitialised (test x train) matrix; every entry is filled by the DTW loop below.
cdists = np.empty((len(test), len(train)))

In [35]:
# (number of test series, number of train series)
cdists.shape

(38, 40)

In [36]:
import gc

# One timed step per (test, train) pair. The total is taken from the matrix
# being filled so it always matches the loop bounds below.
timelog = local_models.local_models.loggin.TimeLogger(
    logger=logger,
    how_often=1, total=cdists.size,
    tag="dtw_matrix")

# Fill the distance matrix: entry [test_i, train_i] is the DTW distance reported
# by R's dtw() between one test series and one train series.
for test_i in range(cdists.shape[0]):
    for train_i in range(cdists.shape[1]):
        with timelog:
            # distance_only / keep_internals=False ask dtw() for the distance
            # without retaining the full alignment internals.
            alignment = R.dtw(test[test_i], train[train_i], keep_internals=False, distance_only=True)
            dist = alignment.rx('distance')[0][0]
            print(dist)
            cdists[test_i, train_i] = dist
            # Collect on both the Python and the R side after every pair;
            # presumably to keep memory bounded across many R calls -- TODO confirm.
            gc.collect()
            R('gc()')
            gc.collect()
        

9048.358469633236


  places = np.log10(np.abs(number))


9268.067016578814
9277.747595826515
10897.466462300918
9633.286206242574
9256.39547308139
10797.679171652553
11501.856427386027
9578.213779066624
11676.61251368756
11612.851903010429
8742.422380859209
9971.118414780407
10404.39118772073
10638.614630308331
11595.36323652257
8869.46600660516
9720.395619607141
10577.04594852681
9781.271435389104
11791.198465688949
11591.98910093981
11601.23499034779
11050.664621941103
11739.09279926912
11287.429598066201
10398.416287310025
11582.350974365323
11173.343176255521
11560.490482309839
11385.866963789136
10338.27044428807
10983.390495698663
10551.004219836792
11276.42750376145
11774.14970601401
11656.598273511456
10936.165289323391
11069.3227719797
10472.418521278283
10953.035709763197
11285.99841606298
10918.802680180375
12641.832123705695
11531.95232801596
11310.09332827339
12313.95910489963
12781.298931898065
11142.002176988442
13155.306521401359
12748.399624300124
10484.651984691232
11495.667775677442
12085.378421851105
11838.976537782411
12

7937.914513646351
9873.902336292454
8482.006499180974
8460.403656282666
9773.864496604485
12219.687523319004
8625.37969650277
11149.227998851375
10649.448650500555
6527.713680068466
9655.99771013339
10118.109304434782
10368.805540890955
10830.332652263305
8231.875417610006
9074.942968862275
9944.404201880288
9559.033997155993
11523.896089503332
10675.54011345241
10620.28679559352
10460.102060807461
11006.499775865937
10749.699152455609
9330.914678283481
10664.556321957733
11058.578176712643
10526.927610677169
10741.236346929607
9921.858865305534
10338.265477654111
9832.272454322687
10728.566349927942
11191.670196024636
11217.671811699727
10496.9401562013
10594.798436774156
9097.407445019735
11073.785941913067
11615.390521051704
11437.996268833658
12283.58627885016
11942.191234046617
11204.99988829571
12066.640984047059
11693.930763559098
11617.816857776877
12689.426487664956
12578.32785239045
11094.86568871575
11298.903244860569
11650.531369459064
11956.940908132405
12455.963811420053


11759.175607993855
12356.463715302032
12008.107080591526
12540.564554419396
12211.967095942453
12291.769326973128
12423.704145357497
11550.496699975556
12451.983714906522
12837.642488377594
13015.819958502572
11765.181794233818
11902.984169001793
12524.48386821287
12718.140408220013
13326.43758687427
11994.405312890241
11946.116411194516
12828.463562097326
12980.457007905852
12999.815565244704
13094.48423132826
12869.606143537258
12370.808050562378
12792.504635795261
12679.431701212663
12031.806132163374
12982.849037842287
12058.88697558577
13136.762798630132
12812.076428918248
11788.937014094996
12306.926655612995
12248.315248806597
12688.97493841774
13349.166460917388
12618.54659722575
12101.252239495849
12431.451319672573
12172.785136632541
10280.743212852816
10869.108227373119
10621.842780153656
11817.597856949624
10999.833383808782
10206.404150022045
11295.942248858199
11561.079532974207
10819.55965748861
12212.240034097475
12268.069310426588
9822.287357314415
10992.745243872798
1

11567.788006938048
11247.369569863667
11857.876230857762
11451.546282948642
12584.824715423207
11973.398271414144
11539.814231548131
12091.707453424367
11530.042284015599
11898.030478445356
12724.707659327576
12482.868210801325
11147.192849646905
11545.287827814867
12255.676358184015
11856.008370515818
12811.660033671664
11057.851722057654
11770.190961010765
12376.84960823721
11779.338711868602
13016.987356728649
12941.60408371233
12890.0961648505
12534.138513725808
13066.914460819613
12607.707567306374
12106.760658523941
12556.628342776154
12013.951655903158
12921.50391570447
11922.171833429353
11609.241904408364
12161.22591477539
11723.196679785835
12058.20704377596
13031.703847267172
12101.803948086084
12351.930876648767
12250.789894215084
11499.2821655629
12526.441478144732
13320.03943737341
12685.278994810684
13544.514151711739
13470.803998282934
12814.53487812738
13428.101039241737
12857.65754737013
13290.898888282203
13916.96360784568
13805.947046413168
12888.753283976652
12980.

In [37]:
import sklearn.metrics

In [38]:
# Confirm the distance matrix still has one row per test series and one column per train series.
cdists.shape

(38, 40)

In [39]:
# Output directory for results computed on the combined (transformed + raw) features.
combined_classification_data_dir = os.path.join(
    data_dir, "combined_classification_data")
os.makedirs(combined_classification_data_dir, exist_ok=True)

In [40]:
# Cache the DTW distance matrix on disk.
#   * cache file exists      -> load it (it replaces any in-memory `cdists`)
#   * no file, `cdists` set  -> write the in-memory matrix to the cache file
#   * neither                -> nothing to load or save: fail with a clear message
cdists_file = os.path.join(combined_classification_data_dir, "dtw_cdists.dat")
if os.path.exists(cdists_file):
    cdists = np.loadtxt(cdists_file)
elif "cdists" in globals():
    np.savetxt(cdists_file, cdists)
else:
    raise FileNotFoundError(
        "No cached distance matrix at {} and no in-memory `cdists`; "
        "run the DTW cell first.".format(cdists_file))

In [41]:
# One label per test series; this must match the number of rows in cdists.
test_labels.shape

(38,)

In [42]:
# argmin over axis 1 gives, for each test series, the index of its closest train series.
np.argmin(cdists, axis=1).shape

(38,)

In [63]:
# Which train series are selected as nearest neighbour, and how many times each.
np.unique(np.argmin(cdists, axis=1), return_counts=True)

(array([ 0,  2,  7, 11, 16]), array([ 1,  1,  2, 22, 12]))

In [43]:
# 1-nearest-neighbour prediction: each test series takes the label of its closest train series.
nearest_train_index = np.argmin(cdists, axis=1)
predicted_labels = train_labels[nearest_train_index]
cm = sklearn.metrics.confusion_matrix(test_labels, predicted_labels)

In [44]:
# Rows are true labels, columns are predicted labels (sklearn's confusion_matrix convention).
print(cm)

[[ 0 19]
 [ 0 19]]


In [45]:
# Distance matrix in units of 1000, rounded to whole numbers, for a quick visual scan.
cdists_in_thousands = np.round(cdists / 1000, 0)
pd.DataFrame(cdists_in_thousands)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,9.0,9.0,9.0,11.0,10.0,9.0,11.0,12.0,10.0,12.0,...,11.0,10.0,11.0,11.0,11.0,12.0,12.0,11.0,11.0,10.0
1,11.0,11.0,11.0,13.0,12.0,11.0,12.0,13.0,11.0,13.0,...,12.0,12.0,12.0,12.0,12.0,13.0,13.0,12.0,12.0,11.0
2,10.0,10.0,10.0,12.0,11.0,10.0,11.0,11.0,10.0,12.0,...,12.0,11.0,11.0,11.0,11.0,12.0,12.0,12.0,11.0,11.0
3,10.0,10.0,10.0,11.0,10.0,11.0,11.0,13.0,11.0,13.0,...,12.0,11.0,12.0,11.0,12.0,13.0,13.0,12.0,12.0,11.0
4,10.0,10.0,9.0,11.0,10.0,10.0,11.0,13.0,10.0,12.0,...,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,11.0
5,9.0,10.0,9.0,11.0,10.0,10.0,11.0,12.0,10.0,12.0,...,12.0,11.0,11.0,11.0,11.0,12.0,12.0,11.0,11.0,11.0
6,9.0,10.0,9.0,11.0,10.0,9.0,11.0,11.0,10.0,11.0,...,11.0,11.0,11.0,11.0,11.0,12.0,11.0,11.0,11.0,10.0
7,11.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,13.0,...,13.0,12.0,12.0,12.0,13.0,13.0,12.0,12.0,12.0,12.0
8,8.0,9.0,8.0,10.0,9.0,8.0,10.0,9.0,9.0,11.0,...,10.0,9.0,10.0,10.0,10.0,11.0,10.0,10.0,10.0,9.0
9,9.0,9.0,9.0,11.0,9.0,8.0,10.0,12.0,9.0,11.0,...,11.0,10.0,11.0,11.0,11.0,12.0,11.0,11.0,11.0,10.0


In [53]:
# Nearest train-series index for every test series (indices refer to rows of `train`).
np.argmin(cdists, axis=1)

array([11, 11, 16, 11, 11, 11,  2, 11, 16, 11, 16, 11, 16, 11, 11,  7, 11,
       16, 11, 11, 11, 16,  7, 11, 11,  0, 16, 11, 11, 16, 16, 16, 11, 16,
       16, 11, 11, 11])

In [57]:
# Inspect a hand-picked subset of train columns, in units of 100.
cols = [0, 5, 11, 16, 22, 25, 26, 32, 37]
selected_in_hundreds = np.round(cdists[:, cols] / 100, 0)
pd.DataFrame(selected_in_hundreds, columns=cols)

Unnamed: 0,0,5,11,16,22,25,26,32,37
0,90.0,93.0,87.0,89.0,116.0,113.0,104.0,110.0,109.0
1,110.0,113.0,105.0,106.0,129.0,127.0,118.0,123.0,124.0
2,100.0,96.0,98.0,96.0,123.0,119.0,111.0,113.0,117.0
3,102.0,108.0,89.0,105.0,118.0,120.0,108.0,116.0,118.0
4,98.0,96.0,94.0,97.0,123.0,117.0,107.0,117.0,121.0
5,94.0,97.0,94.0,97.0,119.0,115.0,104.0,112.0,110.0
6,92.0,92.0,96.0,93.0,121.0,111.0,104.0,108.0,109.0
7,115.0,119.0,114.0,115.0,128.0,125.0,119.0,123.0,122.0
8,83.0,81.0,83.0,75.0,110.0,104.0,97.0,99.0,101.0
9,86.0,83.0,80.0,81.0,115.0,111.0,104.0,107.0,111.0


In [58]:
# Labelled confusion matrix: two-level row index ("true") and column index ("pred").
row_labels = [["true", "true"], ["-", "+"]]
column_labels = [["pred", "pred"], ["-", "+"]]
pd.DataFrame(cm, index=row_labels, columns=column_labels)

Unnamed: 0_level_0,Unnamed: 1_level_0,pred,pred
Unnamed: 0_level_1,Unnamed: 1_level_1,-,+
True,-,0,19
True,+,0,19


In [59]:
# Summary metrics taken from the confusion matrix (row = true, column = predicted).
true_positives = cm[1, 1]
acc = np.trace(cm) / cm.sum()              # correct predictions / all predictions
prec = true_positives / cm[:, 1].sum()     # true positives / predicted positives
rec = true_positives / cm[1].sum()         # true positives / actual positives
acc, prec, rec

(0.5, 0.5, 1.0)