In [None]:
import numpy as np
import scipy
from scipy import signal
import mne
import glob
from sklearn.decomposition import PCA
import csv

In [None]:
# Path to one example EEGLAB .set file; it is only used below to read the channel names.
local_path = './../../../data/raw/HV/HV1/HV1_F1_L_Removed_ICA.set'


In [None]:
# All datasets must have the same channels!
# The names are read from the single example file above; nothing in this
# notebook checks that the other recordings actually share them.
channel_names = mne.io.read_epochs_eeglab(local_path).ch_names

In [None]:
# Component count written as `n_components` into the SVM result logs and used by
# the commented-out PCA step inside classify_svm_with_xvalid.
# NOTE(review): pca() below defaults to 5 components, not this value.
pca_channels = 3

def pca(channels, n_components=5):
    """Fit a fresh PCA on `channels` and return the transformed data.

    Parameters
    ----------
    channels : array-like, 2-D
        Passed straight to ``PCA.fit_transform``.
    n_components : int, default 5
        Number of principal components to keep.

    Returns
    -------
    The result of ``fit_transform``; a new model is fitted on every call.
    """
    reducer = PCA(n_components=n_components)
    reduced = reducer.fit_transform(channels)
    return reduced

In [None]:
def log_result(file, name, accuracy, patients_correct, patients_total, set_name, channels, notes):
    """Append one result row to the CSV stream `file`.

    The row holds: classifier name, accuracy as a percentage string,
    " correct/total" patients (with a leading space), data-set name,
    the channel description and free-form notes.
    """
    accuracy_cell = str(accuracy * 100) + '%'
    patients_cell = ' ' + '/'.join((str(patients_correct), str(patients_total)))
    row = [name, accuracy_cell, patients_cell, set_name, channels, notes]
    csv.writer(file).writerow(row)
        

# Read and process data sets, apply dimensionality reduction

In [None]:
def get_datasets(patient_type_location, recording_type_expression):
    """Load every recording matching a glob pattern and return the epoch data.

    Parameters
    ----------
    patient_type_location : str
        Glob pattern for the patient folders (e.g. ``location_pain``).
    recording_type_expression : str
        Glob pattern for the recording files (e.g. ``rh``). The module-level
        ``suffix`` is appended unless the pattern is ``l_new``, which already
        ends in ``.set``.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the ``_data`` of each loaded epochs object,
        one entry per matched file, in sorted path order.
    """
    pattern = patient_type_location + recording_type_expression
    if recording_type_expression != l_new:
        pattern += suffix

    # glob returns paths in arbitrary order; sorting keeps patient indices
    # (e.g. the held-out test index) stable across runs and machines.
    sets_locations = sorted(glob.glob(pattern))

    # Each file is parsed exactly once; reading EEGLAB sets is the slow part.
    patients = [mne.io.read_epochs_eeglab(path)._data for path in sets_locations]
    return np.array(patients)

In [None]:
# Root of the repository relative to this notebook, and the suffix appended
# to recording patterns by get_datasets.
root = './../../../'
suffix = '*.set'

# Glob patterns for the per-patient folders of each group under data/raw.
location_healthy = root + 'data/raw/HV/*/'
location_pain = root + 'data/raw/PP/*/'
location_nopain = root + 'data/raw/PnP/*/'

# Glob patterns for the per-patient folders of each group under data_new/raw.
location_pwp = root + 'data_new/raw/PwP/*/'
location_pdp = root + 'data_new/raw/PdP/*/'
location_pnp = root + 'data_new/raw/PnP/*/'


# Glob patterns selecting the recording type by file name.
rh = '*_RH*'
lh = '*_LH*'
l_new = '*_L.set'   # already ends in .set: get_datasets does NOT append `suffix` to it
l_old = '*_L_*'

# Quick check of which files a pattern matches.
# NOTE(review): despite the name, this lists data_new PnP files matching l_new.
sets_healthy_rh = glob.glob(location_pnp + l_new)

sets_healthy_rh

In [None]:
# Load the recordings matching the `rh` pattern for the PP (pain) and PnP (no pain) folders.
pp_rh_raw = get_datasets(location_pain, rh)
pnp_rh_raw = get_datasets(location_nopain, rh)

In [None]:
# Inspect the loaded PP data.
# NOTE(review): this displays the whole array; `.shape` would be a lighter check.
pp_rh_raw

In [None]:
# Shape of the data loaded for the PP entry at index 4.
pp_rh_raw[4].shape

In [None]:

# Run pca() (default n_components) independently on every trial of every patient.
pp_rh = np.array([np.array(list(map(pca, patient))) for patient in pp_rh_raw])
pnp_rh = np.array([np.array(list(map(pca, patient))) for patient in pnp_rh_raw])


# Alternative: skip the PCA step and work on the raw arrays.
#pp_rh = pp_rh_raw
#pnp_rh = pnp_rh_raw

In [None]:
# Total number of rows obtained when all patients of a group are stacked.
pp_count, pnp_count = (len(np.vstack(group)) for group in (pp_rh, pnp_rh))
pnp_count

In [None]:
# Shape of the PCA-transformed PnP array.
pnp_rh.shape

Set some patients aside for testing

In [None]:
# Concatenate both groups, PP first and PnP second; test_setup relies on this
# ordering to decide the label of a patient index.
pp_and_pnp_bp = np.concatenate((pp_rh, pnp_rh))
pp_and_pnp_bp.shape

In [None]:
# Returns a pair consisting of boolean (True is test patient is PP) and test label
def test_setup(test_index, total_size):
    test_is_pp = test_index < len(pp_rh)
    test_label = 1 if test_is_pp else 0
    return test_is_pp, test_label

def get_train_test(data, test_index):
    """Split `data` into the held-out entry and the remaining entries.

    Parameters
    ----------
    data : numpy.ndarray
        One entry per patient along the first axis.
    test_index : int
        Index of the patient to hold out.

    Returns
    -------
    (test, train)
        ``data[test_index]`` and a copy of `data` without that entry.
    """
    # axis=0 is required: without it np.delete flattens the input first, which
    # destroys the per-patient structure whenever `data` is not 1-D.
    return data[test_index], np.delete(data, test_index, axis=0)

def get_pp_pnp_length(pp_count, pnp_count, test_count, test_is_pp):
    """Return ``(pp_train_len, pnp_train_len)`` after removing the test trials.

    The `test_count` held-out trials are subtracted from the PP total when the
    test patient is PP, otherwise from the PNP total.
    """
    if test_is_pp:
        return pp_count - test_count, pnp_count
    return pp_count, pnp_count - test_count

# Ravel first dimention so that trials from all patients are treated separately; select channels
def ravel_all_trials(data, channels):
    return np.array(list(map(np.ravel, data[:, channels, :])))

In [None]:
# Index (in the concatenated PP + PnP array) of the patient held out for testing.
test_index = 4

# The second argument is the number of PP patients.
test_is_pp, test_label = test_setup(test_index, len(pp_rh))
test_label

In [None]:
# Separate the held-out patient from the remaining (training) patients.
test_p, train_p = get_train_test(pp_and_pnp_bp, test_index)
test_p.shape

In [None]:
# Stack the training patients along the first axis so every trial becomes its own row.
train_p_separated = np.vstack(train_p)
train_p_separated.shape

In [None]:
# Number of PP and PnP trials left for training once the test patient's trials are removed.
pp_train_len, pnp_train_len = get_pp_pnp_length(pp_count, pnp_count, len(test_p), test_is_pp)
pp_train_len

In [None]:
# Scalar factor that the feature matrices are multiplied by (1 = no scaling).
mul = 1

In [None]:
# Channel indices (axis 1 of the data) used as features in the manual run below.
selected_channels = [10, 11]

In [None]:
# Training feature matrix: one flattened row per trial, restricted to the selected channels.
train = ravel_all_trials(train_p_separated, selected_channels) * mul
train.shape

In [None]:
# Test feature matrix built the same way from the held-out patient's trials.
test = ravel_all_trials(test_p, selected_channels) * mul
test.shape

In [None]:
# Inspect the training matrix.
# NOTE(review): this displays the whole array; `train[:5]` would be a lighter check.
train

In [None]:
# Training labels follow the row order of `train`: PP trials (1) first, then PnP trials (0).
labels = [1] * pp_train_len + [0] * pnp_train_len
# Every trial of the held-out patient carries that patient's label.
test_labels = [test_label] * len(test)

It's time to learn

In [None]:
from sklearn import neighbors, svm
from sklearn.model_selection import train_test_split

In [None]:
# KNN classifier for the manual run; the neighbour count is hard-coded here.
knn = neighbors.KNeighborsClassifier(n_neighbors=130)

In [None]:
# Hold out 5% of the training trials as a validation split. The seed is fixed so
# that the scores below can be reproduced after a kernel restart.
# NOTE(review): the split is made per trial, so trials of one patient can land on
# both sides; the score on x_test is therefore not a patient-level estimate.
x_train, x_test, y_train, y_test = train_test_split(train, labels, test_size=0.05, random_state=42)

In [None]:
# Fit the classifier on the training part of the split.
knn.fit(x_train, y_train)

In [None]:
# Score on the data the classifier was fitted on.
knn.score(x_train, y_train)

In [None]:
# Score on the 5% of trials held out by train_test_split.
knn.score(x_test, y_test)

In [None]:
# Predicted label for every trial of the held-out patient.
knn.predict(test)

In [None]:
# Fraction of the held-out patient's trials whose prediction equals the patient's label.
np.count_nonzero(knn.predict(test) == test_labels)/len(test)

### Cross validation

In [None]:
def classify_knn_with_xvalid(data_pp_bp, data_pnp_bp, n_neighbours, selected_channels, test_index, mul, verbose=True):
    """Train KNN on all patients but one and score it on the held-out patient.

    Parameters
    ----------
    data_pp_bp, data_pnp_bp : numpy.ndarray
        Per-patient trial arrays for the PP and PNP groups.
    n_neighbours : int
        ``n_neighbors`` passed to ``KNeighborsClassifier``.
    selected_channels : list of int
        Channel indices (axis 1 of each trial stack) used as features.
    test_index : int
        Index of the held-out patient in the concatenated PP + PNP data.
    mul : number
        Scalar the feature matrices are multiplied by.
    verbose : bool, default True
        Print the split sizes and the train/test scores.

    Returns
    -------
    The classifier's score on the held-out patient's trials.
    """
    data_bp = np.concatenate((data_pp_bp, data_pnp_bp))

    test_is_pp, test_label = test_setup(test_index, len(data_pp_bp))
    test_p, train_p = get_train_test(data_bp, test_index)
    train_p_separated = np.vstack(train_p)

    # Count the trials from the arguments instead of the notebook-level
    # pp_count / pnp_count, so the labels always match the data passed in.
    pp_trials = sum(len(patient) for patient in data_pp_bp)
    pnp_trials = sum(len(patient) for patient in data_pnp_bp)
    pp_train_len, pnp_train_len = get_pp_pnp_length(pp_trials, pnp_trials, len(test_p), test_is_pp)

    train = ravel_all_trials(train_p_separated, selected_channels) * mul
    test = ravel_all_trials(test_p, selected_channels) * mul

    # Rows of `train` are ordered PP first, then PNP, so the labels are too.
    labels = [1] * pp_train_len + [0] * pnp_train_len
    test_labels = [test_label] * len(test)

    if verbose:
        print('Test index', test_index, 'Preparing to classify set of', pp_train_len, 'PP and', pnp_train_len, 'PNP.')

    clas = neighbors.KNeighborsClassifier(n_neighbors=n_neighbours)
    clas.fit(train, labels)
    train_acc = clas.score(train, labels)
    test_acc = clas.score(test, test_labels)

    if verbose:
        print('Train score:', train_acc, '  Test score:', test_acc)

    return test_acc
    
    

In [None]:
# Single leave-one-patient-out run: 23 neighbours, channels [0, 3, 10, 36],
# patient index 2 held out, features multiplied by 1e13.
classify_knn_with_xvalid(pp_rh, pnp_rh, 23, [0, 3, 10, 36], 2, 10000000000000)

#### Cross validate over the whole dataset

In [None]:
# Leave-one-patient-out evaluation of KNN (80 neighbours) on channel index 0.
n_patients = len(pp_and_pnp_bp)
patient_scores = []
for i in range(n_patients):
    score = classify_knn_with_xvalid(pp_rh, pnp_rh, 80, [0], i, mul)
    patient_scores.append(score)

# TODO log acc for each patient

total_score = sum(patient_scores)
# A patient counts as correctly labelled when more than half of their trials are.
patients_correct = sum(1 for s in patient_scores if s > 0.5)

print(total_score/n_patients)
print('Correctly labeled', patients_correct, 'out of', n_patients)

#### Cross validate over multiple channels

In [None]:
# Channel search for PCA + KNN: for every candidate channel, run leave-one-patient-out
# cross-validation on previous_channels + [candidate] and log the averaged result.
n_components = 5
name = 'PCA + KNN'
notes = 'k='
notes_c = ', n_components='

# NOTE(review): channel 47 appears twice, which duplicates its features — confirm intended.
previous_channels = [45, 47, 47, 51]
k = 90

max_acc = {'index': 0, 'value': 0}
# The context manager closes (and flushes) the CSV even if a run raises.
with open('all_results/pca_knn_results.csv', 'a', newline='') as file:
    # 61 is hard-coded; presumably the number of channels — TODO confirm against channel_names.
    for channel in range(61):
        total_score = 0
        correct_patients = 0
        for i in range(len(pp_and_pnp_bp)):
            # Pass the same k that is written to the log so the logged setting
            # is the one that was actually evaluated.
            score = classify_knn_with_xvalid(pp_rh, pnp_rh, k, previous_channels + [channel], i, mul, verbose=False)
            total_score += score
            if score > 0.5:
                correct_patients += 1

        avg_score = total_score/len(pp_and_pnp_bp)
        print(channel, avg_score, correct_patients)

        log_result(file, name, avg_score, correct_patients, len(pp_and_pnp_bp), 'RH', str(previous_channels + [channel]), notes + str(k) + notes_c + str(n_components))

        if avg_score > max_acc['value']:
            max_acc['index'] = channel
            max_acc['value'] = avg_score

print('Max accuracy:', max_acc['index'], max_acc['value'])

### Cross validate over multiple n_neighbours

In [None]:
# Sweep the number of neighbours for PCA + KNN on a fixed channel set, running
# leave-one-patient-out cross-validation for each value and logging the average.
n_components = 5
name = 'PCA + KNN'
notes = 'k='
notes_c = ', n_components='

channels = [45, 47, 47, 51]

max_acc = {'index': 0, 'value': 0}
# The context manager closes (and flushes) the CSV even if a run raises.
with open('all_results/pca_knn_results.csv', 'a', newline='') as file:
    for n_neighbours in range(70, 115, 1):
        total_score = 0
        correct_patients = 0
        for i in range(len(pp_and_pnp_bp)):
            # Pass the flat channel list; wrapping it in another list only added
            # a singleton axis that was flattened away again.
            score = classify_knn_with_xvalid(pp_rh, pnp_rh, n_neighbours, channels, i, mul, verbose=False)
            total_score += score
            if score > 0.5:
                correct_patients += 1

        avg_score = total_score/len(pp_and_pnp_bp)
        print(n_neighbours, avg_score, correct_patients)

        log_result(file, name, avg_score, correct_patients, len(pp_and_pnp_bp), 'RH', str(channels), notes + str(n_neighbours) + notes_c + str(n_components))

        if avg_score > max_acc['value']:
            max_acc['index'] = n_neighbours
            max_acc['value'] = avg_score

print('Max accuracy:', max_acc['index'], max_acc['value'])

In [None]:
# Make sure the results file handle is closed so all buffered rows reach disk.
file.close()

### SVM

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, not only
# those raised by the SVM cells below.
warnings.filterwarnings('ignore')

In [None]:
def classify_svm_with_xvalid(data_pp_bp, data_pnp_bp, nu, selected_channels, test_index, mul, verbose=True):
    """Train a linear NuSVC on all patients but one and score it on the held-out patient.

    Parameters
    ----------
    data_pp_bp, data_pnp_bp : numpy.ndarray
        Per-patient trial arrays for the PP and PNP groups.
    nu : float
        ``nu`` passed to ``svm.NuSVC``.
    selected_channels : list of int
        Channel indices (axis 1 of each trial stack) used as features.
    test_index : int
        Index of the held-out patient in the concatenated PP + PNP data.
    mul : number
        Scalar the feature matrices are multiplied by.
    verbose : bool, default True
        Print the split sizes and the train/test scores.

    Returns
    -------
    The classifier's score on the held-out patient's trials.
    """
    data_bp = np.concatenate((data_pp_bp, data_pnp_bp))

    test_is_pp, test_label = test_setup(test_index, len(data_pp_bp))
    test_p, train_p = get_train_test(data_bp, test_index)
    train_p_separated = np.vstack(train_p)

    # Count the trials from the arguments instead of the notebook-level
    # pp_count / pnp_count, so the labels always match the data passed in.
    pp_trials = sum(len(patient) for patient in data_pp_bp)
    pnp_trials = sum(len(patient) for patient in data_pnp_bp)
    pp_train_len, pnp_train_len = get_pp_pnp_length(pp_trials, pnp_trials, len(test_p), test_is_pp)

    train = ravel_all_trials(train_p_separated, selected_channels) * mul
    test = ravel_all_trials(test_p, selected_channels) * mul

    # Alternative: apply a second PCA to the flattened features.
    #train = pca(ravel_all_trials(train_p_separated, selected_channels) * mul, n_components=pca_channels)
    #test = pca(ravel_all_trials(test_p, selected_channels) * mul, n_components=pca_channels)

    # Rows of `train` are ordered PP first, then PNP, so the labels are too.
    labels = [1] * pp_train_len + [0] * pnp_train_len
    test_labels = [test_label] * len(test)

    if verbose:
        print('Test index', test_index, 'Preparing to classify set of', pp_train_len, 'PP and', pnp_train_len, 'PNP.')

    clas = svm.NuSVC(nu=nu, kernel='linear')
    clas.fit(train, labels)
    train_acc = clas.score(train, labels)
    test_acc = clas.score(test, test_labels)

    if verbose:
        print('Train score:', train_acc, '  Test score:', test_acc)

    return test_acc
    

In [None]:
# Leave-one-patient-out evaluation of the linear NuSVC (nu=0.7005) on a fixed channel list.
n_patients = len(pp_and_pnp_bp)
patient_scores = []
for i in range(n_patients):
    score = classify_svm_with_xvalid(pp_rh, pnp_rh, 0.7005, [9, 9, 10, 11, 31, 33, 33, 33, 33, 39, 53, 58, 58, 58], i, mul)
    patient_scores.append(score)

total_score = sum(patient_scores)
# A patient counts as correctly labelled when more than half of their trials are.
patients_correct = sum(1 for s in patient_scores if s > 0.5)

print(total_score/n_patients)
print('Correctly labeled', patients_correct, 'out of', n_patients)

In [None]:
# Channel search for PCA + SVM: for every candidate channel, run leave-one-patient-out
# cross-validation on previous_channels + [candidate] and log the averaged result.
# NOTE(review): the logged n_components is pca_channels, but the PCA step that uses
# it inside classify_svm_with_xvalid is commented out — confirm the logged value.
n_components = pca_channels
name = 'PCA + SVM'
notes = 'nu='
notes_c = ', n_components='

previous_channels=[4]
nu = 0.7

max_acc = {'index': 0, 'value': 0}
# The context manager closes (and flushes) the CSV even if a run raises.
with open('all_results/pca_svm_results.csv', 'a', newline='') as file:
    # 61 is hard-coded; presumably the number of channels — TODO confirm against channel_names.
    for channel in range(61):
        total_score = 0
        correct_patients = 0
        for i in range(len(pp_and_pnp_bp)):
            score = classify_svm_with_xvalid(pp_rh, pnp_rh, nu, previous_channels + [channel], i, mul, verbose=False)
            total_score += score
            if score > 0.5:
                correct_patients += 1

        avg_score = total_score/len(pp_and_pnp_bp)
        print(channel, avg_score, correct_patients)

        log_result(file, name, avg_score, correct_patients, len(pp_and_pnp_bp), 'RH', str(previous_channels + [channel]), notes + str(nu) + notes_c + str(n_components))

        if avg_score > max_acc['value']:
            max_acc['index'] = channel
            max_acc['value'] = avg_score

print('Max accuracy:', max_acc['index'], max_acc['value'])

In [None]:
# Exhaustive search over ordered channel pairs for PCA + SVM: each pair is evaluated
# with leave-one-patient-out cross-validation and the averaged result is logged.
# NOTE(review): both (a, b) and (b, a) as well as (a, a) are evaluated.
n_components = pca_channels
name = 'PCA + SVM'
notes = 'nu='
notes_c = ', n_components='

previous_channels=[]
nu = 0.7005

max_acc = {'index': 0, 'value': 0}
# The context manager closes (and flushes) the CSV even if a run raises.
with open('all_results/pca_svm_results.csv', 'a', newline='') as file:
    for channel_1 in range(61):
        for channel_2 in range(61):
            total_score = 0
            correct_patients = 0
            for i in range(len(pp_and_pnp_bp)):
                score = classify_svm_with_xvalid(pp_rh, pnp_rh, nu, previous_channels + [channel_1, channel_2], i, mul, verbose=False)
                total_score += score
                if score > 0.5:
                    correct_patients += 1

            avg_score = total_score/len(pp_and_pnp_bp)
            print(channel_1, channel_2, avg_score, correct_patients)

            log_result(file, name, avg_score, correct_patients, len(pp_and_pnp_bp), 'RH', str(previous_channels + [channel_1, channel_2]), notes + str(nu) + notes_c + str(n_components))

            if avg_score > max_acc['value']:
                max_acc['index'] = channel_1, channel_2
                max_acc['value'] = avg_score

print('Max accuracy:', max_acc['index'], max_acc['value'])

In [None]:
# Sweep nu for PCA + SVM on a fixed channel set, running leave-one-patient-out
# cross-validation for each value and logging the averaged result.
n_components = pca_channels
name = 'PCA + SVM'
notes = 'nu='
notes_c = ', n_components='

channels=[4]

max_acc = {'index': 0, 'value': 0}
# The context manager closes (and flushes) the CSV even if a run raises.
with open('all_results/pca_svm_results.csv', 'a', newline='') as file:
    for nu in np.arange(0.1, 0.8, 0.01):
        total_score = 0
        correct_patients = 0
        for i in range(len(pp_and_pnp_bp)):
            score = classify_svm_with_xvalid(pp_rh, pnp_rh, nu, channels, i, mul, verbose=False)
            total_score += score
            if score > 0.5:
                correct_patients += 1

        avg_score = total_score/len(pp_and_pnp_bp)
        print(nu, avg_score, correct_patients)

        # Log the channel list that was actually evaluated, not the loop variable
        # `channel` left over from an earlier cell.
        log_result(file, name, avg_score, correct_patients, len(pp_and_pnp_bp), 'RH', str(channels), notes + str(nu) + notes_c + str(n_components))

        if avg_score > max_acc['value']:
            max_acc['index'] = nu
            max_acc['value'] = avg_score

print('Max accuracy:', max_acc['index'], max_acc['value'])