In [244]:
# Useful imports and setup
import sys
import os
# Make the project's ../scripts/ directory importable from this notebook.
module_path = os.path.abspath(os.path.join('..'))
scripts_path = os.path.join(module_path, 'scripts')
# Guard on the exact entry that gets appended: testing `module_path` while
# appending the scripts directory would add a duplicate on every re-run and
# would skip the append entirely if the parent directory were already listed.
if scripts_path not in sys.path:
    sys.path.append(scripts_path)
    

import numpy as np
import pandas as pd
import tensorflow as tf

from preprocessing import preprocess
from cluster import *
from baseline import baseline_model, train_model, train_val_split

# Enable IPython's autoreload extension (mode 2) for this kernel.
# NOTE(review): re-running this cell prints "The autoreload extension is
# already loaded"; `%reload_ext autoreload`, as that notice suggests, avoids it.
%load_ext autoreload
%autoreload 2

# --- Output locations -------------------------------------------------------
# Prefix under which the per-cluster classifier weights are saved.
MODEL_PATH = '../models/saved/'

# Cluster labels are stored in a sub-directory named after the clustering model.
cluster_model = 'kmeans'
labels_path = '../labels/'
labels_cluster_path = f'{labels_path}{cluster_model}/'

# --- Input data -------------------------------------------------------------
# Each course has its own directory under data_path.
course = 'dsp_001'
data_path = '../data/'
path = f'{data_path}{course}/'

# Feature sets handed to preprocess().
feature_types = [
    'lalle_conati',
    'boroujeni_et_al',
    'chen_cui',
    'marras_et_al',
]
# Metadata table read from the top-level data directory.
metadata = pd.read_csv(f'{data_path}metadata.csv')
# Path to the selected course's feature_labels.csv; passed to preprocess().
hard_fail = f'{path}feature_labels.csv'
# Percentile setting passed to preprocess().
percentile = 0.6

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Preprocess data

In [225]:
# Run the project's preprocessing for the selected course. The call yields the
# train / test / validation features and labels, plus the feature names and
# the patterns.
(
    x_train, x_test, x_val,
    y_train, y_test, y_val,
    feature_names,
    patterns,
) = preprocess(
    course,
    path,
    percentile,
    feature_types,
    metadata,
    hard_fail,
)

### Baseline cluster

In [226]:
# Stack the train / validation / test splits back into single arrays so that
# clustering is run over every sample.
X = np.concatenate([x_train, x_val, x_test], axis=0)
Y = np.concatenate([y_train, y_val, y_test], axis=0)
P = np.concatenate(patterns, axis=0)

# Collapse all trailing axes of each sample into one flat feature vector.
# Done with NumPy rather than tf.reshape so the result stays an ndarray (what
# scikit-learn expects) instead of a TensorFlow tensor. `-1` lets NumPy infer
# the flattened width from the remaining axes.
X_flatten = X.reshape(X.shape[0], -1)
print("X_flatten shape: {0}".format(X_flatten.shape))

X_flatten shape: (5611, 270)


In [227]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Evaluate clusterings of the flattened features using the kmeans_ factory and
# the silhouette score. The cell output lists one (score, k) pair per candidate
# k, here k = 2..10.
# NOTE(review): `score, n` are presumably the best silhouette score and its
# number of clusters — confirm against compute_number_clusters.
score, n = compute_number_clusters(X_flatten, kmeans_, silhouette_score)

(0.12742339709773967, 2)
(0.13441100656217003, 3)
(0.12009081886613819, 4)
(0.11131791149756463, 5)
(0.11587989624886949, 6)
(0.12433508706624707, 7)
(0.12502867152972835, 8)
(0.13177933338522793, 9)
(0.13067800476772654, 10)


In [228]:
# Build the clustering model for the n clusters returned by
# compute_number_clusters and fit it on the flattened features.
model = kmeans_(n)
result = model.fit(X_flatten)
# Cluster assignment read from the fitted model; used below to select the
# samples of each cluster (presumably one integer id per row of X_flatten).
labels = result.labels_

#### Save cluster labels

In [246]:
# Create the output directory if needed. exist_ok=True makes this a single,
# idempotent call instead of a separate existence check followed by a create.
os.makedirs(labels_cluster_path, exist_ok=True)

# Write one integer cluster label per line.
np.savetxt(os.path.join(labels_cluster_path, 'baseline_cluster_labels.txt'), labels, fmt='%d')

### Baseline classification for each cluster

A separate baseline BiLSTM model is trained on the samples of each cluster.

In [243]:
# Train and save one baseline classifier per cluster.

# Training settings are identical for every cluster, so build them once
# instead of on every iteration.
baseline_params = {
    'name': 'baseline-32u-1l',
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
    'metrics': ['binary_accuracy'],
    'epochs': 20,
    'batch_size': 64,
    'verbose': 1
}

# Make sure the directory that receives the weights exists.
os.makedirs(MODEL_PATH, exist_ok=True)

for c in range(np.max(labels)+1):
    print('\n\nTraining predictions for cluster {0}'.format(c))

    # Select the samples (and their labels) assigned to cluster c.
    idx = np.where(labels == c)[0]
    X_train = X[idx]
    Y_train = Y[idx]

    # Fresh model per cluster so no weights carry over between clusters.
    baseline = baseline_model()

    # Cluster-specific names: reusing x_train / x_val / y_train / y_val here
    # would overwrite the notebook-level splits returned by preprocess(), and
    # re-running the concatenation cell afterwards would silently use one
    # cluster's data.
    x_train_c, x_val_c, y_train_c, y_val_c = train_val_split(X_train, Y_train)
    train_model(baseline, x_train_c, y_train_c, x_val_c, y_val_c, baseline_params)
    baseline.save_weights(MODEL_PATH + 'baseline_classifier_for_cluster_'+str(c))

Training prediction for cluster 0
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training prediction for cluster 1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training prediction for cluster 2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
