# Classify frames using (start, end) labels: start = (1, 0), mid = (0, 0), end = (0, 1)


In [1]:
import sys
import pandas as pd
import numpy as np

## 1. Define the dataset reader

In [2]:
dirname = '../../datasets/marked_start_end/'
suffix = '.csv'

def read_from_file(file_name, rand_num):
    """Load one marked recording and split it into inputs and targets.

    The CSV is looked up at ``dirname + file_name + str(rand_num) + suffix``.
    Its ``FrameNo`` column is discarded, the ``start`` and ``end`` columns
    become the targets, and every remaining column becomes an input.

    Parameters
    ----------
    file_name : str
        Series prefix of the file name, e.g. ``"A"``.
    rand_num : int
        Number appended to the prefix, e.g. ``1`` for ``A1.csv``.

    Returns
    -------
    tuple
        ``(input_data, target_data, full_file_name)`` on success. When an
        ``IOError`` is raised (for example the file does not exist) the
        error is printed and ``(None, None, None)`` is returned instead.
    """
    label_columns = ['start', 'end']

    try:
        stem = file_name + str(rand_num)
        csv_path = "".join((dirname, stem, suffix))

        # FrameNo is dropped right after loading, so it ends up in neither
        # the inputs nor the targets.
        frame = pd.read_csv(csv_path).drop(columns=['FrameNo'])

        features = frame.drop(columns=label_columns)
        labels = frame[label_columns]
        return features, labels, stem

    except IOError as err:
        # Best effort: report the problem and let the caller skip this file.
        print(err)
        return None, None, None

## 2. Read files

### 2.1 The A series (A1-A159)

In [3]:
# Read A1 on its own first so the shape of a single recording is reported
# before the rest of the series is loaded.
X, y, full_file_name = read_from_file("A", 1)

#print(full_file_name)
print(X.shape)
print(y.shape)

# Collect every recording and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the whole accumulated frame on every iteration.
input_frames = [X]
target_frames = [y]

for i in range(2,160):
    input_data, target_data, full_file_name = read_from_file("A", i)

    # read_from_file prints the error and returns (None, None, None) for a
    # file it could not open; such files are simply skipped.
    if full_file_name is None:
        continue

    #print(full_file_name)
    input_frames.append(input_data)
    target_frames.append(target_data)

X = pd.concat(input_frames, ignore_index=True)
y = pd.concat(target_frames, ignore_index=True)

print(X.shape)
print(y.shape)

(229, 26)
(229, 2)
[Errno 2] No such file or directory: '../../datasets/marked_start_end/A60.csv'
[Errno 2] No such file or directory: '../../datasets/marked_start_end/A107.csv'
(33093, 26)
(33093, 2)


### 2.2 The B series (B1-B22)

In [4]:
# Gather the B-series recordings, then extend X / y with a single concat.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the whole accumulated frame on every iteration.
b_input_frames = []
b_target_frames = []

for i in range(1,23):
    input_data, target_data, full_file_name = read_from_file("B", i)

    # read_from_file prints the error and returns (None, None, None) for a
    # file it could not open; such files are simply skipped.
    if full_file_name is None:
        continue

    #print(full_file_name)
    b_input_frames.append(input_data)
    b_target_frames.append(target_data)

# NOTE: like the original cell, this extends the existing X / y, so running
# the cell twice adds the B series twice.
X = pd.concat([X] + b_input_frames, ignore_index=True)
y = pd.concat([y] + b_target_frames, ignore_index=True)

print(X.shape)
print(y.shape)

(38488, 26)
(38488, 2)


In [5]:
#start_1_0, none_0_0, end_0_1 = np.bincount(y['start','end'])
#print(start_1_0)
#print(none_0_0)
#print(end_0_1)

In [6]:
#from sklearn.utils import shuffle
#X, y = shuffle(X,y)

In [7]:
from sklearn.model_selection import train_test_split

# 80 % of the rows go to training, the rest to testing. The fixed
# random_state makes the shuffle reproducible, and the same row selection is
# applied to both X and y so inputs and targets stay aligned.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=5,
)

### Normalization
Normalize the input features using the sklearn StandardScaler. This will set the mean to 0 and standard deviation to 1.

In [8]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training inputs only and reuse those statistics for
# the test inputs. Re-fitting on the test split would scale the two splits
# with different means / standard deviations and let test-set statistics
# influence the evaluation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The start / end targets are class labels for a sigmoid output trained with
# binary cross-entropy, so they must keep their original values; standardizing
# them would turn them into arbitrary real numbers. They are only converted to
# float arrays so that, as before, all four splits are NumPy arrays.
y_train = np.asarray(y_train, dtype='float32')
y_test = np.asarray(y_test, dtype='float32')

In [9]:
import tensorflow as tf
from tensorflow import keras

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [10]:
# Metrics reported by every compiled model. The order matters: it fixes the
# order of the values returned by model.evaluate after the loss.
METRICS = [
    # Confusion-matrix counts.
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    # Threshold-based summaries.
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    # Area under the ROC curve and under the precision-recall curve.
    keras.metrics.AUC(name='auc'),
    keras.metrics.AUC(name='prc', curve='PR'),
]

In [11]:
def make_model(metrics=METRICS, output_bias=None):
    """Build and compile the dense start/end classifier.

    Parameters
    ----------
    metrics : list
        Keras metrics handed to ``compile``; defaults to the module-level
        ``METRICS`` list.
    output_bias : optional
        When given, it is wrapped in a ``Constant`` initializer and used for
        the bias of the output layer.

    Returns
    -------
    keras.Sequential
        Compiled model with layers Dense(26, relu) -> Dense(64, relu) ->
        Dense(2, sigmoid), the Adam optimizer (learning rate 0.001) and a
        binary cross-entropy loss.
    """
    bias_init = None
    if output_bias is not None:
        bias_init = tf.keras.initializers.Constant(output_bias)

    # The input width is read from the global X_train, so the training data
    # must already exist when this function is called.
    n_features = X_train.shape[-1]

    layer_stack = [
        keras.layers.Dense(units=26, input_dim=n_features, activation='relu'),
        keras.layers.Dense(units=64, activation='relu'),
        # NOTE(review): when output_bias is None the layer receives
        # bias_initializer=None rather than its default initializer; confirm
        # which initializer Keras falls back to in that case.
        keras.layers.Dense(units=2, activation='sigmoid', bias_initializer=bias_init),
    ]
    model = keras.Sequential(layer_stack)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics,
    )

    return model

In [12]:
# Training hyper-parameters.
EPOCHS = 100
BATCH_SIZE = 2048

# Stop training when 'val_prc' has not improved (higher is better) for 10
# epochs, and roll the model back to the best weights seen.
# 'val_prc' is presumably the validation value of the 'prc' metric declared
# in METRICS; verify against the fit call that uses this callback.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_prc',
    mode='max',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

In [13]:
# Build the classifier with the default metrics and no explicit output bias,
# then print its layer / parameter summary.
model = make_model()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 26)                702       
_________________________________________________________________
dense_1 (Dense)              (None, 64)                1728      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 2,560
Trainable params: 2,560
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Sanity check: run the model on the first 10 training rows. No fit call
# appears in the cells above, so these are the outputs of the initial weights.
model.predict(X_train[:10])

array([[0.6377045 , 0.3679998 ],
       [0.6237715 , 0.40859282],
       [0.64679104, 0.49138916],
       [0.6591388 , 0.5032412 ],
       [0.6541595 , 0.46713763],
       [0.6448782 , 0.4256291 ],
       [0.6206098 , 0.5545128 ],
       [0.6250956 , 0.4454196 ],
       [0.6345241 , 0.47593468],
       [0.64951164, 0.48962566]], dtype=float32)

In [15]:
# Evaluate the model on the training split; the first entry of the returned
# list is the loss, the remaining entries are the compiled metrics.
results = model.evaluate(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    verbose=0,
)
loss_message = "Loss: {:0.4f}".format(results[0])
print(loss_message)

Loss: 0.8140
