In [1]:
import os
import pandas as pd
import time

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [16]:
import model_helper_code as hc

In [None]:
# Load the merged dataset. The path is relative, so it resolves against the
# notebook's current working directory.
_csv_parts = ('..', 'data', 'merged_output.csv')
datasets = os.path.join(*_csv_parts)
df = pd.read_csv(filepath_or_buffer=datasets)

In [None]:
# Separate the comma-delimited statement codes into one binary indicator
# column per distinct code.
def _split_codes(value):
    """Return one row's statement codes as a list of strings.

    Strings are split on ','. Values that are already a list/tuple/set are
    passed through as a list, which keeps this cell safe to re-run after
    df['Statement_codes'] has been overwritten with lists. Anything else
    (e.g. a missing value) yields an empty list, i.e. a row with no codes.
    """
    if isinstance(value, str):
        return value.split(',')
    if isinstance(value, (list, tuple, set)):
        return list(value)
    return []


df['Statement_codes'] = df['Statement_codes'].map(_split_codes)
mlb = MultiLabelBinarizer()
features = mlb.fit_transform(df['Statement_codes'])
# Reuse df's index so the feature rows stay aligned with the label column
# taken from df later on.
matrix = pd.DataFrame(features, columns=mlb.classes_, index=df.index)

In [None]:
# Partition features and labels into train/test sets via the project helper.
# NOTE(review): 0.33 is presumably the test fraction (the printed sample counts
# are consistent with that) — confirm against hc.split_dataset.
_split_arg = 0.33
_labels = df['MI_Phys']
X_train, X_test, y_train, y_test = hc.split_dataset(matrix, _labels, _split_arg)

Training datasets: 29981 samples
Testing datasets: 14767 samples


In [7]:
# autoencoder
# Seed Python, NumPy and the Keras backend before any layer is created so that
# weight initialisation and the shuffled training batches repeat across
# Restart & Run All. (Bit-exact repeatability on GPU may additionally require
# deterministic ops.)
AUTOENCODER_SEED = 42
keras.utils.set_random_seed(AUTOENCODER_SEED)

start_time = time.time()
input_dim = X_train.shape[1]  # number of feature columns fed to the network
# TODO: 10 is an untuned bottleneck width; choose it by comparing
# reconstruction loss and downstream classifier scores across several sizes.
encoding_dim = 10

# One hidden layer: input_dim -> encoding_dim (ReLU) -> input_dim (sigmoid).
input_layer = keras.Input(shape=(input_dim,))
encoded = layers.Dense(encoding_dim, activation='relu')(input_layer)
decoded = layers.Dense(input_dim, activation='sigmoid')(encoded)

# Full model is trained on reconstruction; the encoder shares its layers and
# is used afterwards for feature extraction.
autoencoder = keras.Model(input_layer, decoded)
encoder = keras.Model(input_layer, encoded)  # feature extraction

In [8]:
# train model
# Start the clock in the same cell that stops it, so the elapsed time excludes
# idle time between cell executions and stays correct when only this cell is
# re-run. perf_counter() is the clock intended for measuring durations.
start_time = time.perf_counter()

autoencoder.compile(optimizer='adam', loss='mse')
# The autoencoder reconstructs its own input, so X serves as input and target.
# NOTE(review): the test split doubles as validation data. No callback acts on
# val_loss here, so it only affects the logged metric, but validating on a
# slice of the training data would keep the test set untouched — confirm intent.
autoencoder.fit(
    X_train,
    X_train,
    epochs=50,
    batch_size=32,
    shuffle=True,
    validation_data=(X_test, X_test),
)

# features extraction: project both splits through the trained encoder
X_train_encoded = encoder.predict(X_train)
X_test_encoded = encoder.predict(X_test)
# Stopped after encoding, so the measured span covers training plus encoding.
end_time = time.perf_counter()

Epoch 1/50
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 596us/step - loss: 0.0993 - val_loss: 0.0142
Epoch 2/50
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 533us/step - loss: 0.0127 - val_loss: 0.0104
Epoch 3/50
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 540us/step - loss: 0.0097 - val_loss: 0.0084
Epoch 4/50
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 533us/step - loss: 0.0079 - val_loss: 0.0070
Epoch 5/50
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 533us/step - loss: 0.0067 - val_loss: 0.0060
Epoch 6/50
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 532us/step - loss: 0.0057 - val_loss: 0.0054
Epoch 7/50
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 510us/step - loss: 0.0052 - val_loss: 0.0049
Epoch 8/50
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 503us/step - loss: 0.0047 - val_loss: 0.0045
Epoch 9/50
[1m937/937[

In [9]:
# Wall-clock seconds between start_time and end_time; the span ends after the
# encoder predictions, so it covers autoencoder training and feature encoding.
elapsed_seconds = end_time - start_time
elapsed_seconds

26.640946865081787

In [10]:
# Fit an RBF-kernel SVM on the encoded training features.
# class_weight='balanced' reweights classes by inverse frequency;
# probability=True additionally fits probability estimates during training.
_svm_options = {
    'kernel': 'rbf',
    'class_weight': 'balanced',
    'probability': True,
}
svm_model = SVC(**_svm_options)
svm_model.fit(X_train_encoded, y_train)


In [18]:
# Score the fitted SVM on the encoded test features and print per-class
# precision / recall / F1.
y_pred = svm_model.predict(X_test_encoded)
_report_text = classification_report(y_test, y_pred)
print('SVM Classification Report:', _report_text, sep='\n')

SVM Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.89      0.94     14707
           1       0.01      0.42      0.03        60

    accuracy                           0.88     14767
   macro avg       0.51      0.65      0.48     14767
weighted avg       0.99      0.88      0.93     14767

