## Data Munging

In [1]:
from sklearn.model_selection import train_test_split
from scipy.io import wavfile
import pandas as pd
import numpy as np

In [2]:
# Notebook-wide configuration: dataset root, expected WAV sample rate, and the
# class vocabulary with its name -> index lookup.
INPUT_LIB = '/Users/qiller/Downloads/MSBA 7011/Final pro/heartbeat-sounds/'
SAMPLE_RATE = 44100
CLASSES = ['artifact', 'normal', 'extrahls', 'murmur']
NB_CLASSES = len(CLASSES)
# Each class name maps to its position in CLASSES.
CODE_BOOK = dict(zip(CLASSES, range(NB_CLASSES)))

In [3]:
def clean_filename(fname, string):
    """Return the second '/'-separated component of ``fname``.

    When that component begins with two underscores, ``string`` is prepended
    to it. Raises IndexError if ``fname`` contains no '/'.
    """
    component = fname.split('/')[1]
    return string + component if component.startswith('__') else component

def load_wav_file(name, path):
    """Read the WAV file at ``path + name`` and return its sample data.

    The file's sample rate must equal SAMPLE_RATE; the assert fails otherwise.
    """
    rate, samples = wavfile.read(path + name)
    assert rate == SAMPLE_RATE
    return samples

def repeat_to_length(arr, length):
    """Repeat a 1-D array end-to-end until it is exactly ``length`` items long.

    Parameters
    ----------
    arr : 1-D array-like
        Values to repeat. Must be non-empty when ``length`` is positive.
    length : int
        Number of items in the result. If it is smaller than ``len(arr)``,
        the result is the first ``length`` items of ``arr``.

    Returns
    -------
    numpy.ndarray
        A new float32 array of shape ``(length,)``.

    Raises
    ------
    ValueError
        If ``arr`` is not one-dimensional, or is empty while ``length`` > 0
        (an empty input can never fill the output; looping on it would never
        terminate).
    """
    source = np.asarray(arr, dtype='float32')
    if source.ndim != 1:
        raise ValueError('repeat_to_length expects a 1-D array, got %d dimensions' % source.ndim)
    if source.size == 0 and length > 0:
        raise ValueError('cannot repeat an empty array to a positive length')
    # np.resize fills the new array with repeated copies of the source and
    # truncates the final copy, which is exactly the tiling wanted here.
    return np.resize(source, length)

def change_filename(fname, string='Aunlabelledtest'):
    """Return the second '/'-separated component of ``fname``.

    Components that start with two underscores get ``string`` prepended
    (default ``'Aunlabelledtest'``). Same logic as ``clean_filename``, but
    with a default prefix. Raises IndexError if ``fname`` contains no '/'.
    """
    second = fname.split('/')[1]
    if not second.startswith('__'):
        return second
    return string + second

In [40]:
# Read the set_a index file and rewrite each fname entry with clean_filename.
df = pd.read_csv(INPUT_LIB + 'set_a.csv')
df['fname'] = df['fname'].apply(lambda f: clean_filename(f, string='Aunlabelledtest'))
# NOTE: fillna returns a new Series and the result is not assigned, so
# df['label'] still holds its missing values after this line. The label
# filtering later in the notebook runs against that state.
df['label'].fillna('unclassified')
# Load each recording from Clean_data/ (the low-pass-filtered wav files).
clean_dir = INPUT_LIB + 'Clean_data/'
df['time_series'] = df['fname'].apply(lambda f: load_wav_file(f, path=clean_dir))
df['len_series'] = df['time_series'].map(len)
# Bring every recording to the length of the longest one by repeating it.
MAX_LEN = int(df['len_series'].max())
df['time_series'] = df['time_series'].apply(lambda s: repeat_to_length(s, length=MAX_LEN))

print(df['time_series'].values)

[array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32)
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32)
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32)
 array([  -6.,   -6.,   -4., ..., -183., -183., -185.], dtype=float32)
 array([ -30.,  -31.,  -30., ..., -126., -126., -124.], dtype=float32)
 array([  0.,   0.,   0., ...,  22.,  27.,  30.], dtype=float32)
 array([   2.,    1.,    1., ...,  192.,  185.,  177.], dtype=float32)
 array([ 79.,  73.,  65., ...,   6.,  13.,  20.], dtype=float32)
 array([ 13.,  12.,  11., ..., -16., -17., -18.], dtype=float32)
 array([ 1138.,  1107.,  1074., ...,  1152.,  1191.,  1208.], dtype=float32)
 array([  0.,   0.,   0., ..., -51., -58., -62.], dtype=float32)
 array([ 63.,  59.,  55., ...,   0.,   0.,   0.], dtype=float32)
 array([ 876.,  891.,  907., ...,    7.,    7.,    6.], dtype=float32)
 array([ 2274.,  1822.,  1335., ...,  -481.,  -817., -1158.], dtype=float32)
 array([ 2.,  2.,  2., ...,  3.,  5.,  6.], dtype=float32)
 

## Split into training and testing sets

In [41]:
# Stack the per-recording arrays into one 2-D array, one row per recording.
x_data = np.stack(list(df['time_series']), axis=0)

In [42]:
from sklearn import preprocessing

# Encode the label column as integer class codes. Each code is the position of
# its class in le.classes_, so look the mapping up there rather than assuming
# fixed numbers.
le = preprocessing.LabelEncoder()
labels = np.array(le.fit_transform(df['label'].tolist()), dtype='int')

# Rows with no usable label: either still missing in df['label'] or already
# filled with the 'unclassified' placeholder. Deriving the mask from the label
# text (instead of a hard-coded integer code) keeps the filter correct
# whichever code the encoder happens to assign to those rows.
unlabelled = (df['label'].isnull() | (df['label'] == 'unclassified')).values

# Keep only the labelled recordings, preserving row order.
nx_data = []
new_labels = []
for series, code, skip in zip(x_data, labels, unlabelled):
    if not skip:
        new_labels.append(code)
        nx_data.append(series)
print(new_labels)
print(len(new_labels))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
124


In [46]:
from sklearn.model_selection import train_test_split

# Random 75/25 train/test split of the labelled recordings, without stratification.
unstratified_split = train_test_split(nx_data, new_labels, test_size=0.25)
x_train, x_test, y_train, y_test = unstratified_split
print('Unstratified x_train, x_test, y_train and y_test are generated!')

Unstratified x_train, x_test, y_train and y_test are generated!


## Linear Discriminant Analysis Model 1: Use unstratified dataset to run 1 time

In [47]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
def lda(x_train, y_train, x_test, y_test):
    """Fit Linear Discriminant Analysis on the training split and evaluate it.

    Prints the test accuracy and the confusion matrix (rows are true classes,
    columns are predicted classes).

    Parameters
    ----------
    x_train, y_train : training samples and their class labels.
    x_test, y_test : held-out samples and their class labels.

    Returns
    -------
    tuple
        ``(score, TPR, TNR)`` where ``score`` is the accuracy on the test
        split and ``TPR`` / ``TNR`` are per-class arrays of sensitivity and
        specificity, in the class order of the confusion matrix.
    """
    # Named `model` so the estimator does not shadow this function's own name.
    model = LinearDiscriminantAnalysis()
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    score = model.score(x_test, y_test)
    print('FINISHED classifying. accuracy score :\n', score)
    cm = confusion_matrix(y_test, pred)
    print(cm)
    # One-vs-rest counts for every class, derived from the confusion matrix.
    TP = np.diag(cm)
    FP = cm.sum(axis=0) - TP   # predicted as the class but actually another
    FN = cm.sum(axis=1) - TP   # actually the class but predicted as another
    TN = cm.sum() - (FP + FN + TP)
    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP/(TP+FN)
    # Specificity or true negative rate
    TNR = TN/(TN+FP)
    # Only the returned rates are computed; the other derived rates were never
    # used and divided by zero whenever a class was never predicted.
    return score, TPR, TNR
    

def n_lda(epoch=1):
    """Run ``lda`` ``epoch`` times on the current global split and print averages.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    After the last run it prints the mean accuracy and the per-class mean
    sensitivity and specificity over all runs.
    """
    acc, sensitivity, specificity = [], [], []
    for run in range(epoch):
        print('The %s classification result is:'%(run + 1))
        score, TPR, TNR = lda(x_train, y_train, x_test, y_test)
        acc += [score]
        sensitivity += [TPR]
        specificity += [TNR]
        print('\n')
    mean_acc = sum(acc)/len(acc)
    mean_sensitivity = sum(sensitivity)/epoch
    mean_specificity = sum(specificity)/epoch
    print('The average accuracy is:', mean_acc,'\n')
    print('The sensitivity rate is:', mean_sensitivity, '\n')
    print('The specificity rate is:', mean_specificity, '\n')

In [48]:
# Single evaluation (epoch defaults to 1) on whichever x_train/x_test/y_train/y_test are currently defined.
n_lda()

The 1 classification result is:




FINISHED classifying. accuracy score :
 0.0645161290323
[[0 3 2 0]
 [1 0 2 0]
 [6 1 2 0]
 [9 3 2 0]]


The average accuracy is: 0.0645161290323 

The sensitivity rate is: [ 0.          0.          0.22222222  0.        ] 

The specificity rate is: [ 0.38461538  0.75        0.72727273  1.        ] 





## Linear Discriminant Analysis Model 2: Use stratified dataset to run 1 time

In [7]:
from sklearn.model_selection import train_test_split

# Random 75/25 train/test split that keeps the class proportions of new_labels in both parts.
stratified_split = train_test_split(
    nx_data, new_labels, test_size=0.25, stratify=new_labels
)
x_train, x_test, y_train, y_test = stratified_split
print('Stratified x_train, x_test, y_train and y_test are generated!')

x_train, x_test, y_train and y_test are generated!


In [31]:
# Single evaluation (epoch defaults to 1) using the x_train/x_test/y_train/y_test globals as they stand when this cell runs.
n_lda()

The 1 classification result is:




FINISHED classifying. accuracy score :
 0.193548387097
[[1 5 3 1]
 [1 2 2 0]
 [2 1 2 3]
 [2 2 3 1]]


The average accuracy is: 0.193548387097 

The sensitivity rate is: [ 0.1    0.4    0.25   0.125] 

The specificity rate is: [ 0.76190476  0.69230769  0.65217391  0.82608696] 



The accuracy improved by about 13 percentage points (from 6.5% to 19.4%)!

In [33]:
# Split the dataset again, then get a new classification result
resplit = train_test_split(
    nx_data, new_labels, test_size=0.25, stratify=new_labels
)
x_train, x_test, y_train, y_test = resplit
n_lda()

The 1 classification result is:




FINISHED classifying. accuracy score :
 0.322580645161
[[6 1 2 1]
 [2 0 1 2]
 [4 1 3 0]
 [3 1 3 1]]


The average accuracy is: 0.322580645161 

The sensitivity rate is: [ 0.6    0.     0.375  0.125] 

The specificity rate is: [ 0.57142857  0.88461538  0.73913043  0.86956522] 



The accuracy rose by another 13 percentage points (from 19.4% to 32.3%) simply because the data was split again, which indicates that the classification result depends heavily on how the dataset is split. Therefore, we run the LDA model again and **shuffle the data each time**.

## LDA Model 3: Shuffle the data each time (based on model 2)

In [38]:
def shuffle_lda(x_data, y_data, epoch=1):
    """Evaluate ``lda`` on ``epoch`` fresh stratified splits and print averages.

    Parameters
    ----------
    x_data : samples to split.
    y_data : labels for ``x_data``; also used to stratify each split.
    epoch : int, default 1
        Number of independent 75/25 splits to evaluate.

    Each iteration draws a new split of the data passed in, so the printed
    averages reflect variation across splits. Prints the mean accuracy and
    the per-class mean sensitivity and specificity after the last run.
    """
    acc = []
    sensitivity = []
    specificity = []
    for i in range(1,epoch+1):
        # Split the arguments, not the notebook globals, so the caller
        # controls which data and labels are evaluated.
        x_train, x_test, y_train, y_test=train_test_split(x_data, y_data, test_size=0.25, stratify=y_data)
        score, TPR, TNR = lda(x_train, y_train, x_test, y_test)
        acc.append(score)
        sensitivity.append(TPR)
        specificity.append(TNR)
        print('\n')
    print('The average accuracy is:', sum(acc)/len(acc),'\n')
    print('The sensitivity rate is:', sum(sensitivity)/epoch, '\n')
    print('The specificity rate is:', sum(specificity)/epoch, '\n')

In [39]:
# Ten evaluations, each on a newly drawn stratified 75/25 split; the averages are printed after the last run.
shuffle_lda(nx_data, new_labels, 10)



FINISHED classifying. accuracy score :
 0.225806451613
[[2 4 0 4]
 [1 0 0 4]
 [0 2 1 5]
 [1 3 0 4]]


FINISHED classifying. accuracy score :
 0.290322580645
[[3 0 4 3]
 [0 1 1 3]
 [0 1 1 6]
 [0 0 4 4]]


FINISHED classifying. accuracy score :
 0.193548387097
[[3 5 2 0]
 [3 2 0 0]
 [5 2 1 0]
 [4 2 2 0]]






FINISHED classifying. accuracy score :
 0.225806451613
[[3 4 0 3]
 [3 1 0 1]
 [4 2 0 2]
 [4 1 0 3]]


FINISHED classifying. accuracy score :
 0.193548387097
[[3 1 3 3]
 [2 0 1 2]
 [2 3 2 1]
 [6 1 0 1]]


FINISHED classifying. accuracy score :
 0.258064516129
[[3 4 1 2]
 [0 3 2 0]
 [1 5 2 0]
 [4 3 1 0]]


FINISHED classifying. accuracy score :
 0.387096774194
[[4 3 2 1]
 [4 0 0 1]
 [3 0 5 0]
 [3 0 2 3]]


FINISHED classifying. accuracy score :
 0.322580645161
[[4 3 2 1]
 [2 1 2 0]
 [0 3 5 0]
 [2 3 3 0]]


FINISHED classifying. accuracy score :
 0.193548387097
[[2 5 0 3]
 [3 1 0 1]
 [4 0 0 4]
 [4 1 0 3]]


FINISHED classifying. accuracy score :
 0.290322580645
[[5 3 1 1]
 [2 0 0 3]
 [5 0 0 3]
 [3 1 0 4]]


The average accuracy is: 0.258064516129 

The sensitivity rate is: [ 0.32    0.18    0.2125  0.275 ] 

The specificity rate is: [ 0.64285714  0.75        0.85652174  0.75217391] 



Therefore, the average accuracy rate is around 26%.

## LDA Model 4: Change y_data into one-hot coding (based on model 3)

In [22]:
# One-hot encode the integer class codes for Model 4.
# NOTE(review): the new_labels printed earlier contain the codes 0, 1, 2 and 4,
# so the encoded matrix presumably has a column for every code up to the
# maximum, including an unused one — confirm the width downstream code expects.
from keras.utils import np_utils
y_data = np_utils.to_categorical(new_labels)

# Since multilabel-indicator targets are not supported by confusion_matrix, we redefine the lda function without it.
def lda(x_train, y_train, x_test, y_test):
    """Fit a 1000-tree random forest on the training split and return its test score.

    Despite the name, this override builds a ``RandomForestClassifier``, not a
    Linear Discriminant Analysis model. It prints the score and returns it as
    a single value; no confusion matrix or per-class rates are produced.

    NOTE(review): ``n_lda`` and ``shuffle_lda`` unpack three values from
    ``lda()``, while this version returns only the score — confirm which
    driver this override is meant to be paired with.
    """
    # Imported locally: RandomForestClassifier is not imported in any visible
    # cell, so without this the function fails on a fresh kernel.
    from sklearn.ensemble import RandomForestClassifier

    forest = RandomForestClassifier(n_estimators=1000)
    # Train the model using the training sets
    forest.fit(x_train, y_train)
    # Score the fitted model on the testing sets
    score = forest.score(x_test, y_test)
    print('FINISHED classifying. accuracy score :\n', score)
    return score

Using TensorFlow backend.


In [23]:
# Ten shuffled-split runs, passing the one-hot encoded labels.
# NOTE(review): the recorded output below shows 'The N classification result is:'
# headers and only an average accuracy, which the shuffle_lda defined in this
# notebook does not print — the output presumably came from a different
# version of the function; confirm by re-running on a fresh kernel.
shuffle_lda(nx_data, y_data, 10)

The 1 classification result is:
FINISHED classifying. accuracy score :
 0.483870967742


The 2 classification result is:
FINISHED classifying. accuracy score :
 0.548387096774


The 3 classification result is:
FINISHED classifying. accuracy score :
 0.354838709677


The 4 classification result is:
FINISHED classifying. accuracy score :
 0.58064516129


The 5 classification result is:
FINISHED classifying. accuracy score :
 0.645161290323


The 6 classification result is:
FINISHED classifying. accuracy score :
 0.548387096774


The 7 classification result is:
FINISHED classifying. accuracy score :
 0.451612903226


The 8 classification result is:
FINISHED classifying. accuracy score :
 0.354838709677


The 9 classification result is:
FINISHED classifying. accuracy score :
 0.516129032258


The 10 classification result is:
FINISHED classifying. accuracy score :
 0.516129032258


The average accuracy is: 0.5 



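After applying one-hot coding, the average accuracy rate **greatly increased by about 24 percentage points** (from about 26% to 50%). Note that the `lda` function redefined for this model fits a `RandomForestClassifier`, so this figure is not a pure LDA result. <br/>
**Hence, model 4 is the final model for the LDA part.**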
