In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import sklearn
import pickle
from time import time
import warnings

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelBinarizer, LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.mixture import BayesianGaussianMixture
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score

from scipy.stats import expon, randint

np.random.seed(17)
warnings.filterwarnings('ignore')

In [2]:
# Cordectomy 59
# Dysodie 56
# Dysphonia 101
# Hyperfunctional_dysphonia 211
# Hypofunctional_dysphonia 16
# Hypotonic_dysphonia 5
# Laryngitis 140
# Polyp 45
# Reinke_edema 68
# Spasmodic_dysphonia 64
# VC_carcinoma 22
# VFP 211
# Healthy sample size: 634
# Choose Dysphonia, Hyperfunctional_dysphonia, Laryngitis, VFP

# pathology_names = ['Cordectomy', 'Dysodie', 'Dysphonia', 'Hyperfunctional_dysphonia',
#                    'Hypofunctional_dysphonia', 'Hypotonic_dysphonia', 'Laryngitis', 'Polyp', 'Reinke_edema',
#                    'Spasmodic_dysphonia', 'VC_carcinoma', 'VFP']

pathology_names = ['Dysphonia', 'Hyperfunctional_dysphonia', 'Laryngitis', 'VFP']
audio_contents = ['i', 'a', 'u', 'phrase']
pitchs = ['h', 'l', 'n', 'lhl']

# Use classic machine learning methods

In [3]:
class run_multiple_classifiers():
    def __init__(self, features, labels):
        self.X = features
        self.le = LabelEncoder()
        self.y = self.le.fit_transform(labels)
        self.best_results = {}
        self.classifiers = []
        self.best_estimator = None
        self.time_cost = 0.0
        
    def _svm(self):
        params_dist = {'classifier__gamma': expon(scale=0.1),
                       'classifier__C': expon(scale=10)}
        model = SVC()
        name = 'SVM'
        self.classifiers.append((params_dist, model, name))
    
    def _rf(self):
        params_dist = {'classifier__max_features': randint(1,5),
                       'classifier__max_depth': [3, None],
                       'classifier__min_samples_split': randint(2, 11),
                       'classifier__min_samples_leaf': randint(1, 11),
                       'classifier__bootstrap': [True, False]}
        model = RandomForestClassifier()
        name = 'RandomForest'
        self.classifiers.append((params_dist, model, name))
        
    def _logisticReg(self):
        params_dist = {'classifier__C': expon(scale=1)}
        model = LogisticRegression()
        name = 'LogisticRegression'
        self.classifiers.append((params_dist, model, name))
        
    def _gaussianMix(self):
        params_dist = {'classifier__n_components': randint(1,10),
                      'classifier__weight_concentration_prior': expon(0.1),
                      'classifier__covariance_type' : ['full', 'tied', 'diag', 'spherical']}
        model = BayesianGaussianMixture()
        name = 'BayesianGaussianMixture'
        self.classifiers.append((params_dist, model, name))
        
    def _kNN(self):
        params_dist = {'classifier__n_neighbors': randint(2,15),
                      'classifier__weights': ['uniform', 'distance']}
        model = KNeighborsClassifier()
        name = 'kNN'
        self.classifiers.append((params_dist, model, name))
        
    def _MLP(self):
        params_dist = {'classifier__hidden_layer_sizes': [(5,5), (5,5,3), (4,4,4)],
                      'classifier__alpha': expon(0.0001)}
        model = MLPClassifier()
        name = 'MLP'
        self.classifiers.append((params_dist, model, name))
        
    def _DT(self):
        params_dist = {'classifier__criterion':['gini', 'entropy'],
                     'classifier__min_samples_split':randint(2,10)}
        model = DecisionTreeClassifier()
        name = 'DecisionTree'
        self.classifiers.append((params_dist, model, name))
    
    def run_classifier(self):
        start = time()
        self._svm()
        self._rf()
        self._logisticReg()
        self._gaussianMix()
        self._kNN()
#         self._MLP()
        self._DT()
        clfs = self.classifiers.copy()
        best_of_all = 0.0
        for pd, model, name in clfs:
            fs_pd = {'featureSelection__k': randint(self.X.shape[1]//10, self.X.shape[1]//2)}
            pipe = Pipeline([('scaler',StandardScaler()),('featureSelection',SelectKBest()),('classifier',model)])
            params_dist = {**fs_pd, **pd}
            grid = RandomizedSearchCV(pipe, param_distributions=params_dist, n_iter=20, cv=5, n_jobs=-1, random_state=17, scoring='accuracy')
            grid.fit(self.X, self.y)
            self.best_results[name] = float('{0:.3f}'.format(grid.best_score_))
            if grid.best_score_>best_of_all:
                best_of_all = grid.best_score_
                self.best_estimator = grid.best_estimator_
        for pd, model, name in self.classifiers:
            pipe = Pipeline([('scaler',StandardScaler()),('featureSelection',PCA(n_components='mle', svd_solver='full')),('classifier',model)])
            grid = RandomizedSearchCV(pipe, param_distributions=pd, n_iter=20, cv=5, n_jobs=-1, random_state=17, scoring='accuracy')
            grid.fit(self.X, self.y)
            if grid.best_score_>self.best_results[name]:
                self.best_results[name] = float('{0:.3f}'.format(grid.best_score_))
                if grid.best_score_>best_of_all:
                    best_of_all = grid.best_score_
                    self.best_estimator = grid.best_estimator_
        self.time_cost = time()-start
            
    def display_results(self):
        print(self.best_results)
        print('Time cost is: {0:.2f}s'.format(self.time_cost))
        
    def save_best_model(self, name):
        joblib.dump(self.best_estimator, name)

In [4]:
class audio_data:
    def __init__(self, name=''):
        self.name = name
        self.i = {}
        self.a = {}
        self.u = {}
        self.phrase = []

class AudioFeature:
    def __init__(self, audio_files, n_mfcc=20):
        self.name = audio_files.name
        self.phrase = audio_files.phrase
        self.sample_size = len(audio_files.phrase)
        self.i = audio_files.i
        self.a = audio_files.a
        self.u = audio_files.u
        self.n_mfcc = n_mfcc
        self.p_mfcc = []
        self.iau_mfcc = []
        self.labels = []
        self.extract_mfcc_features(self.n_mfcc)
        self.create_labels()
        
    def extract_mfcc_features(self, n_mfcc):
        for p_file in self.phrase:
            mfccs = self.extract_mfcc(p_file, n_mfcc)
            self.p_mfcc.append(mfccs)
        for i in range(self.sample_size):
            mfcc_vector = np.array([])
            for content in (self.i, self.a, self.u):
                for pitch in pitchs:
                    mfccs = self.extract_mfcc(content[pitch][i], n_mfcc)
                    mfcc_vector = np.hstack((mfcc_vector,mfccs))
            self.iau_mfcc.append(mfcc_vector)
        self.p_mfcc = np.array(self.p_mfcc)
        self.iau_mfcc = np.array(self.iau_mfcc)
        
    def create_labels(self):
        self.labels = np.array([self.name]*self.sample_size)
            
    def extract_mfcc(self, file, n_mfcc):
        data, sr = librosa.load(file)
        mfccs = librosa.feature.mfcc(data, sr, n_mfcc=n_mfcc)
        return np.mean(mfccs.T, axis=0)

## Use mfcc features: n_mfcc=4

In [11]:
with open('mfcc_{n_mfcc}/{name}_mfcc_{n_mfcc}.pkl'.format(name='healthy',n_mfcc=4),'rb') as f:
    h_mfcc4 = pickle.load(f)

h_iau_mfcc4 = h_mfcc4.iau_mfcc
h_label = h_mfcc4.labels

h_iau_mfcc4.shape, h_label.shape

In [12]:
n_mfcc = 4
mfccs = []
for p_name in pathology_names:
    with open('mfcc_{n_mfcc}/{name}_mfcc_{n_mfcc}.pkl'.format(name=p_name,n_mfcc=n_mfcc),'rb') as f:
        p_mfcc = pickle.load(f)
    p_iau_mfcc = p_mfcc.iau_mfcc
    mfccs.append(p_iau_mfcc)

In [13]:
p_iau_mfcc4 = np.concatenate(mfccs)
p_label = np.array(['pathology']*p_iau_mfcc4.shape[0])
p_iau_mfcc4.shape, p_label.shape

((663, 48), (663,))

In [14]:
mfcc4_all = np.concatenate((h_iau_mfcc4, p_iau_mfcc4))
label_all = np.concatenate((h_label, p_label))
mfcc4_all.shape, label_all.shape

((1297, 48), (1297,))

In [28]:
run1_mfcc4 = run_multiple_classifiers(mfcc4_all, label_all)

run1_mfcc4.run_classifier()
run1_mfcc4.display_results()

In [29]:
run1_mfcc4.best_estimator

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('featureSelection', PCA(copy=True, iterated_power='auto', n_components='mle', random_state=None,
  svd_solver='full', tol=0.0, whiten=False)), ('classifier', BayesianGaussianMixture(covariance_prior=None, covariance_type='...ration_prior=0.27311456500103126,
            weight_concentration_prior_type='dirichlet_process'))])

## Use mfcc features: n_mfcc=12

In [5]:
n_mfcc=12
with open('mfcc_{n_mfcc}/{name}_mfcc_{n_mfcc}.pkl'.format(name='healthy',n_mfcc=n_mfcc),'rb') as f:
    h_mfcc12 = pickle.load(f)

h_iau_mfcc12 = h_mfcc12.iau_mfcc
h_label = h_mfcc12.labels

h_iau_mfcc12.shape, h_label.shape

((634, 144), (634,))

In [6]:
n_mfcc = 12
mfccs = []
for p_name in pathology_names:
    with open('mfcc_{n_mfcc}/{name}_mfcc_{n_mfcc}.pkl'.format(name=p_name,n_mfcc=n_mfcc),'rb') as f:
        p_mfcc = pickle.load(f)
    p_iau_mfcc = p_mfcc.iau_mfcc
    mfccs.append(p_iau_mfcc)
p_iau_mfcc12 = np.concatenate(mfccs)
p_label = np.array(['pathology']*p_iau_mfcc12.shape[0])
p_iau_mfcc12.shape, p_label.shape

((663, 144), (663,))

In [7]:
mfcc12_all = np.concatenate((h_iau_mfcc12, p_iau_mfcc12))
label_all = np.concatenate((h_label, p_label))
mfcc12_all.shape, label_all.shape

((1297, 144), (1297,))

In [None]:
run1_mfcc12 = run_multiple_classifiers(mfcc12_all, label_all)

run1_mfcc12.run_classifier()
run1_mfcc12.display_results()

# Use deep learning approach

In [25]:
import keras
from keras.layers import Dense, Dropout
from keras import Sequential
from keras.callbacks import EarlyStopping
from keras import backend

In [58]:
backend.clear_session()
model = Sequential()
model.add(Dropout(0.1, input_shape=(48,)))
# model.add(Dense(30, activation='relu'))
# model.add(Dropout(0.1))
model.add(Dense(20, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [59]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_1 (Dropout)          (None, 48)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                980       
_________________________________________________________________
dropout_2 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                210       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 1,201
Trainable params: 1,201
Non-trainable params: 0
_________________________________________________________________


In [60]:
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])

In [23]:
scaler = StandardScaler()
mfcc4_all_scaled = scaler.fit_transform(mfcc4_all)

In [15]:
le = LabelEncoder()
labels = le.fit_transform(label_all)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(mfcc4_all_scaled, labels, test_size=0.2, random_state=17)

In [66]:
model.fit(x=X_train, y=y_train, validation_data=(X_test, y_test), epochs=50, callbacks=[EarlyStopping(patience=20)])

Train on 1037 samples, validate on 260 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50


<keras.callbacks.History at 0x2f5f39b0>