## Data Munging

In [1]:
from sklearn.model_selection import train_test_split
from scipy.io import wavfile
import pandas as pd
import numpy as np

In [2]:
INPUT_LIB = '/Users/qiller/Downloads/MSBA 7011/Final pro/heartbeat-sounds/'
SAMPLE_RATE = 44100
CLASSES = ['artifact', 'normal', 'extrahls', 'murmur']
CODE_BOOK = {x:i for i,x in enumerate(CLASSES)}   
NB_CLASSES = len(CLASSES)

In [3]:
def clean_filename(fname, string):   
    file_name = fname.split('/')[1]
    if file_name[:2] == '__':        
        file_name = string + file_name
    return file_name

def load_wav_file(name, path):
    _, b = wavfile.read(path + name)
    assert _ == SAMPLE_RATE
    return b

def repeat_to_length(arr, length):
    """Repeats the numpy 1D array to given length, and makes datatype float"""
    result = np.empty((length, ), dtype = 'float32')
    l = len(arr)
    pos = 0
    while pos + l <= length:
        result[pos:pos+l] = arr
        pos += l
    if pos < length:
        result[pos:length] = arr[:length-pos]
    return result

def change_filename(fname, string='Aunlabelledtest'):
    file_name = fname.split('/')[1]
    if file_name[:2] == '__':        
        file_name = string + file_name
    return file_name

In [4]:
df = pd.read_csv(INPUT_LIB + 'set_a.csv')
df['fname'] = df['fname'].apply(clean_filename, string='Aunlabelledtest')
df['label'].fillna('unclassified')
# Load wav file in /clean_a, wav file that undergo lowpass filter
df['time_series'] = df['fname'].apply(load_wav_file, path=INPUT_LIB + 'Clean_data/')    
df['len_series'] = df['time_series'].apply(len)
MAX_LEN = max(df['len_series'])
df['time_series'] = df['time_series'].apply(repeat_to_length, length=MAX_LEN)

print(df['time_series'].values)

[array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32)
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32)
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32)
 array([  -6.,   -6.,   -4., ..., -183., -183., -185.], dtype=float32)
 array([ -30.,  -31.,  -30., ..., -126., -126., -124.], dtype=float32)
 array([  0.,   0.,   0., ...,  22.,  27.,  30.], dtype=float32)
 array([   2.,    1.,    1., ...,  192.,  185.,  177.], dtype=float32)
 array([ 79.,  73.,  65., ...,   6.,  13.,  20.], dtype=float32)
 array([ 13.,  12.,  11., ..., -16., -17., -18.], dtype=float32)
 array([ 1138.,  1107.,  1074., ...,  1152.,  1191.,  1208.], dtype=float32)
 array([  0.,   0.,   0., ..., -51., -58., -62.], dtype=float32)
 array([ 63.,  59.,  55., ...,   0.,   0.,   0.], dtype=float32)
 array([ 876.,  891.,  907., ...,    7.,    7.,    6.], dtype=float32)
 array([ 2274.,  1822.,  1335., ...,  -481.,  -817., -1158.], dtype=float32)
 array([ 2.,  2.,  2., ...,  3.,  5.,  6.], dtype=float32)
 

## Split into training and testing sets

In [5]:
x_data = np.stack(df['time_series'].values, axis=0)

In [6]:
from sklearn import preprocessing
# Create a label (category) encoder object
le = preprocessing.LabelEncoder()
# Fit the encoder to the pandas column


labels = df['label'].tolist()
le.fit(labels)
labels = le.transform(labels)
labels = labels.tolist()
#print(labels)

# 0 = 'artifact'
# 1 = 'extrahls'
# 2 = 'murmur'
# 4 = 'normal'
# 3 = 'unclassified'

nx_data = []
new_labels = []
labels = np.array(labels, dtype='int')
for i in range(len(labels)):
    if labels[i] != 3:
        new_labels.append(labels[i])
        nx_data.append(x_data[i])
print(new_labels)
print(len(new_labels))      

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
124


In [7]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test=train_test_split(nx_data, new_labels, test_size=0.25, stratify=new_labels)
print('x_train, x_test, y_train and y_test are generated!')

x_train, x_test, y_train and y_test are generated!


## Random Forest Model: Adjust hyperparameters by GridSearchCV

In [128]:
from sklearn.model_selection import GridSearchCV

In [153]:
from keras.utils import np_utils
y_data = np_utils.to_categorical(new_labels)
x_train, x_test, y_train, y_test=train_test_split(nx_data, y_data, test_size=0.25, stratify=y_data)

In [156]:
# Process GridSearchCV
param_grid = {'n_estimators': [500,1000,2000], 'max_features': [600,650]}
rf = RandomForestClassifier()
clf = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=1, cv=10, verbose=20, scoring='accuracy')
clf.fit(x_train, y_train)

Fitting 10 folds for each of 6 candidates, totalling 60 fits
[CV] max_features=600, n_estimators=500 ..............................
[CV]  max_features=600, n_estimators=500, score=0.500000, total=   9.6s
[CV] max_features=600, n_estimators=500 ..............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.8s remaining:    0.0s


[CV]  max_features=600, n_estimators=500, score=0.600000, total=   8.7s
[CV] max_features=600, n_estimators=500 ..............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   18.7s remaining:    0.0s


[CV]  max_features=600, n_estimators=500, score=0.500000, total=   7.5s
[CV] max_features=600, n_estimators=500 ..............................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   26.3s remaining:    0.0s


[CV]  max_features=600, n_estimators=500, score=0.333333, total=   7.2s
[CV] max_features=600, n_estimators=500 ..............................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   33.7s remaining:    0.0s


[CV]  max_features=600, n_estimators=500, score=0.777778, total=   7.0s
[CV] max_features=600, n_estimators=500 ..............................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   40.9s remaining:    0.0s


[CV]  max_features=600, n_estimators=500, score=0.444444, total=   6.6s
[CV] max_features=600, n_estimators=500 ..............................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   47.6s remaining:    0.0s


[CV]  max_features=600, n_estimators=500, score=0.777778, total=   7.0s
[CV] max_features=600, n_estimators=500 ..............................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   54.7s remaining:    0.0s


[CV]  max_features=600, n_estimators=500, score=0.222222, total=   6.7s
[CV] max_features=600, n_estimators=500 ..............................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.0min remaining:    0.0s


[CV]  max_features=600, n_estimators=500, score=0.555556, total=   6.6s
[CV] max_features=600, n_estimators=500 ..............................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.1min remaining:    0.0s


[CV]  max_features=600, n_estimators=500, score=0.222222, total=   6.4s
[CV] max_features=600, n_estimators=1000 .............................


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.2min remaining:    0.0s


[CV]  max_features=600, n_estimators=1000, score=0.500000, total=  13.0s
[CV] max_features=600, n_estimators=1000 .............................


[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:  1.5min remaining:    0.0s


[CV]  max_features=600, n_estimators=1000, score=0.600000, total=  12.9s
[CV] max_features=600, n_estimators=1000 .............................


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  1.7min remaining:    0.0s


[CV]  max_features=600, n_estimators=1000, score=0.500000, total=  12.9s
[CV] max_features=600, n_estimators=1000 .............................


[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:  1.9min remaining:    0.0s


[CV]  max_features=600, n_estimators=1000, score=0.333333, total=  14.2s
[CV] max_features=600, n_estimators=1000 .............................


[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:  2.1min remaining:    0.0s


[CV]  max_features=600, n_estimators=1000, score=0.777778, total=  13.8s
[CV] max_features=600, n_estimators=1000 .............................


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  2.4min remaining:    0.0s


[CV]  max_features=600, n_estimators=1000, score=0.444444, total=  13.6s
[CV] max_features=600, n_estimators=1000 .............................


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:  2.6min remaining:    0.0s


[CV]  max_features=600, n_estimators=1000, score=0.888889, total=  14.9s
[CV] max_features=600, n_estimators=1000 .............................


[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:  2.9min remaining:    0.0s


[CV]  max_features=600, n_estimators=1000, score=0.333333, total=  14.6s
[CV] max_features=600, n_estimators=1000 .............................


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  3.1min remaining:    0.0s


[CV]  max_features=600, n_estimators=1000, score=0.555556, total=  18.5s
[CV] max_features=600, n_estimators=1000 .............................


[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed:  3.4min remaining:    0.0s


[CV]  max_features=600, n_estimators=1000, score=0.222222, total=  13.9s
[CV] max_features=600, n_estimators=2000 .............................
[CV]  max_features=600, n_estimators=2000, score=0.500000, total=  26.9s
[CV] max_features=600, n_estimators=2000 .............................
[CV]  max_features=600, n_estimators=2000, score=0.500000, total=  24.8s
[CV] max_features=600, n_estimators=2000 .............................
[CV]  max_features=600, n_estimators=2000, score=0.500000, total=  24.3s
[CV] max_features=600, n_estimators=2000 .............................
[CV]  max_features=600, n_estimators=2000, score=0.333333, total=  24.9s
[CV] max_features=600, n_estimators=2000 .............................
[CV]  max_features=600, n_estimators=2000, score=0.777778, total=  25.6s
[CV] max_features=600, n_estimators=2000 .............................
[CV]  max_features=600, n_estimators=2000, score=0.444444, total=  24.3s
[CV] max_features=600, n_estimators=2000 ......................

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 15.9min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [500, 1000, 2000], 'max_features': [600, 650]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=20)

In [162]:
# Print the result
from sklearn.metrics import classification_report
print('The parameters of the best model are: ')
print(clf.best_params_)

y_pred = clf.predict(x_test)
print('\nThe classification report is: ')
print(classification_report(y_true=y_test, y_pred=y_pred))

The parameters of the best model are: 
{'max_features': 600, 'n_estimators': 1000}

The classification report is: 
             precision    recall  f1-score   support

          0       0.83      0.62      0.71         8
          1       0.00      0.00      0.00         3
          2       1.00      0.56      0.71         9
          3       0.00      0.00      0.00         0
          4       1.00      0.36      0.53        11

avg / total       0.86      0.45      0.58        31



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


**Therefore, the best parameters are 1000 estimators and 600 max features.**

## Random Forest Model 1: Use stratified dataset to run 10 times

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
def rf(x_train, y_train, x_test, y_test):
    # Create Random Forest object
    model= RandomForestClassifier(n_estimators=1000)
    # Train the model using the training sets and check score
    model.fit(x_train, y_train)
    #Predict Output
    pred= model.predict(x_test)
    # Apply model on testing sets
    score = model.score(x_test, y_test)
    print('FINISHED classifying. accuracy score :\n', score)
    # Create confusion matrix
    cm = confusion_matrix(y_test, pred)
    print(cm)  
    FP = cm.sum(axis=0) - np.diag(cm)  
    FN = cm.sum(axis=1) - np.diag(cm)
    TP = np.diag(cm)
    TN = cm.sum() - (FP + FN + TP)
    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP/(TP+FN)
    # Specificity or true negative rate
    TNR = TN/(TN+FP) 
    # Precision or positive predictive value
    PPV = TP/(TP+FP)
    # Negative predictive value
    NPV = TN/(TN+FN)
    # Fall out or false positive rate
    FPR = FP/(FP+TN)
    # False negative rate
    FNR = FN/(TP+FN)
    # False discovery rate
    FDR = FP/(TP+FP)
    return score, TPR, TNR
    

def n_rf(epoch=1):
    acc = []
    sensitivity = []
    specificity = []
    for i in range(1,epoch+1):
        print('The %s classification result is:'%i)
        score, TPR, TNR = rf(x_train, y_train, x_test, y_test)
        acc.append(score)
        sensitivity.append(TPR)
        specificity.append(TNR)
        print('\n')
    print('The average accuracy is:', sum(acc)/len(acc),'\n')
    print('The sensitivity rate is:', sum(sensitivity)/epoch, '\n')
    print('The specificity rate is:', sum(specificity)/epoch, '\n')

In [13]:
n_rf(epoch = 10)

The 1 classification result is:
FINISHED classifying. accuracy score :
 0.387096774194
[[6 0 4 0]
 [3 0 2 0]
 [0 0 6 2]
 [3 0 5 0]]


The 2 classification result is:




FINISHED classifying. accuracy score :
 0.387096774194
[[6 0 4 0]
 [3 0 2 0]
 [1 0 6 1]
 [3 0 5 0]]


The 3 classification result is:




FINISHED classifying. accuracy score :
 0.387096774194
[[6 0 4 0]
 [3 0 2 0]
 [1 0 6 1]
 [3 0 5 0]]


The 4 classification result is:




FINISHED classifying. accuracy score :
 0.387096774194
[[6 0 4 0]
 [3 0 2 0]
 [1 0 6 1]
 [3 0 5 0]]


The 5 classification result is:




FINISHED classifying. accuracy score :
 0.387096774194
[[6 0 4 0]
 [3 0 2 0]
 [2 0 6 0]
 [3 0 5 0]]


The 6 classification result is:




FINISHED classifying. accuracy score :
 0.387096774194
[[6 0 4 0]
 [2 0 2 1]
 [1 0 6 1]
 [3 0 5 0]]


The 7 classification result is:




FINISHED classifying. accuracy score :
 0.354838709677
[[5 0 4 1]
 [3 0 2 0]
 [1 0 6 1]
 [3 0 5 0]]


The 8 classification result is:




FINISHED classifying. accuracy score :
 0.387096774194
[[6 0 4 0]
 [1 0 2 2]
 [1 0 6 1]
 [3 0 5 0]]


The 9 classification result is:




FINISHED classifying. accuracy score :
 0.387096774194
[[6 0 4 0]
 [3 0 1 1]
 [1 0 6 1]
 [3 0 5 0]]


The 10 classification result is:




FINISHED classifying. accuracy score :
 0.387096774194
[[6 0 4 0]
 [0 0 3 2]
 [1 0 6 1]
 [3 0 5 0]]


The average accuracy is: 0.383870967742 

The sensitivity rate is: [ 0.59  0.    0.75  0.  ] 

The specificity rate is: [ 0.6952381   1.          0.52173913  0.92608696] 





In [14]:
# Split the dataset again than get a new classification result
x_train, x_test, y_train, y_test=train_test_split(nx_data, new_labels, test_size=0.25, stratify=new_labels)
n_rf(epoch = 10)

The 1 classification result is:
FINISHED classifying. accuracy score :
 0.451612903226
[[6 0 3 1]
 [5 0 0 0]
 [1 0 7 0]
 [5 0 2 1]]


The 2 classification result is:




FINISHED classifying. accuracy score :
 0.451612903226
[[6 0 2 2]
 [5 0 0 0]
 [1 0 7 0]
 [5 0 2 1]]


The 3 classification result is:




FINISHED classifying. accuracy score :
 0.483870967742
[[8 0 2 0]
 [5 0 0 0]
 [1 0 7 0]
 [6 0 2 0]]


The 4 classification result is:




FINISHED classifying. accuracy score :
 0.516129032258
[[7 0 3 0]
 [5 0 0 0]
 [1 0 7 0]
 [4 0 2 2]]


The 5 classification result is:




FINISHED classifying. accuracy score :
 0.451612903226
[[5 0 3 2]
 [5 0 0 0]
 [1 0 7 0]
 [4 0 2 2]]


The 6 classification result is:




FINISHED classifying. accuracy score :
 0.41935483871
[[6 0 3 1]
 [5 0 0 0]
 [1 0 6 1]
 [5 0 2 1]]


The 7 classification result is:




FINISHED classifying. accuracy score :
 0.516129032258
[[7 0 2 1]
 [5 0 0 0]
 [1 0 7 0]
 [4 0 2 2]]


The 8 classification result is:




FINISHED classifying. accuracy score :
 0.516129032258
[[7 0 3 0]
 [5 0 0 0]
 [1 0 7 0]
 [4 0 2 2]]


The 9 classification result is:




FINISHED classifying. accuracy score :
 0.451612903226
[[6 0 3 1]
 [5 0 0 0]
 [1 0 7 0]
 [5 0 2 1]]


The 10 classification result is:




FINISHED classifying. accuracy score :
 0.483870967742
[[7 0 3 0]
 [5 0 0 0]
 [1 0 7 0]
 [5 0 2 1]]


The average accuracy is: 0.474193548387 

The sensitivity rate is: [ 0.65    0.      0.8625  0.1625] 

The specificity rate is: [ 0.49047619  1.          0.79565217  0.96086957] 





The accuracy rate improved by 9%, which indicates that the classification result may depend on the dataset split situation. Since that, we plan to run the random forest model again and **shuffle the data each time**.

## Random Forest Model 2: Shuffle the data each time (based on model 1)

In [15]:
def shuffle_rf(x_data, y_data, epoch=1):
    acc = []
    sensitivity = []
    specificity = []
    for i in range(1,epoch+1):
        x_train, x_test, y_train, y_test=train_test_split(x_data, y_data, test_size=0.25, stratify=y_data)
        print('The %s classification result is:'%i)
        score, TPR, TNR = rf(x_train, y_train, x_test, y_test)
        acc.append(score)
        sensitivity.append(TPR)
        specificity.append(TNR)
        print('\n')
    print('The average accuracy is:', sum(acc)/len(acc),'\n')
    print('The sensitivity rate is:', sum(sensitivity)/epoch, '\n')
    print('The specificity rate is:', sum(specificity)/epoch, '\n')

In [16]:
shuffle_rf(nx_data, new_labels, 10)

The 1 classification result is:
FINISHED classifying. accuracy score :
 0.387096774194
[[7 0 3 0]
 [5 0 0 0]
 [2 1 5 0]
 [4 0 4 0]]


The 2 classification result is:




FINISHED classifying. accuracy score :
 0.41935483871
[[5 0 2 3]
 [2 0 2 1]
 [1 0 7 0]
 [5 0 2 1]]


The 3 classification result is:




FINISHED classifying. accuracy score :
 0.451612903226
[[8 0 1 1]
 [1 0 2 2]
 [2 0 6 0]
 [7 0 1 0]]


The 4 classification result is:




FINISHED classifying. accuracy score :
 0.451612903226
[[5 0 4 1]
 [5 0 0 0]
 [0 0 8 0]
 [4 0 3 1]]


The 5 classification result is:




FINISHED classifying. accuracy score :
 0.354838709677
[[4 0 5 1]
 [3 0 1 1]
 [2 0 5 1]
 [4 0 2 2]]


The 6 classification result is:




FINISHED classifying. accuracy score :
 0.387096774194
[[5 0 5 0]
 [2 0 1 2]
 [1 0 7 0]
 [2 0 6 0]]


The 7 classification result is:




FINISHED classifying. accuracy score :
 0.483870967742
[[7 0 3 0]
 [2 0 3 0]
 [0 0 8 0]
 [4 0 4 0]]


The 8 classification result is:




FINISHED classifying. accuracy score :
 0.451612903226
[[6 0 4 0]
 [4 0 0 1]
 [0 0 7 1]
 [6 0 1 1]]


The 9 classification result is:




FINISHED classifying. accuracy score :
 0.483870967742
[[7 0 3 0]
 [1 0 4 0]
 [0 0 8 0]
 [4 0 4 0]]


The 10 classification result is:




FINISHED classifying. accuracy score :
 0.354838709677
[[6 0 3 1]
 [3 0 1 1]
 [1 0 5 2]
 [7 0 1 0]]


The average accuracy is: 0.422580645161 

The sensitivity rate is: [ 0.6     0.      0.825   0.0625] 

The specificity rate is: [ 0.6         0.99615385  0.67391304  0.9173913 ] 





Therefore, the average accuracy rate is around 42%.

## Random Forest Model 3: Change y_data into one-hot coding (based on model 2)

In [19]:
from keras.utils import np_utils
y_data = np_utils.to_categorical(new_labels)

# Since multilabel-indicator is not supported for confusion matrix, we redefine the rf function.
def rf(x_train, y_train, x_test, y_test):
    # Create Random Forest object
    model= RandomForestClassifier(n_estimators=1000)
    # Train the model using the training sets and check score
    model.fit(x_train, y_train)
    #Predict Output
    pred= model.predict(x_test)
    # Apply model on testing sets
    score = model.score(x_test, y_test)
    print('FINISHED classifying. accuracy score :\n', score)
    # Create confusion matrix 
    #print(confusion_matrix(y_test, pred))
    return score

def shuffle_rf(x_data, y_data, epoch=1):
    acc = []
    for i in range(1,epoch+1):
        x_train, x_test, y_train, y_test=train_test_split(x_data, y_data, test_size=0.25, stratify=y_data)
        print('The %s classification result is:'%i)
        score = rf(x_train, y_train, x_test, y_test)
        acc.append(score)
        print('\n')
    print('The average accuracy is:', sum(acc)/len(acc),'\n')

In [20]:
shuffle_rf(nx_data, y_data, 10)

The 1 classification result is:
FINISHED classifying. accuracy score :
 0.483870967742


The 2 classification result is:
FINISHED classifying. accuracy score :
 0.451612903226


The 3 classification result is:
FINISHED classifying. accuracy score :
 0.483870967742


The 4 classification result is:
FINISHED classifying. accuracy score :
 0.548387096774


The 5 classification result is:
FINISHED classifying. accuracy score :
 0.41935483871


The 6 classification result is:
FINISHED classifying. accuracy score :
 0.451612903226


The 7 classification result is:
FINISHED classifying. accuracy score :
 0.58064516129


The 8 classification result is:
FINISHED classifying. accuracy score :
 0.483870967742


The 9 classification result is:
FINISHED classifying. accuracy score :
 0.387096774194


The 10 classification result is:
FINISHED classifying. accuracy score :
 0.58064516129


The average accuracy is: 0.487096774194 



After applying one-hot coding, the average accuracy rate **increased by 7%**. <br/>

Reasons from the author of xgboost **tqchen**: <br/>
Indeed there is not a unified way handling categorical features in trees. <br/>
If you want ordered variables, you can transform the variables into numerical levels(say age). Or if you prefer treat it as categorical variable, do one hot encoding. One-hot encoding could be helpful when the number of categories are small( in level of 10 to 100). In such case one-hot encoding can discover interesting interactions, while ordering them makes it harder to be discovered. <br/>



Referrence: <br/>
http://d0evi1.com/onehot/ <br/>
https://github.com/szilard/benchm-ml/issues/1