In [2]:
import numpy as np
import pandas as pd
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [3]:
df_features = pd.read_csv("features_df.csv")

In [4]:
df_features.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,150,151,152,153,154,155,156,157,158,159
0,0,0.861475,0.922338,0.832785,0.885525,0.828158,0.793287,0.824301,0.732386,0.69771,...,-10.530272,-9.13786,-8.667274,-1.80808,-6.800573,-6.219642,0.257486,1.497342,-3.951734,-2.955382
1,1,0.76387,0.852937,0.828722,0.852716,0.848668,0.843809,0.836646,0.762158,0.727656,...,-14.29419,-15.456995,-9.299699,-3.360145,-3.474428,-0.281338,-4.229411,-1.9665,4.157502,1.135244
2,2,0.939885,0.766322,0.597016,0.664217,0.857982,0.649231,0.633266,0.593182,0.655766,...,-10.39899,-10.941664,-12.466786,-1.78235,-0.597224,-5.18738,-0.188078,2.20937,-5.275289,-4.154563
3,3,0.787585,0.778726,0.75523,0.749415,0.757591,0.876939,0.932356,0.875719,0.791082,...,-9.592726,-3.938925,-5.946343,-3.800183,-0.003994,1.67636,0.155179,0.73204,-5.744483,-1.820551
4,4,0.713702,0.668932,0.653729,0.692693,0.79341,0.953053,0.908813,0.807453,0.712799,...,-10.074761,-4.548989,-6.987258,-1.930489,-0.286458,4.609173,0.296392,0.992723,-5.606578,-5.110719


In [6]:
df_target = pd.read_csv("target_df.csv")

In [8]:
df_target.tail()

Unnamed: 0.1,Unnamed: 0,0
4165,4165,normal
4166,4166,normal
4167,4167,normal
4168,4168,normal
4169,4169,normal


In [16]:
df_target["normality"] = df_target.iloc[:, 1].replace(('normal', 'abnormal'), (0, 1))

In [17]:
df_target.head()

Unnamed: 0.1,Unnamed: 0,0,normality
0,0,abnormal,1
1,1,abnormal,1
2,2,abnormal,1
3,3,abnormal,1
4,4,abnormal,1


In [33]:
df_target.drop(df_target.columns[0], axis=1 , inplace=True)

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [25]:
df_target.head()

Unnamed: 0,0,normality
0,abnormal,1
1,abnormal,1
2,abnormal,1
3,abnormal,1
4,abnormal,1


In [26]:
df_target.tail()

Unnamed: 0,0,normality
4165,normal,0
4166,normal,0
4167,normal,0
4168,normal,0
4169,normal,0


In [27]:
df_features.drop(df_features.columns[0], axis=1 , inplace=True)

In [28]:
df_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,150,151,152,153,154,155,156,157,158,159
0,0.861475,0.922338,0.832785,0.885525,0.828158,0.793287,0.824301,0.732386,0.69771,0.660165,...,-10.530272,-9.13786,-8.667274,-1.80808,-6.800573,-6.219642,0.257486,1.497342,-3.951734,-2.955382
1,0.76387,0.852937,0.828722,0.852716,0.848668,0.843809,0.836646,0.762158,0.727656,0.718589,...,-14.29419,-15.456995,-9.299699,-3.360145,-3.474428,-0.281338,-4.229411,-1.9665,4.157502,1.135244
2,0.939885,0.766322,0.597016,0.664217,0.857982,0.649231,0.633266,0.593182,0.655766,0.7639,...,-10.39899,-10.941664,-12.466786,-1.78235,-0.597224,-5.18738,-0.188078,2.20937,-5.275289,-4.154563
3,0.787585,0.778726,0.75523,0.749415,0.757591,0.876939,0.932356,0.875719,0.791082,0.738816,...,-9.592726,-3.938925,-5.946343,-3.800183,-0.003994,1.67636,0.155179,0.73204,-5.744483,-1.820551
4,0.713702,0.668932,0.653729,0.692693,0.79341,0.953053,0.908813,0.807453,0.712799,0.653515,...,-10.074761,-4.548989,-6.987258,-1.930489,-0.286458,4.609173,0.296392,0.992723,-5.606578,-5.110719


In [29]:
df_features.shape

(4170, 160)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(
    df_features, 
    df_target["normality"], 
    test_size=0.2, 
    random_state=69
)

In [35]:
classification_models = [
    KNeighborsClassifier(),#(3),
    SVC(kernel='linear'),#, C=0.025),
    SVC(kernel='rbf'),
    DecisionTreeClassifier(),#max_depth=5),
    RandomForestClassifier(),#max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

scores = []
for model in classification_models:
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    model_name = type(model).__name__
    if model_name=='SVC' and model.kernel=='rbf': model_name+=' RBF kernel'
    scores.append((model_name,(f'{100*score:.2f}%')))
# Make it pretty
scores_df = pd.DataFrame(scores,columns=['Classifier','Accuracy Score'])
scores_df.sort_values(by='Accuracy Score',axis=0,ascending=False)



Unnamed: 0,Classifier,Accuracy Score
5,AdaBoostClassifier,99.64%
4,RandomForestClassifier,99.04%
3,DecisionTreeClassifier,97.84%
7,QuadraticDiscriminantAnalysis,96.40%
0,KNeighborsClassifier,88.25%
1,SVC,86.69%
2,SVC RBF kernel,86.69%
6,GaussianNB,22.42%


In [36]:
from sklearn.svm import SVC

model = SVC(
    C=10,
    gamma='auto',
    kernel='rbf',
    random_state=69
)

model.fit(X_train, y_train)

print(f'SVC Model\'s accuracy on training set is {100*model.score(X_train, y_train):.2f}%')
print(f'SVC Model\'s accuracy on test set is {100*model.score(X_test, y_test):.2f}%')

SVC Model's accuracy on training set is 99.91%
SVC Model's accuracy on test set is 89.45%


In [37]:
from sklearn.neighbors import KNeighborsClassifier

####### Default kNN  ########
model = KNeighborsClassifier(
)

model.fit(X_train, y_train)

print(f'Default kNN Model\'s accuracy on training set is {100*model.score(X_train, y_train):.2f}%')
print(f'Default kNN Model\'s accuracy on test set is {100*model.score(X_test, y_test):.2f}%\n')

##### (hastily) tuned kNN ######
model = KNeighborsClassifier(
    n_neighbors = 5,
    weights = 'distance',
    algorithm = 'brute',
    leaf_size = '30',
    n_jobs=4
)

model.fit(X_train, y_train)

print(f'kNN Model\'s accuracy on training set is {100*model.score(X_train, y_train):.2f}%')
print(f'kNN Model\'s accuracy on test set is {100*model.score(X_test, y_test):.2f}%')

Default kNN Model's accuracy on training set is 92.12%
Default kNN Model's accuracy on test set is 88.25%

kNN Model's accuracy on training set is 100.00%
kNN Model's accuracy on test set is 88.25%


In [38]:
from sklearn.ensemble import RandomForestClassifier
####### Default Random Forest ########
model = RandomForestClassifier(
    random_state=69
)

model.fit(X_train, y_train)

print(f'Default Random Forest Model\'s accuracy on training set is {100*model.score(X_train, y_train):.2f}%')
print(f'Default Random Forest Model\'s accuracy on test set is {100*model.score(X_test, y_test):.2f}%\n')


########## Tuned Random Forest #######
model = RandomForestClassifier(
    n_estimators = 500, 
    criterion ='entropy',
    warm_start = True,
    max_features = 'sqrt',
    oob_score = 'True', # more on this below
    random_state=69  
) 

model.fit(X_train, y_train)

print(f'Random Forest Model\'s accuracy on training set is {100*model.score(X_train, y_train):.2f}%')
print(f'Random Forest Model\'s accuracy on test set is {100*model.score(X_test, y_test):.2f}%')

Default Random Forest Model's accuracy on training set is 100.00%
Default Random Forest Model's accuracy on test set is 99.40%

Random Forest Model's accuracy on training set is 100.00%
Random Forest Model's accuracy on test set is 99.52%
