In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mglearn
import random
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import preprocessing, neighbors, naive_bayes, tree, ensemble, svm, neural_network, metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay, RocCurveDisplay, make_scorer
from sklearn.model_selection import KFold ,cross_val_score
from keras.callbacks import EarlyStopping
import tensorflow as tf
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from IPython.display import Audio
import visualkeras
from tensorflow.keras.utils import plot_model
import time

In [None]:
def k_fold(X, model, k=10, random_state=None, target=None):
    """Estimate accuracy and error rate of ``model`` with k-fold cross-validation.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refit on every
        fold, so after the call it holds the fit from the last fold.
    k : int, default 10
        Number of folds.
    random_state : int or None, default None
        When given, the folds are shuffled with this seed. When ``None`` the
        split is not shuffled.
    target : array-like or None, default None
        Labels aligned with the rows of ``X``. When ``None`` the notebook-level
        variable ``y`` is used.

    Returns
    -------
    tuple
        ``(avg_acc_score, acc_score, avg_error_rate, error_rate)``; the two
        lists hold one value per fold and the averages are their means.
    """
    # KFold rejects a random_state unless shuffling is enabled, so shuffle
    # exactly when the caller asked for a seeded split.
    kf = KFold(n_splits=k, shuffle=random_state is not None, random_state=random_state)

    # Fall back to the global `y` so existing two-argument calls keep working;
    # np.asarray guarantees positional indexing with the fold index arrays.
    labels = np.asarray(y if target is None else target)

    acc_score = []
    error_rate = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = labels[train_index], labels[test_index]

        model.fit(X_train, y_train)
        pred_values = model.predict(X_test)

        error_rate.append(np.mean(pred_values != y_test))
        # accuracy_score expects (y_true, y_pred).
        acc_score.append(accuracy_score(y_test, pred_values))

    avg_acc_score = sum(acc_score) / len(acc_score)
    avg_error_rate = sum(error_rate) / len(error_rate)

    return avg_acc_score, acc_score, avg_error_rate, error_rate

In [None]:
def plot_avg_error_rate(error_rates, image_name, labels=None,
                        title='Error Rate vs. K Value', xlabel='K'):
    """Plot one error rate per hyper-parameter value, save the figure and show it.

    Parameters
    ----------
    error_rates : sequence of float
        Error rates in the order the hyper-parameter values were tried.
    image_name : str
        File name (relative to ``figs/``) passed to ``plt.savefig``.
    labels : sequence or None, default None
        Tick labels, one per entry of ``error_rates``. Defaults to the
        positions ``0 .. len(error_rates) - 1``.
    title : str, default 'Error Rate vs. K Value'
        Figure title.
    xlabel : str, default 'K'
        X-axis label.
    """
    import os

    # Plot against integer positions and attach the labels as tick text, so
    # numeric and string labels are rendered the same way.
    positions = list(range(len(error_rates)))
    if labels is None:
        labels = positions

    plt.plot(positions, error_rates, color='blue', linestyle='solid',
             marker='o', markerfacecolor='red', markersize=8)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.xticks(positions, labels)
    plt.ylabel('Error Rate')

    # Apply the layout before saving so the file matches what is displayed.
    plt.tight_layout()

    target_path = 'figs/' + image_name
    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(target_path) or '.', exist_ok=True)
    plt.savefig(target_path)
    plt.show()

### Dataset

In [None]:
# Read the feature table and drop the `recording` and `id` columns in one step
# (presumably identifiers rather than features -- confirm against the CSV).
LibriSpeech = pd.read_csv('LibriSpeech.csv').drop(columns=['recording', 'id'])

# Summary statistics of the remaining columns.
LibriSpeech.describe()

### Visualization

In [None]:
# Correlate the numeric columns only: `label` holds strings ('male'/'female'),
# and recent pandas versions raise instead of silently skipping such columns.
correlation = LibriSpeech.select_dtypes(include='number').corr()

# Non-zero entries of the mask are hidden, so the upper triangle (including the
# diagonal) is blanked out and each pair of features is shown once.
matrix = np.triu(correlation)

fig, ax = plt.subplots(figsize=(30, 15))
sns.heatmap(data=correlation, annot=True, mask=matrix, ax=ax)
plt.show()

In [None]:
# Split the rows by class label.
male = LibriSpeech[LibriSpeech['label'] == 'male']
female = LibriSpeech[LibriSpeech['label'] == 'female']

# One panel per column position 0..19 on a 10 x 2 grid, overlaying the two
# classes as semi-transparent histograms.
fig, axes = plt.subplots(nrows=10, ncols=2, figsize=(15, 25))
ax = axes.ravel()
for i, panel in enumerate(ax):
    panel.hist(male.iloc[:, i], bins=20, color=mglearn.cm3(0), alpha=.5)
    panel.hist(female.iloc[:, i], bins=20, color=mglearn.cm3(2), alpha=.5)
    panel.set_title(male.columns[i])
    panel.set_yticks(())
    panel.set_xlabel("Feature magnitude")
    panel.set_ylabel("Frequency")
    panel.legend(["male", "female"], loc="best")

fig.tight_layout()

### Best features

In [None]:
def select_kbest_clf(data_frame, target, k=5):
    """Score every non-target column of ``data_frame`` with ``f_classif``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Table holding the feature columns and the ``target`` column.
    target : str
        Name of the label column; it is excluded from the scored features.
    k : int, default 5
        Number of features ``SelectKBest`` marks as selected.

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns ``F Score``, ``P Value``,
        ``Support`` (the selector's support mask) and ``Attribute`` (the
        feature name), in the original column order.
    """
    predictors = data_frame.drop(columns=target)

    selector = SelectKBest(f_classif, k=k)
    selector.fit(predictors, data_frame[target])

    return pd.DataFrame({
        "F Score": selector.scores_,
        "P Value": selector.pvalues_,
        "Support": selector.get_support(),
        "Attribute": predictors.columns,
    })

In [None]:
# Score every attribute against the label and list the highest F scores first.
LibriBest = (
    select_kbest_clf(LibriSpeech, target='label', k=5)
    .sort_values(by='F Score', ascending=False)
)
LibriBest

In [None]:
# Horizontal bars: one per attribute, bar length = F score, in LibriBest's row order.
plt.figure(figsize=(12, 10))
k1 = sns.barplot(data=LibriBest, x='F Score', y='Attribute')
k1.set_title('Feature Importance')

# Write the figure to disk before displaying it.
plt.savefig('figs/feature_importance')
plt.show()

In [None]:
# Names of the first ten rows of LibriBest, i.e. the ten highest F scores when
# LibriBest is sorted in descending order. The `Support` flag is not consulted here.
best_features = LibriBest['Attribute'].head(10).to_numpy()

### Data preparation

In [None]:
# Class names in alphabetical order.
target_names = ['female', 'male']

# Number of rows per class.
LibriSpeech['label'].value_counts()

In [None]:
# Feature matrix restricted to the selected attributes, and the label array.
x_data = LibriSpeech.loc[:, best_features]
y = LibriSpeech['label'].to_numpy()

In [None]:
# Min-max scale each feature column to [0, 1]. The reductions are spelled out
# per column (axis=0): np.min / np.max on a DataFrame collapse to one scalar
# over the whole frame on recent pandas versions, which would scale every
# column by the same global range.
col_min = x_data.min(axis=0)
col_max = x_data.max(axis=0)
X = (x_data - col_min) / (col_max - col_min)
X.head()

### Logistic regression

In [None]:
# Logistic regression, cross-validated with k_fold's default settings and timed.
log_reg = LogisticRegression(max_iter=1000)

start_time = time.time()
avg_accuracy, _, avg_error_rate, _ = k_fold(X, model=log_reg)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

(avg_accuracy, avg_error_rate)

### KNN Classifier

In [None]:
# Sweep the neighbour count from 1 to 19 and record the mean
# cross-validated error rate for each setting.
error_rates = []
for n_neighbors in range(1, 20):
    knn_model = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
    avg_acc, _, avg_error_rate, _ = k_fold(X, model=knn_model)
    error_rates.append(avg_error_rate)

In [None]:
# error_rates[j] belongs to n_neighbors = j + 1 because the sweep started at 1.
best_k_value = 1 + np.argmin(error_rates)
best_k_value

In [None]:
# Label each point with the neighbour count it was measured at.
plot_avg_error_rate(error_rates, 'knn_error_rate', [str(k) for k in range(1, 20)])

In [None]:
# Re-evaluate KNN at the best neighbour count found by the sweep, with timing.
knn_model = neighbors.KNeighborsClassifier(n_neighbors=best_k_value)

start_time = time.time()
avg_acc, _, avg_error_rate, _ = k_fold(X, model=knn_model)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

(avg_acc, avg_error_rate)

### Naive Bayes

In [None]:
# Gaussian naive Bayes, cross-validated and timed.
nb_model = naive_bayes.GaussianNB()

start_time = time.time()
avg_acc, _, avg_error_rate, _ = k_fold(X, model=nb_model)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

(avg_acc, avg_error_rate)

### Decision tree

In [None]:
# Sweep the leaf budget from 2 to 20 and record the mean cross-validated
# error rate for each tree size.
error_rates = []
for max_leaves in range(2, 21):
    dt_model = tree.DecisionTreeClassifier(max_leaf_nodes=max_leaves)
    _, _, avg_error_rate, _ = k_fold(X, model=dt_model)
    error_rates.append(avg_error_rate)

In [None]:
# Translate the position of the smallest error rate back into the
# max_leaf_nodes value that produced it (the sweep covered 2..20).
leaf_candidates = list(range(2, 21))
best_leaf_value = leaf_candidates[np.argmin(error_rates)]

In [None]:
# Label each point with the leaf budget it was measured at.
plot_avg_error_rate(error_rates, 'dt_error_rate', [str(n) for n in range(2, 21)])

In [None]:
# Re-evaluate the decision tree at the best leaf budget, with timing.
dt_model = tree.DecisionTreeClassifier(
    criterion='gini',
    max_leaf_nodes=best_leaf_value,
)

start_time = time.time()
avg_acc, _, avg_error_rate, _ = k_fold(X, model=dt_model)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

(avg_acc, avg_error_rate)

In [None]:
# Draw the fitted tree. dt_model was last fitted inside k_fold, so this is the
# tree trained on the final cross-validation fold.
plt.figure(figsize=(10, 10))
# best_features is a NumPy array; hand plot_tree a plain list, which every
# scikit-learn release accepts for feature_names.
tree.plot_tree(dt_model, feature_names=list(best_features))
plt.savefig('figs/dt_model')
plt.show()

### Random Forest

In [None]:
# Sweep the forest size over 1, 21, 41, 61, 81 trees and record the mean
# cross-validated error rate for each.
error_rates = []
for n_trees in range(1, 101, 20):
    rf_model = ensemble.RandomForestClassifier(n_estimators=n_trees)
    avg_acc, _, avg_error_rate, _ = k_fold(X, model=rf_model)
    error_rates.append(avg_error_rate)

In [None]:
# Map the position of the smallest error rate back to the n_estimators value
# that produced it. Indexing error_rates here would yield the error rate
# itself (a float), which is not a valid tree count.
n_estimator_candidates = range(1, 101, 20)
best_n_value = n_estimator_candidates[int(np.argmin(error_rates))]

In [None]:
# Tick labels: the tree counts tried in the sweep, as strings.
labels = [str(n) for n in range(1, 101, 20)]
labels

In [None]:
# Error rate against forest size, saved as figs/rf_error_rate.
plot_avg_error_rate(error_rates, image_name='rf_error_rate', labels=labels)

In [None]:
# Re-evaluate the random forest with n_estimators set to best_n_value, with timing.
rf_model = ensemble.RandomForestClassifier(n_estimators=best_n_value)

start_time = time.time()
avg_acc, _, avg_error_rate, _ = k_fold(X, model=rf_model)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

(avg_acc, avg_error_rate)

### Support Vector Machine

In [None]:
# Compare SVC kernels by mean cross-validated error rate; the kernel name is
# printed as each run starts.
kernels = ['rbf', 'poly', 'linear']
error_rates = []
for kernel_name in kernels:
    print(kernel_name)
    svc = svm.SVC(kernel=kernel_name)
    _, _, avg_error_rate, _ = k_fold(X, model=svc)
    error_rates.append(avg_error_rate)

In [None]:
# Kernel with the smallest error rate (error_rates is aligned with kernels).
best_kernel = kernels[np.argmin(error_rates)]

In [None]:
# Re-evaluate the SVM with the best kernel, with timing.
svc_model = svm.SVC(kernel=best_kernel)

start_time = time.time()
avg_acc, _, avg_error_rate, _ = k_fold(X, model=svc_model)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

(avg_acc, avg_error_rate)

### ANN

In [None]:
# Integer-encode the class labels and expand them to one-hot rows for the
# softmax output layer. The labels are read from the data frame rather than
# from `y`, so re-running this cell does not feed the one-hot matrix back
# into the encoder.
# NOTE(review): `y` is rebound to the one-hot matrix here, so k_fold calls that
# rely on the notebook-level `y` will see one-hot labels after this cell runs.
lb = LabelEncoder()
y = to_categorical(lb.fit_transform(LibriSpeech.label.values))

In [None]:
# Sanity check: the two shapes should agree on the number of rows.
(X.shape, y.shape)

In [None]:
# 10-fold cross-validation of a small fully connected network.
n_splits = 10
kf = KFold(n_splits=n_splits)
n_features = X.shape[1]
fold_accuracies = []  # test accuracy of each fold, in percent

for fold_no, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]

    # A fresh model per fold so no weights carry over between folds.
    model = Sequential()
    model.add(Dense(n_features, input_shape=(n_features,), activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

    # NOTE(review): patience (100) exceeds the epoch budget (40), so this
    # callback cannot stop training early as configured. It also monitors
    # val_loss on the held-out fold; if patience is ever lowered, validate on
    # a slice of the training data instead so the test fold does not decide
    # when training stops.
    early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=100, verbose=1, mode='auto')

    history = model.fit(X_train, y_train, batch_size=64, epochs=40,
                        validation_data=(X_test, y_test),
                        callbacks=[early_stop], verbose=0)

    # evaluate returns [loss, accuracy] given the metrics compiled above.
    scores = model.evaluate(X_test, y_test, verbose=0)
    fold_accuracies.append(scores[1] * 100)

# Average over the folds actually run instead of a separately hard-coded 10.
avg_acc = sum(fold_accuracies) / len(fold_accuracies)
print('avg_acc:', avg_acc)

In [None]:
# Render the architecture of `model` (the network built for the last fold)
# to model.png, including layer shapes.
plot_model(
    model,
    to_file='model.png',
    show_shapes=True,
)