In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mglearn
import random
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import preprocessing, neighbors, naive_bayes, tree, ensemble, svm, neural_network, metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.model_selection import KFold, StratifiedKFold
from keras.callbacks import EarlyStopping
import tensorflow as tf
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from IPython.display import Audio
from tensorflow.keras.utils import plot_model

In [None]:
def select_kbest_clf(data_frame, target, k=5):
    """Score every feature column against ``target`` with the ANOVA F-test.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Table containing the feature columns and the ``target`` column.
    target : str
        Name of the class-label column; it is excluded from the features.
    k : int, default 5
        Number of top-scoring features marked ``True`` in ``Support``.

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns ``F Score``, ``P Value``,
        ``Support`` and ``Attribute`` (the feature name).
    """
    features = data_frame.drop(target, axis=1)
    selector = SelectKBest(f_classif, k=k).fit(features, data_frame[target])

    return pd.DataFrame({
        "F Score": selector.scores_,
        "P Value": selector.pvalues_,
        "Support": selector.get_support(),
        "Attribute": features.columns,
    })

In [None]:
def k_fold(X, model, k=10, random_state=None, y=None):
    """Evaluate ``model`` with stratified k-fold cross-validation.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is re-fitted on
        every fold.
    k : int, default 10
        Number of folds.
    random_state : int or None, default None
        Seed for the fold assignment. When given, the data is shuffled before
        splitting; when ``None`` the folds are taken in order (no shuffling).
    y : array-like or None, default None
        Class labels aligned with the rows of ``X``. When omitted, the
        notebook-level variable ``y`` is used, which keeps existing
        ``k_fold(X, model)`` calls working.

    Returns
    -------
    tuple
        ``(avg_acc_score, acc_score, avg_error_rate, error_rate)`` where the
        two lists hold one value per fold and the averages are their means.

    Raises
    ------
    NameError
        If ``y`` is not passed and no notebook-level ``y`` exists.
    """
    if y is None:
        # Backward compatibility: older calls relied on a global `y`.
        try:
            y = globals()["y"]
        except KeyError:
            raise NameError("name 'y' is not defined; pass y=... to k_fold") from None
    # Positional fold indices are applied below, so work on a plain array.
    y = np.asarray(y)

    # StratifiedKFold raises if random_state is set while shuffle is False,
    # so shuffling is enabled exactly when a seed was requested.
    kf = StratifiedKFold(
        n_splits=k,
        shuffle=random_state is not None,
        random_state=random_state,
    )
    acc_score = []
    error_rate = []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        pred_values = model.predict(X_test)

        error_rate.append(np.mean(pred_values != y_test))
        # accuracy_score expects (y_true, y_pred).
        acc_score.append(accuracy_score(y_test, pred_values))

    avg_acc_score = sum(acc_score) / len(acc_score)
    avg_error_rate = sum(error_rate) / len(error_rate)

    return avg_acc_score, acc_score, avg_error_rate, error_rate

### Undersampling

In [None]:
# Undersample: cut every speaker id down to the size of the smallest class.
df = pd.read_csv('untitled folder/edited_voices.csv')

# Number of rows of the least-represented id.
min_val = df['id'].value_counts().min()

# Keep the first `min_val` rows of each id. groupby().head() does this in one
# vectorised pass and preserves the column dtypes.
df = df.groupby('id', sort=False).head(min_val)

# Shuffle the rows; the fixed seed makes the row order (and therefore the
# unshuffled cross-validation folds built later) reproducible across runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

### Oversampling

In [None]:
# Oversample: repeat each speaker's rows until every id has as many rows as
# the largest class.
df = pd.read_csv('untitled folder/edited_voices.csv')

class_sizes = df['id'].value_counts()
max_val = class_sizes.max()
# Smallest class size; not needed for oversampling, kept so the name stays defined.
min_val = class_sizes.min()

balanced_parts = []
for speaker_id, speaker_rows in df.groupby('id'):
    # Cycle through all of this speaker's rows until exactly `max_val` rows
    # are selected. Positions wrap around with the modulo, so a speaker that
    # already has `max_val` rows is taken unchanged, and no slice such as
    # `data[:-0]` (which would yield an empty list) is ever needed.
    cyclic_positions = np.arange(max_val) % len(speaker_rows)
    balanced_parts.append(speaker_rows.iloc[cyclic_positions])

df = pd.concat(balanced_parts, ignore_index=True)

# Shuffle the rows; the fixed seed keeps the order reproducible across runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Set the `recording`, `label` and `id` columns aside in their own table;
# later cells drop them from `df` and read the ids back from `speakers`.
speakers = df[['recording', 'label', 'id']].copy()

speakers.head()

In [None]:
# `recording` and `label` are already preserved in `speakers`; remove them
# from the working table and summarise what is left.
df = df.drop(columns=['recording', 'label'])
df.describe()

In [None]:
# Rows per speaker id, to check that the resampling balanced the classes.
# (Passing `.nunique()` here would hand np.unique a single scalar and only
# ever report one value with a count of 1.)
np.unique(speakers['id'], return_counts=True)

### Feature selection

In [None]:
# Score every feature against the speaker id, then list the strongest first.
feature_scores = select_kbest_clf(df, 'id', k=5)
k = feature_scores.sort_values(by='F Score', ascending=False)
k

In [None]:
# Horizontal bar chart of the F score of each attribute, in ranked order.
fig, ax = plt.subplots(figsize=(12, 10))
k1 = sns.barplot(data=k, x='F Score', y='Attribute', ax=ax)
k1.set_title('Feature Importance')
plt.show()

In [None]:
# The speaker id is the prediction target, so it must not stay among the features.
df = df.drop(columns=['id'])

In [None]:
# Feature table (unscaled) and the speaker ids as a standalone label array.
x_data = df
y = speakers['id'].to_numpy(copy=True)

In [None]:
# Min-max scale each feature column to the range [0, 1].
# DataFrame.min()/max() reduce per column by default. np.min(DataFrame)
# forwards axis=None instead, whose meaning (per column vs. a single global
# scalar) depends on the installed pandas version.
col_min = x_data.min()
col_range = x_data.max() - col_min
# A constant column has zero range; divide by 1 there so it becomes 0
# instead of NaN.
X = (x_data - col_min) / col_range.replace(0, 1)
X.head()

In [None]:
# Summary statistics of the scaled feature matrix.
X.describe()

### Random Forest

In [None]:
# Try several forest sizes and record the mean cross-validated error of each.
error_rates = []
for n_trees in range(1, 101, 20):
    rf_model = ensemble.RandomForestClassifier(n_estimators=n_trees)
    cv_result = k_fold(X, rf_model)
    # k_fold returns (avg accuracy, per-fold accuracies, avg error, per-fold errors).
    avg_acc, avg_error_rate = cv_result[0], cv_result[2]
    error_rates.append(avg_error_rate)

In [None]:
# Forest sizes evaluated by the sweep, in the same order as `error_rates`
# (must mirror the `range(1, 101, 20)` used in the sweep loop).
n_estimators_grid = list(range(1, 101, 20))
# argmin gives the position of the lowest error; map it back to the tree
# count. Indexing `error_rates` here would yield the error value itself,
# which is not a valid n_estimators.
best_n_value = n_estimators_grid[int(np.argmin(error_rates))]
best_n_value

In [None]:
# Cross-validate a forest with the selected number of trees and report the
# mean accuracy and mean error rate.
rf_model = ensemble.RandomForestClassifier(n_estimators=best_n_value)
rf_cv = k_fold(X, rf_model)
avg_acc, avg_error_rate = rf_cv[0], rf_cv[2]
(avg_acc, avg_error_rate)

### SVM

In [None]:
# Cross-validate one SVC per kernel and keep the mean error rate of each,
# in the same order as `kernels`.
kernels = ['rbf', 'poly', 'linear']
error_rates = []
for kernel_name in kernels:
    svc = svm.SVC(kernel=kernel_name)
    # Index 2 of k_fold's result is the average error rate.
    avg_error_rate = k_fold(X, svc)[2]
    error_rates.append(avg_error_rate)

In [None]:
# Kernel whose sweep produced the lowest mean error rate.
best_kernel = kernels[int(np.argmin(error_rates))]
best_kernel

In [None]:
# Cross-validate an SVC with the selected kernel and report the mean
# accuracy and mean error rate.
svc_model = svm.SVC(kernel=best_kernel)
svc_cv = k_fold(X, svc_model)
avg_acc, avg_error_rate = svc_cv[0], svc_cv[2]
(avg_acc, avg_error_rate)

### ANN for undersampled dataset

In [None]:
# Stratified 10-fold cross-validation of a dense network on (X, y).
kf = StratifiedKFold(n_splits=10)
avg_acc = 0  # running sum of per-fold accuracies, in percent
ss = StandardScaler()
lb = LabelEncoder()

# Fit the label encoder once on all labels so every fold shares the same
# id -> class-index mapping. Refitting it on a test fold could assign
# different indices (or a different one-hot width) than the training fold.
y_encoded = lb.fit_transform(y)
n_classes = len(lb.classes_)
n_features = X.shape[1]

for train_index, test_index in kf.split(X, y_encoded):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]

    # Scaling statistics come from the training fold only.
    X_train = ss.fit_transform(X_train)
    X_test = ss.transform(X_test)

    # Fixed num_classes keeps the one-hot width equal to the output layer.
    y_train = to_categorical(y_encoded[train_index], num_classes=n_classes)
    y_test = to_categorical(y_encoded[test_index], num_classes=n_classes)

    # A fresh network is built for every fold.
    model = Sequential()

    model.add(Dense(20, input_shape=(n_features,), activation='relu'))

    model.add(Dense(256, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    # One softmax output per distinct speaker id.
    model.add(Dense(n_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
    # NOTE(review): patience (100) exceeds epochs (80), so this callback can
    # never stop training early; values kept as they were.
    early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=100, verbose=1, mode='auto')

    # NOTE(review): the held-out fold doubles as validation data.
    history = model.fit(X_train, y_train, batch_size=64, epochs=80,
                        validation_data=(X_test, y_test),
                        callbacks=[early_stop], verbose=0)

    # scores is [loss, accuracy]; accumulate accuracy as a percentage.
    scores = model.evaluate(X_test, y_test, verbose=0)
    avg_acc += scores[1] * 100

print('avg_acc:', (avg_acc / kf.get_n_splits()))

### ANN for oversampled dataset

In [None]:
# Stratified 10-fold cross-validation of a deeper dense network on (X, y).
kf = StratifiedKFold(n_splits=10)
avg_acc = 0  # running sum of per-fold accuracies, in percent
ss = StandardScaler()
lb = LabelEncoder()

# Fit the label encoder once on all labels so every fold shares the same
# id -> class-index mapping. Refitting it on a test fold could assign
# different indices (or a different one-hot width) than the training fold.
y_encoded = lb.fit_transform(y)
n_classes = len(lb.classes_)
n_features = X.shape[1]

for train_index, test_index in kf.split(X, y_encoded):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]

    # Scaling statistics come from the training fold only.
    X_train = ss.fit_transform(X_train)
    X_test = ss.transform(X_test)

    # Fixed num_classes keeps the one-hot width equal to the output layer.
    y_train = to_categorical(y_encoded[train_index], num_classes=n_classes)
    y_test = to_categorical(y_encoded[test_index], num_classes=n_classes)

    # A fresh network is built for every fold.
    model = Sequential()

    model.add(Dense(20, input_shape=(n_features,), activation='relu'))

    model.add(Dense(1024, activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(128, activation='relu'))
    # One softmax output per distinct speaker id.
    model.add(Dense(n_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
    # NOTE(review): patience (100) exceeds epochs (80), so this callback can
    # never stop training early; values kept as they were.
    early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=100, verbose=1, mode='auto')

    # NOTE(review): the held-out fold doubles as validation data.
    history = model.fit(X_train, y_train, batch_size=64, epochs=80,
                        validation_data=(X_test, y_test),
                        callbacks=[early_stop], verbose=0)

    # scores is [loss, accuracy]; accumulate accuracy as a percentage.
    scores = model.evaluate(X_test, y_test, verbose=0)
    avg_acc += scores[1] * 100

print('avg_acc:', (avg_acc / kf.get_n_splits()))

In [None]:
# Write a diagram of `model` (with layer shapes) to model_oversampling.png.
# `model` is whichever network was built last, i.e. the final CV fold's model.
plot_model(model, to_file='model_oversampling.png', show_shapes=True)