In [None]:
# Mount Google Drive (Colab only) so the dataset under /content/drive/MyDrive is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import keras
from keras import regularizers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.utils.np_utils import to_categorical
from keras.callbacks import (EarlyStopping, LearningRateScheduler,
                             ModelCheckpoint, TensorBoard, ReduceLROnPlateau)
from keras import losses, models, optimizers
from keras.activations import relu, softmax
from keras.layers import (Convolution2D, GlobalAveragePooling2D, BatchNormalization, Flatten, Dropout,
                          GlobalMaxPool2D, MaxPool2D, concatenate, Activation, Input, Dense)

# sklearn
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Other  
from tqdm import tqdm, tqdm_pandas
import scipy
from scipy.stats import skew
import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import seaborn as sns
import glob 
import os
import sys
import IPython.display as ipd  # To play sound in the notebook
import warnings
# ignore warnings 
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [None]:
class AudioTransform:
    """Base class for waveform augmentations that fire with a given probability.

    Subclasses implement `apply`; calling the instance decides whether the
    augmentation is actually run for that call.
    """

    def __init__(self, always_apply=False, p=0.5):
        # `p` is only consulted when `always_apply` is falsy.
        self.p = p
        self.always_apply = always_apply

    def __call__(self, y: np.ndarray):
        """Return `apply(y)` when the transform fires, otherwise `y` untouched."""
        # Short-circuiting means no random number is drawn when always_apply is set.
        fires = self.always_apply or np.random.rand() < self.p
        return self.apply(y) if fires else y

    def apply(self, y: np.ndarray):
        """Perform the augmentation; must be overridden by subclasses."""
        raise NotImplementedError


class Compose:
    """Chain several transforms, feeding each one's output into the next."""

    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        """Run `y` through every transform in list order and return the result."""
        result = y
        for step in self.transforms:
            result = step(result)
        return result


class OneOf:
    """Apply exactly one of the given transforms, picked at random on each call."""

    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        """Draw one transform index with `np.random.choice` and apply that transform to `y`."""
        picked = np.random.choice(len(self.transforms))
        return self.transforms[picked](y)

In [None]:
class PitchShift(AudioTransform):
    """Randomly shift the pitch of a waveform by a whole number of steps.

    Args:
        always_apply: apply on every call, ignoring `p`.
        p: probability of applying the transform when `always_apply` is false.
        max_steps: largest shift magnitude; each call draws an integer number
            of steps from the closed range [-max_steps, max_steps].
        sr: sampling rate forwarded to `librosa.effects.pitch_shift`.
    """

    def __init__(self, always_apply=False, p=0.5, max_steps=5, sr=32000):
        super().__init__(always_apply, p)

        self.max_steps = max_steps
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        """Return `y` pitch-shifted by a randomly drawn number of steps."""
        # np.random.randint excludes its upper bound, so add 1 to make
        # +max_steps reachable and to keep max_steps=0 from raising on an
        # empty range.
        n_steps = np.random.randint(-self.max_steps, self.max_steps + 1)
        augmented = librosa.effects.pitch_shift(y, sr=self.sr, n_steps=n_steps)
        return augmented

class VolumeControl(AudioTransform):
    """Scale the amplitude of a waveform by a random gain expressed in decibels.

    Args:
        always_apply: apply on every call, ignoring `p`.
        p: probability of applying the transform when `always_apply` is false.
        db_limit: the gain is drawn uniformly from [-db_limit, db_limit] dB.
        mode: how the gain varies over the signal:
            "uniform" - one constant gain for every sample;
            "fade"    - gain weight falls linearly from 1 at the first sample to 0 at the last;
            "cosine"  - gain weight follows one cosine period over the signal;
            "sine"    - gain weight follows one sine period over the signal.
    """

    def __init__(self, always_apply=False, p=0.5, db_limit=10, mode="uniform"):
        super().__init__(always_apply, p)

        assert mode in ["uniform", "fade", "cosine", "sine"], \
            "`mode` must be one of 'uniform', 'fade', 'cosine', 'sine'"

        self.db_limit= db_limit
        self.mode = mode

    def apply(self, y: np.ndarray, **params):
        """Return `y` multiplied by the (possibly time-varying) linear gain."""
        db = np.random.uniform(-self.db_limit, self.db_limit)
        n_samples = len(y)
        if self.mode == "uniform":
            db_translated = 10 ** (db / 20)
        elif self.mode == "fade":
            # max(..., 1) avoids a 0/0 (NaN) weight for a one-sample signal.
            lin = np.arange(n_samples)[::-1] / max(n_samples - 1, 1)
            db_translated = 10 ** (db * lin / 20)
        elif self.mode == "cosine":
            cosine = np.cos(np.arange(n_samples) / n_samples * np.pi * 2)
            db_translated = 10 ** (db * cosine / 20)
        else:
            # Only "sine" remains after the constructor's mode check.
            sine = np.sin(np.arange(n_samples) / n_samples * np.pi * 2)
            db_translated = 10 ** (db * sine / 20)
        augmented = y * db_translated
        return augmented

class TimeShift(AudioTransform):
    """Circularly shift a waveform in time by a random number of samples.

    Args:
        always_apply: apply on every call, ignoring `p`.
        p: probability of applying the transform when `always_apply` is false.
        max_shift_second: largest shift magnitude in seconds.
        sr: sampling rate used to convert seconds into samples.
        padding_mode: "replace" keeps the samples wrapped around by the roll;
            "zero" overwrites the wrapped-around samples with zeros.
    """

    def __init__(self, always_apply=False, p=0.5, max_shift_second=2, sr=32000, padding_mode="replace"):
        super().__init__(always_apply, p)

        assert padding_mode in ["replace", "zero"], "`padding_mode` must be either 'replace' or 'zero'"
        self.max_shift_second = max_shift_second
        self.sr = sr
        self.padding_mode = padding_mode

    def apply(self, y: np.ndarray, **params):
        """Return a shifted copy of `y`; the input array is not modified."""
        # Work with an integer sample count (max_shift_second may be fractional)
        # and include the upper bound so a limit of 0 is a valid, empty shift.
        limit = int(self.sr * self.max_shift_second)
        shift = np.random.randint(-limit, limit + 1)
        augmented = np.roll(y, shift)
        if self.padding_mode == "zero":
            if shift > 0:
                augmented[:shift] = 0
            elif shift < 0:
                augmented[shift:] = 0
            # shift == 0: nothing wrapped around, so nothing is blanked.
            # (Slicing with [0:] here would silence the entire signal.)
        return augmented

class TimeStretch(AudioTransform):
    """Speed a waveform up or slow it down by a random rate without resampling the pitch.

    Args:
        always_apply: apply on every call, ignoring `p`.
        p: probability of applying the transform when `always_apply` is false.
        max_rate: one end of the stretch-rate range; must be positive.
        min_rate: the other end of the range; must be positive. Defaults to
            `1 / max_rate`, i.e. a range centred on "no stretch". A strictly
            positive bound is required because the stretch rate must be
            greater than zero and rates close to zero produce enormous outputs.

    Raises:
        ValueError: if `max_rate` or `min_rate` is not strictly positive.
    """

    def __init__(self, always_apply=False, p=0.5, max_rate=1.2, min_rate=None):
        super().__init__(always_apply, p)

        if max_rate <= 0:
            raise ValueError("`max_rate` must be strictly positive")
        if min_rate is None:
            min_rate = 1.0 / max_rate
        if min_rate <= 0:
            raise ValueError("`min_rate` must be strictly positive")

        self.max_rate = max_rate
        self.min_rate = min_rate

    def apply(self, y: np.ndarray, **params):
        """Return `y` time-stretched by a rate drawn between `min_rate` and `max_rate`."""
        low = min(self.min_rate, self.max_rate)
        high = max(self.min_rate, self.max_rate)
        rate = np.random.uniform(low, high)
        # Pass `rate` by keyword: it is keyword-only in newer librosa releases
        # and accepted by keyword in older ones.
        augmented = librosa.effects.time_stretch(y, rate=rate)
        return augmented

# NOTE(review): PitchShift is already defined at the top of this cell; this
# second definition rebinds the name and is the one in effect afterwards.
# One of the two definitions should be deleted.
class PitchShift(AudioTransform):
    """Randomly shift the pitch of a waveform by a whole number of steps.

    Args:
        always_apply: apply on every call, ignoring `p`.
        p: probability of applying the transform when `always_apply` is false.
        max_steps: largest shift magnitude; each call draws an integer number
            of steps from the closed range [-max_steps, max_steps].
        sr: sampling rate forwarded to `librosa.effects.pitch_shift`.
    """

    def __init__(self, always_apply=False, p=0.5, max_steps=5, sr=32000):
        super().__init__(always_apply, p)

        self.max_steps = max_steps
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        """Return `y` pitch-shifted by a randomly drawn number of steps."""
        # np.random.randint excludes its upper bound, so add 1 to make
        # +max_steps reachable and to keep max_steps=0 from raising on an
        # empty range.
        n_steps = np.random.randint(-self.max_steps, self.max_steps + 1)
        augmented = librosa.effects.pitch_shift(y, sr=self.sr, n_steps=n_steps)
        return augmented

In [None]:
def speedNpitch(data):
    """Speed up a waveform by linear-interpolation resampling (which also alters its pitch).

    The signal is read at a step of `1.2 / u` samples, with `u` drawn uniformly
    from [0.8, 1), so the resampled signal is shorter than the input. `data` is
    overwritten IN PLACE: the resampled samples go to the front and the rest of
    the buffer is zeroed. The same array object is returned.
    """
    # Tune `low` / `high` (and the 1.2 numerator) to change the speed-up range.
    stretch = np.random.uniform(low=0.8, high = 1)
    step = 1.2 / stretch
    n_samples = len(data)
    resampled = np.interp(np.arange(0, n_samples, step), np.arange(0, n_samples), data)
    keep = min(data.shape[0], resampled.shape[0])
    # Blank the caller's buffer, then write the resampled prefix into it.
    data *= 0
    data[0:keep] = resampled[0:keep]
    return data

def prepare_data(df, n, aug, mfcc):
    """Extract one fixed-size feature matrix per training clip listed in `df.id`.

    Each file is loaded from the Drive `all/` folder, cropped or zero-padded
    (at a random offset) to `sampling_rate * audio_duration` samples,
    optionally distorted with `speedNpitch`, and converted to either MFCCs or
    a log-mel spectrogram.

    Reads the notebook-level globals `sampling_rate` and `audio_duration`.

    Args:
        df: DataFrame whose `id` column holds file names inside the `all/` folder.
        n: number of MFCC coefficients (when `mfcc == 1`) or mel bands
            (otherwise); also the height of every output matrix.
        aug: 1 to pass each clip through `speedNpitch`, anything else to skip it.
        mfcc: 1 for MFCC features, anything else for log-mel spectrograms.

    Returns:
        Array of shape (len(df), n, 216, 1), one matrix per row of `df`.
    """
    # 216 is the hard-coded frame count; it matches 2.5 s at 44.1 kHz with
    # librosa's default hop length (assumed, not checked here).
    X = np.empty(shape=(df.shape[0], n, 216, 1))
    # Sample counts are used as slice bounds and randint limits: keep them integral.
    input_length = int(sampling_rate * audio_duration)

    for cnt, fname in enumerate(tqdm(df.id)):
        file_path = '/content/drive/MyDrive/voice/all/' + fname
        data, _ = librosa.load(file_path, sr=sampling_rate
                               ,res_type="kaiser_fast"
                               ,duration=2.5
                               ,offset=0.5
                              )
        # The raw clip is used as loaded; the only random distortion applied
        # is the optional speedNpitch step below.

        # Random offset / padding to exactly `input_length` samples.
        if len(data) > input_length:
            offset = np.random.randint(len(data) - input_length)
            data = data[offset:(input_length + offset)]
        else:
            missing = input_length - len(data)
            offset = np.random.randint(missing) if missing > 0 else 0
            data = np.pad(data, (offset, missing - offset), "constant")

        # Augmentation?
        if aug == 1:
            data = speedNpitch(data)

        # Which feature?
        if mfcc == 1:
            feature = librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=n)
        else:
            # melspectrogram returns a power spectrogram by default, so it is
            # converted with power_to_db; `sr` must match the loaded audio.
            melspec = librosa.feature.melspectrogram(y=data, sr=sampling_rate, n_mels=n)
            feature = librosa.power_to_db(melspec)

        # Add the trailing channel axis expected by the 2D CNN.
        X[cnt,] = np.expand_dims(feature, axis=-1)

    return X

 
    
def get_2d_conv_model(n, nclass=6):
    """Build and compile the 2D CNN classifier.

    Architecture: four identical stages of
    Conv2D(32, 4x10, same) -> BatchNorm -> ReLU -> MaxPool -> Dropout(0.2),
    then Flatten -> Dense(64) -> Dropout -> BatchNorm -> ReLU -> Dropout and
    a softmax layer over `nclass` classes.

    The model is compiled with Adam(0.001) and a *sparse* categorical
    cross-entropy loss, so targets must be integer class ids, not one-hot rows.

    Args:
        n: height of the input feature matrix (number of MFCC coefficients or
            mel bands); the input shape is (n, 216, 1).
        nclass: number of output classes.

    Returns:
        The compiled Keras model.
    """
    inp = Input(shape=(n,216,1))  # n feature bands by 216 frames, single channel

    x = inp
    for _ in range(4):
        x = Convolution2D(32, (4,10), padding="same")(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
        x = MaxPool2D()(x)
        x = Dropout(rate=0.2)(x)

    x = Flatten()(x)
    x = Dense(64)(x)
    x = Dropout(rate=0.2)(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = Dropout(rate=0.2)(x)

    out = Dense(nclass, activation=softmax)(x)
    model = models.Model(inputs=inp, outputs=out)

    opt = optimizers.Adam(0.001)
    model.compile(optimizer=opt, loss=keras.losses.SparseCategoricalCrossentropy(), metrics=['acc'])
    return model


class get_results:
    """Evaluation helpers for a fitted model: loss curves, accuracy and confusion matrix.

    Targets may be either integer class ids (1-D) or one-hot rows (2-D).
    Integer class id `i` is assumed to correspond to the i-th entry of the
    sorted `labels`.
    """

    def __init__(self, model_history, model ,X_test, y_test, labels):
        self.model_history = model_history
        self.model = model
        self.X_test = X_test
        self.y_test = y_test
        self.labels = labels

    def create_plot(self, model_history):
        '''Check the logloss of both train and validation, make sure they are close and have plateau'''
        plt.plot(model_history.history['loss'])
        plt.plot(model_history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

    def create_results(self, model):
        '''Evaluate `model` on the stored test split and print its accuracy.

        Note: the model is re-compiled (Adam(0.001), accuracy metric) before
        evaluation; the loss is chosen to match the target encoding.
        '''
        targets = self.y_test
        # One-hot targets are 2-D; integer class ids are 1-D.
        if np.ndim(targets) > 1:
            loss = 'categorical_crossentropy'
        else:
            loss = 'sparse_categorical_crossentropy'
        opt = optimizers.Adam(0.001)
        model.compile(loss=loss, optimizer=opt, metrics=['accuracy'])
        # Use the split stored on the instance rather than notebook globals.
        score = model.evaluate(self.X_test, targets, verbose=0)
        print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))

    def confusion_results(self, X_test, y_test, labels, model):
        '''Predict on `X_test` and plot the confusion matrix against `y_test`.

        `labels` is not modified; a sorted copy provides the axis names.
        '''
        preds = model.predict(X_test,
                              batch_size=16,
                              verbose=2)
        preds = preds.argmax(axis=1).astype(int).flatten()

        actual = np.asarray(y_test)
        if actual.ndim > 1:
            # One-hot rows -> class ids.
            actual = actual.argmax(axis=1)
        actual = actual.astype(int).flatten()

        classes = sorted(labels)

        # Fix the label set so the matrix is len(classes) x len(classes) even
        # when some class never occurs in this split.
        c = confusion_matrix(actual, preds, labels=np.arange(len(classes)))

        frame = pd.DataFrame(c, index=classes, columns=classes)
        plt.figure(figsize=(10, 7))
        ax = sns.heatmap(frame, annot=True, fmt="d")
        ax.set_ylabel('True label')
        ax.set_xlabel('Predicted label')
        plt.show()


In [None]:
# Dataset root on the mounted Drive. The trailing slash matters: file names are
# appended to `path` by plain string concatenation.
path = '/content/drive/MyDrive/voice/'
train = pd.read_csv(path + 'train.csv')
sample_submission = pd.read_csv(path + 'sample_submission.csv')

In [None]:
# Feature-extraction settings; prepare_data reads sampling_rate and
# audio_duration as globals.
sampling_rate=44100
audio_duration=2.5
n_mfcc = 30
# Bound to `mfcc` because the train/validation split further down reads the
# training features under that name.
mfcc = prepare_data(train, n = n_mfcc, aug = 0, mfcc = 1)

  0%|          | 24/25520 [00:11<3:36:44,  1.96it/s]

In [None]:
# Give every distinct accent an integer id, in order of first appearance in the column.
accent_map = {loc: i for i, loc in enumerate(train['accent'].unique())}

In [None]:
# Display the accent -> class id mapping.
accent_map

{'Africa': 0,
 'Australia': 1,
 'Canada': 2,
 'England': 3,
 'Hongkong': 4,
 'US': 5}

In [None]:
# Replace accent names by their integer ids. Values that are not keys of
# accent_map (e.g. ids left by a previous run of this cell) pass through
# unchanged, so re-running the cell no longer turns the column into NaN.
train['accent'] = train['accent'].map(lambda accent: accent_map.get(accent, accent))

In [None]:
# Hold out 25% of the clips for validation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(mfcc
                                                    , train.accent
                                                    , test_size=0.25
                                                    , shuffle=True
                                                    , random_state=42
                                                   )


# The targets are used as they come out of the split. get_2d_conv_model compiles
# with a sparse categorical cross-entropy loss, so no one-hot encoding is applied.

# Standardize every feature position with statistics from the training split
# only; the same mean/std are reused for the validation split.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

X_train = (X_train - mean)/std
X_test = (X_test - mean)/std

# Build and fit the CNN on the MFCC features.
model = get_2d_conv_model(n=n_mfcc)
model_history = model.fit(X_train, y_train, validation_data=(X_test, y_test), 
                    batch_size=16, verbose = 2, epochs=20)

In [None]:
def prepare_data_test(df, n, aug, mfcc):
    """Extract one fixed-size feature matrix per test clip listed in `df.path`.

    Same processing as `prepare_data`, except that `df.path` already holds the
    full file path of each clip: load, crop or zero-pad (at a random offset)
    to `sampling_rate * audio_duration` samples, optionally distort with
    `speedNpitch`, then compute MFCCs or a log-mel spectrogram.

    Reads the notebook-level globals `sampling_rate` and `audio_duration`.

    Args:
        df: DataFrame whose `path` column holds the audio file paths.
        n: number of MFCC coefficients (when `mfcc == 1`) or mel bands
            (otherwise); also the height of every output matrix.
        aug: 1 to pass each clip through `speedNpitch`, anything else to skip it.
        mfcc: 1 for MFCC features, anything else for log-mel spectrograms.

    Returns:
        Array of shape (len(df), n, 216, 1), in the row order of `df`.
    """
    # 216 is the hard-coded frame count; it matches 2.5 s at 44.1 kHz with
    # librosa's default hop length (assumed, not checked here).
    X_test = np.empty(shape=(df.shape[0], n, 216, 1))
    # Sample counts are used as slice bounds and randint limits: keep them integral.
    input_length = int(sampling_rate * audio_duration)

    for cnt, fname in enumerate(tqdm(df.path)):
        file_path = fname
        data, _ = librosa.load(file_path, sr=sampling_rate
                               ,res_type="kaiser_fast"
                               ,duration=2.5
                               ,offset=0.5
                              )
        # The raw clip is used as loaded; the only random distortion applied
        # is the optional speedNpitch step below.

        # Random offset / padding to exactly `input_length` samples.
        if len(data) > input_length:
            offset = np.random.randint(len(data) - input_length)
            data = data[offset:(input_length + offset)]
        else:
            missing = input_length - len(data)
            offset = np.random.randint(missing) if missing > 0 else 0
            data = np.pad(data, (offset, missing - offset), "constant")

        # Augmentation?
        if aug == 1:
            data = speedNpitch(data)

        # Which feature?
        if mfcc == 1:
            feature = librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=n)
        else:
            # melspectrogram returns a power spectrogram by default, so it is
            # converted with power_to_db; `sr` must match the loaded audio.
            melspec = librosa.feature.melspectrogram(y=data, sr=sampling_rate, n_mels=n)
            feature = librosa.power_to_db(melspec)

        # Add the trailing channel axis expected by the 2D CNN.
        X_test[cnt,] = np.expand_dims(feature, axis=-1)

    return X_test

In [None]:
"""
def get_id(data):
    return np.int(data.split("/")[-1].split(".")[-2])

test_ = pd.DataFrame(index = range(0, 6100), columns = ["path", "id"])
test_["path"] = glob("/content/drive/MyDrive/voice/test/*.wav")
test_["id"] = test_["path"].apply(lambda x : get_id(x))

test_.head()
"""

'\ndef get_id(data):\n    return np.int(data.split("/")[-1].split(".")[-2])\n\ntest_ = pd.DataFrame(index = range(0, 6100), columns = ["path", "id"])\ntest_["path"] = glob("/content/drive/MyDrive/voice/test/*.wav")\ntest_["id"] = test_["path"].apply(lambda x : get_id(x))\n\ntest_.head()\n'

In [None]:
#test_.to_csv(path+"test_.csv", index = False)

In [None]:
# Load the test-file index (columns used later: `path`, `id`).
test = pd.read_csv(path + 'test_.csv')

In [None]:
# Order the test clips by id; predictions are later written to sample_submission
# row by row. NOTE(review): presumably sample_submission is ordered by id too - confirm.
test = test.sort_values(by=['id'], axis=0)

In [None]:
# Same extraction settings as for training; prepare_data_test reads
# sampling_rate and audio_duration as globals.
sampling_rate=44100
audio_duration=2.5
n_mfcc = 30
# MFCC features of the test clips, without speedNpitch distortion (aug = 0).
mfcc_test = prepare_data_test(test, n = n_mfcc, aug = 0, mfcc = 1)

In [None]:
# The model was fitted on standardized features, so the test features must be
# standardized with the same training-set mean/std before predicting.
prediction = model.predict((mfcc_test - mean) / std)

In [None]:
# One row per test clip, one integer-named column per model output.
predict = pd.DataFrame(prediction)

In [None]:
# Re-display the accent -> class id mapping before filling the submission columns.
accent_map

In [None]:
# Column names expected by the submission file.
sample_submission.columns

In [None]:
# Copy prediction column k into the k-th submission column listed here
# (the order follows the class ids shown in accent_map).
for class_id, column in enumerate(['africa', 'australia', 'canada', 'england', 'hongkong', 'us']):
    sample_submission[column] = predict[class_id]

In [None]:
# Write next to the input data; `path` is the dataset root defined when the CSVs were loaded.
sample_submission.to_csv(path+'2d_cnn.csv', index=False)

In [None]:
# Evaluate the model currently bound to `model` on the held-out split:
# loss curves, accuracy, then the confusion matrix over the accent classes.
results = get_results(model_history,model,X_test,y_test, train.accent.unique())
results.create_plot(model_history)
results.create_results(model)
results.confusion_results(X_test, y_test, train.accent.unique(), model)

In [None]:
# MFCC features of the training clips with the speedNpitch distortion enabled (aug = 1).
# This is a multi-hour computation (see the progress bar output below).
sampling_rate=44100
audio_duration=2.5
n_mfcc = 30
mfcc_aug = prepare_data(train, n = n_mfcc, aug = 1, mfcc = 1)

100%|██████████| 25520/25520 [4:32:19<00:00,  1.56it/s]


In [None]:
# Same 75/25 split (same seed) as for the non-augmented features.
X_train, X_test, y_train, y_test = train_test_split(mfcc_aug
                                                    , train.accent
                                                    , test_size=0.25
                                                    , shuffle=True
                                                    , random_state=42
                                                   )

# The targets are used as they come out of the split (no one-hot encoding);
# the model is compiled with a sparse categorical cross-entropy loss.

# No standardization in this experiment: the model is fitted on the raw
# features, and the matching test features are passed to predict() raw as well.

# Build and fit the CNN on the augmented MFCC features.
model = get_2d_conv_model(n=n_mfcc)
model_history = model.fit(X_train, y_train, validation_data=(X_test, y_test), 
                    batch_size=64, verbose = 2, epochs=30)

Epoch 1/30
300/300 - 122s - loss: 1.4920 - acc: 0.4030 - val_loss: 1.4535 - val_acc: 0.4282
Epoch 2/30
300/300 - 87s - loss: 1.2910 - acc: 0.4882 - val_loss: 1.3463 - val_acc: 0.4900
Epoch 3/30
300/300 - 87s - loss: 1.2221 - acc: 0.5250 - val_loss: 1.2564 - val_acc: 0.5219
Epoch 4/30
300/300 - 89s - loss: 1.1766 - acc: 0.5542 - val_loss: 1.2609 - val_acc: 0.4680
Epoch 5/30
300/300 - 92s - loss: 1.1286 - acc: 0.5752 - val_loss: 1.2411 - val_acc: 0.5105
Epoch 6/30
300/300 - 86s - loss: 1.0911 - acc: 0.5923 - val_loss: 1.2052 - val_acc: 0.5161
Epoch 7/30
300/300 - 87s - loss: 1.0521 - acc: 0.6029 - val_loss: 1.1797 - val_acc: 0.5398
Epoch 8/30
300/300 - 88s - loss: 1.0181 - acc: 0.6166 - val_loss: 1.1004 - val_acc: 0.5839
Epoch 9/30
300/300 - 90s - loss: 1.0009 - acc: 0.6216 - val_loss: 1.1148 - val_acc: 0.5828
Epoch 10/30
300/300 - 96s - loss: 0.9676 - acc: 0.6379 - val_loss: 1.0987 - val_acc: 0.5751
Epoch 11/30
300/300 - 87s - loss: 0.9420 - acc: 0.6421 - val_loss: 1.0780 - val_acc: 0.5

In [None]:
# MFCC features of the test clips. aug = 1 means every test clip is also passed
# through the random speedNpitch distortion.
# NOTE(review): confirm that distorting the clips at inference time is intended.
sampling_rate=44100
audio_duration=2.5
n_mfcc = 30
mfcc_aug_test = prepare_data_test(test, n = n_mfcc, aug = 1, mfcc = 1)

100%|██████████| 6100/6100 [58:57<00:00,  1.72it/s]


In [None]:
# Raw (unstandardized) features, matching how this model was fitted.
prediction = model.predict(mfcc_aug_test)

In [None]:
# One row per test clip, one integer-named column per model output.
predict = pd.DataFrame(prediction)

In [None]:
# Copy prediction column k into the k-th submission column listed here
# (the order follows the class ids shown in accent_map).
for class_id, column in enumerate(['africa', 'australia', 'canada', 'england', 'hongkong', 'us']):
    sample_submission[column] = predict[class_id]

In [None]:
# Write the augmented-MFCC submission next to the input data.
sample_submission.to_csv(path+'2d_cnn_aug.csv', index=False)

In [None]:
# Log-mel spectrogram features of the training clips (mfcc = 0), 60 mel bands,
# without speedNpitch distortion.
sampling_rate=44100
audio_duration=2.5
n_melspec = 60
# NOTE: this rebinds `specgram`, shadowing the function imported from
# matplotlib.pyplot in the import cell.
specgram = prepare_data(train, n = n_melspec, aug = 0, mfcc = 0)

In [None]:
# Hold out 25% of the clips for validation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(specgram
                                                    , train.accent
                                                    , test_size=0.25
                                                    , shuffle=True
                                                    , random_state=42
                                                   )

# Integer-encode the targets. get_2d_conv_model compiles with a sparse
# categorical cross-entropy loss, which takes class ids rather than one-hot rows.
lb = LabelEncoder()
y_train = lb.fit_transform(y_train)
# Reuse the encoder fitted on the training labels instead of refitting it on
# the validation split, so both splits share one label -> id mapping.
y_test = lb.transform(y_test)

# Standardize every feature position with statistics from the training split only.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

X_train = (X_train - mean)/std
X_test = (X_test - mean)/std

# Build and fit the CNN on the log-mel features.
model = get_2d_conv_model(n=n_melspec)
model_history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                    batch_size=16, verbose = 2, epochs=20)

In [None]:
# Log-mel spectrogram features of the test clips, without speedNpitch distortion.
sampling_rate=44100
audio_duration=2.5
n_melspec = 60
# NOTE: `specgram` held the training features until now; from here on it holds
# the test features.
specgram = prepare_data_test(test, n = n_melspec, aug = 0, mfcc = 0)

In [None]:
# The model was fitted on standardized features, so the test features must be
# standardized with the same training-set mean/std before predicting.
prediction = model.predict((specgram - mean) / std)

In [None]:
# One row per test clip, one integer-named column per model output.
predict = pd.DataFrame(prediction)

In [None]:
# Copy prediction column k into the k-th submission column listed here
# (the order follows the class ids shown in accent_map).
for class_id, column in enumerate(['africa', 'australia', 'canada', 'england', 'hongkong', 'us']):
    sample_submission[column] = predict[class_id]

In [None]:
# Write next to the input data; `path` is the dataset root defined when the CSVs were loaded.
sample_submission.to_csv(path+'2d_cnn_spac.csv', index=False)

In [None]:
# Log-mel spectrogram features of the training clips with the speedNpitch
# distortion enabled (aug = 1).
sampling_rate=44100
audio_duration=2.5
n_melspec = 60
aug_specgram = prepare_data(train,  n = n_melspec, aug = 1, mfcc = 0)

In [None]:
# Hold out 25% of the clips for validation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(aug_specgram
                                                    , train.accent
                                                    , test_size=0.25
                                                    , shuffle=True
                                                    , random_state=42
                                                   )

# Integer-encode the targets. get_2d_conv_model compiles with a sparse
# categorical cross-entropy loss, which takes class ids rather than one-hot rows.
lb = LabelEncoder()
y_train = lb.fit_transform(y_train)
# Reuse the encoder fitted on the training labels instead of refitting it on
# the validation split, so both splits share one label -> id mapping.
y_test = lb.transform(y_test)

# Standardize every feature position with statistics from the training split only.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

X_train = (X_train - mean)/std
X_test = (X_test - mean)/std

# Build and fit the CNN on the augmented log-mel features.
model = get_2d_conv_model(n=n_melspec)
model_history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                    batch_size=16, verbose = 2, epochs=20)

In [None]:
# Log-mel spectrogram features of the test clips. aug = 1 means every test clip
# is also passed through the random speedNpitch distortion.
# NOTE(review): confirm that distorting the clips at inference time is intended.
sampling_rate=44100
audio_duration=2.5
n_melspec = 60
# NOTE: `aug_specgram` held the training features until now; from here on it
# holds the test features.
aug_specgram = prepare_data_test(test,  n = n_melspec, aug = 1, mfcc = 0)

In [None]:
# The model was fitted on standardized features, so the test features must be
# standardized with the same training-set mean/std before predicting.
prediction = model.predict((aug_specgram - mean) / std)

In [None]:
# One row per test clip, one integer-named column per model output.
predict = pd.DataFrame(prediction)

In [None]:
# Copy prediction column k into the k-th submission column listed here
# (the order follows the class ids shown in accent_map).
for class_id, column in enumerate(['africa', 'australia', 'canada', 'england', 'hongkong', 'us']):
    sample_submission[column] = predict[class_id]

In [None]:
# Write next to the input data; `path` is the dataset root defined when the CSVs were loaded.
sample_submission.to_csv(path+'2d_cnn_spac_aug.csv', index=False)