# Integracja z Google Drive

In [2]:
from google.colab import drive
# Mount Google Drive under /content/gdrive/; constants['results_dir'] defined
# further down points into this mount.
drive.mount('/content/gdrive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive/


# Projekt

## Konfiguracja

In [0]:
# Tunable training hyperparameters.
config = dict(
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

# Fixed project settings: number of target classes and filesystem locations.
constants = dict(
    number_of_classes=12,
    data_dir='/content/data',
    train_data_dir='/content/data/train',
    test_data_dir='/content/data/test',
    results_dir='/content/gdrive/My Drive/DL2020/Projekt3',
)


## Pobieranie danych treningowych z Kaggle

In [4]:
!mkdir .kaggle
!mkdir /root/.kaggle
import json
token = {"username":"jacekmyna","key":"7eff50cbfe2c482b7125064101dc821d"}
with open('/content/.kaggle/kaggle.json', 'w') as file:
    json.dump(token, file)
!chmod 600 /content/.kaggle/kaggle.json
!cp /content/.kaggle/kaggle.json /root/.kaggle/kaggle.json
!kaggle config set -n path -v{/content}
!kaggle competitions download -c tensorflow-speech-recognition-challenge -p /content
!7z x train.7z -o/content/data

- path is now set to: {/content}
Downloading link_to_gcp_credits_form.txt to /content
  0% 0.00/50.0 [00:00<?, ?B/s]
100% 50.0/50.0 [00:00<00:00, 40.4kB/s]
Downloading test.7z to /content
 99% 2.44G/2.46G [00:23<00:00, 179MB/s]
100% 2.46G/2.46G [00:23<00:00, 111MB/s]
Downloading train.7z to /content
 99% 1.03G/1.04G [00:23<00:00, 65.2MB/s]
100% 1.04G/1.04G [00:23<00:00, 48.1MB/s]
Downloading sample_submission.7z to /content
  0% 0.00/501k [00:00<?, ?B/s]
100% 501k/501k [00:00<00:00, 59.7MB/s]

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 1121103842 bytes (1070 MiB)

Extracting archive: train.7z
--
Path = train.7z
Type = 7z
Physical Size = 1121103842
Headers Size = 389133
Method = Delta LZMA2:24
Solid = +
Blocks = 2

  0%      0% 39 - train/audio/_backgr

## Przetwarzanie danych audio

In [0]:
import librosa
import numpy as np
from scipy import signal
from scipy.io import wavfile

# Sampling rate in Hz. It is also the fixed clip length in samples (one second):
# the feature functions below cut or pad every clip to this many samples.
SAMPLE_RATE = 16000

def read_wav_file_using_librosa(filename):
    """Load an audio file with librosa and return only the sample array.

    `sr=None` asks librosa to keep the file's native sampling rate instead of
    resampling. The rate that librosa reports is discarded.
    """
    samples, _native_rate = librosa.load(filename, sr=None)
    return samples


def read_wav_file(filename):
    """Read a WAV file and return its samples as float32 scaled to about [-1, 1].

    Integer PCM data is divided by the maximum of its own integer type, so
    16-bit files are handled exactly as before. Floating-point WAV files are
    already normalized and are returned without rescaling; dividing them by
    the int16 maximum again would shrink them by a factor of 32767.

    Args:
        filename: path of the WAV file.

    Returns:
        np.ndarray of dtype float32 with the samples (sampling rate is dropped).
    """
    _, samples = wavfile.read(filename)
    if np.issubdtype(samples.dtype, np.floating):
        return samples.astype(np.float32)
    if samples.dtype == np.uint8:
        # 8-bit WAV data is unsigned and centred on 128.
        return (samples.astype(np.float32) - 128.0) / 128.0
    return samples.astype(np.float32) / np.iinfo(samples.dtype).max


def to_log_spectrogram(wav, window_size=20, step_size=10, eps=1e-10):
    """Compute the log power spectrogram of a one-second clip.

    The clip is first cut or padded to exactly SAMPLE_RATE samples.

    Args:
        wav: sample array, or a path to a WAV file that is read with
            `read_wav_file`.
        window_size: analysis window length in milliseconds.
        step_size: hop between consecutive windows in milliseconds; must be
            positive and not larger than `window_size`.
        eps: constant added before the logarithm to avoid log(0).

    Returns:
        float32 array with time frames on axis 0 and frequency bins on axis 1
        (99 x 161 for the default arguments).

    Raises:
        ValueError: if the step is not in the range (0, window_size].
    """
    if isinstance(wav, str):
        wav = read_wav_file(wav)

    if len(wav) > SAMPLE_RATE:
        wav = _cut_sample_to_rate(wav)
    elif len(wav) < SAMPLE_RATE:
        wav = _pad_sample_to_rate(wav)

    nperseg = int(round(window_size * SAMPLE_RATE / 1e3))
    nstep = int(round(step_size * SAMPLE_RATE / 1e3))
    if not 0 < nstep <= nperseg:
        raise ValueError(
            f"step_size must be in (0, window_size], got step_size={step_size}, window_size={window_size}"
        )
    # scipy expects the overlap, not the hop. The two coincide only when the
    # hop is half the window (as with the defaults), so convert explicitly.
    noverlap = nperseg - nstep

    _, _, spectrogram = signal.spectrogram(
        wav,
        fs=SAMPLE_RATE,
        window='hann',
        nperseg=nperseg,
        noverlap=noverlap,
        detrend=False
    )
    return np.log(spectrogram.T.astype(np.float32) + eps)


def to_spectrogram(wav):
    """Return a two-channel STFT representation of a one-second clip.

    `wav` may be a sample array or a path to a WAV file. The clip is cut or
    padded to SAMPLE_RATE samples and transformed with a short-time Fourier
    transform. The result stacks, along a new last axis, the phase divided by
    pi and log(1 + magnitude).
    """
    if type(wav) is str:
        wav = read_wav_file(wav)

    n_samples = len(wav)
    if n_samples > SAMPLE_RATE:
        wav = _cut_sample_to_rate(wav)
    elif n_samples < SAMPLE_RATE:
        wav = _pad_sample_to_rate(wav)

    # signal.stft returns (frequencies, times, Zxx); only the complex matrix is used.
    _freqs, _times, stft_matrix = signal.stft(
        wav, 16000, nperseg=400, noverlap=240, nfft=512, padded=False, boundary=None
    )
    normalized_phase = np.angle(stft_matrix) / np.pi
    log_amplitude = np.log1p(np.abs(stft_matrix))

    return np.stack([normalized_phase, log_amplitude], axis=2)


def to_mel(wav):
    """Compute a 40-band mel spectrogram in decibels.

    Args:
        wav: sample array, or a path to a WAV file that is read with
            `read_wav_file`. The clip is not cut or padded here, so the number
            of frames follows the input length.

    Returns:
        float32 array produced by `librosa.power_to_db` on the mel spectrogram.
    """
    if isinstance(wav, str):
        wav = read_wav_file(wav)
    # The signal is passed as `y=`: newer librosa releases accept it by keyword
    # only, and the keyword form also works on older releases.
    spectrogram = librosa.feature.melspectrogram(
        y=wav, sr=SAMPLE_RATE, n_mels=40, hop_length=160, n_fft=480, fmin=20, fmax=4000
    )
    spectrogram = librosa.power_to_db(spectrogram)
    return spectrogram.astype(np.float32)


def to_mfcc(wav):
    """Compute 40 MFCC-style coefficients per frame.

    A 40-band mel spectrogram is log-compressed wherever it is positive and
    then multiplied by a 40x40 DCT filter bank.

    Args:
        wav: sample array, or a path to a WAV file that is read with
            `read_wav_file`. The clip is not cut or padded here.

    Returns:
        float32 array with one row per coefficient and one column per frame.
    """
    if isinstance(wav, str):
        wav = read_wav_file(wav)
    # `y=` keyword: newer librosa releases take the signal by keyword only.
    spectrogram = librosa.feature.melspectrogram(
        y=wav, sr=SAMPLE_RATE, n_mels=40, hop_length=160, n_fft=480, fmin=20, fmax=4000
    )
    # Boolean mask used directly. Wrapping it in a list (`[mask]`) relied on a
    # NumPy behaviour that has been removed and now raises IndexError.
    positive = spectrogram > 0
    spectrogram[positive] = np.log(spectrogram[positive])

    # NOTE(review): `librosa.filters.dct` may not exist in recent librosa
    # releases — confirm against the pinned version.
    dct_filters = librosa.filters.dct(n_filters=40, n_input=40)
    # One matrix product transforms every frame (column) at once; this matches
    # the earlier per-column loop followed by hstack.
    mfcc = np.matmul(dct_filters, spectrogram)
    return mfcc.astype(np.float32)


def raw(wav):
    """Return the waveform itself, normalized to exactly SAMPLE_RATE samples.

    `wav` may be a sample array or a path to a WAV file. Longer clips are
    randomly cropped, shorter ones are padded, and clips that already have the
    right length are returned unchanged.
    """
    samples = read_wav_file(wav) if type(wav) is str else wav

    n_samples = len(samples)
    if n_samples == SAMPLE_RATE:
        return samples
    if n_samples > SAMPLE_RATE:
        return _cut_sample_to_rate(samples)
    return _pad_sample_to_rate(samples)


def _cut_sample_to_rate(sample, sample_rate=SAMPLE_RATE):
    """Return a random contiguous window of `sample_rate` samples from `sample`.

    Args:
        sample: 1-D sample array with at least `sample_rate` elements.
        sample_rate: number of samples to keep.

    Raises:
        ValueError: if `sample` is shorter than `sample_rate`.
    """
    max_start = len(sample) - sample_rate
    if max_start < 0:
        raise ValueError(f"sample has {len(sample)} samples, need at least {sample_rate}")
    # randint's upper bound is exclusive. The +1 makes the last valid offset
    # reachable and lets an input of exactly `sample_rate` samples pass through.
    start = np.random.randint(0, max_start + 1)
    return sample[start:start + sample_rate]


def _pad_sample_to_rate(sample, sample_rate=SAMPLE_RATE):
    """Pad `sample` to `sample_rate` samples with background noise.

    The missing samples are taken from a random background-noise recording
    and split at a random point, so the original clip lands at a random
    position inside the result.

    Args:
        sample: 1-D sample array with at most `sample_rate` elements.
        sample_rate: target length in samples.

    Raises:
        ValueError: if `sample` is longer than `sample_rate`.
    """
    remaining_len = sample_rate - len(sample)
    if remaining_len < 0:
        raise ValueError(f"sample has {len(sample)} samples, more than {sample_rate}")
    if remaining_len == 0:
        # Nothing to pad; avoids reading a noise file for no reason.
        return sample

    random_silence_sample = _get_random_silence_sample(remaining_len)
    # Inclusive upper bound so the clip can also end up flush right.
    split_at = np.random.randint(0, remaining_len + 1)
    silence_part_left = random_silence_sample[0:split_at]
    silence_part_right = random_silence_sample[split_at:remaining_len]
    return np.concatenate([silence_part_left, sample, silence_part_right])


def _get_random_silence_sample(length, sample_rate=SAMPLE_RATE):
    """Return `length` consecutive samples from a random background-noise file.

    Reads the module-level `silence_df` (a DataFrame with a `wav_file` column,
    as returned last by `load_data`); it must be defined before this is called.

    Args:
        length: number of samples to return.
        sample_rate: unused; kept so existing calls keep working.

    Raises:
        ValueError: if the chosen recording is shorter than `length`.
    """
    noise_paths = silence_df.wav_file
    noise_path = noise_paths.iloc[np.random.randint(0, len(noise_paths))]
    noise = read_wav_file(noise_path)

    max_start = len(noise) - length
    if max_start < 0:
        raise ValueError(f"noise recording has {len(noise)} samples, need {length}")
    # Inclusive upper bound; return exactly `length` samples, not a full second.
    start = np.random.randint(0, max_start + 1)
    return noise[start:start + length]


def add_white_noise(wav):
    """Return `wav` with Gaussian noise (standard deviation 0.005) added."""
    noise = np.random.randn(len(wav))
    return wav + 0.005 * noise


def shift_left(wav):
    """Circularly shift `wav` towards the start by a random 1200-1999 samples.

    Samples pushed off the front wrap around to the end (np.roll).
    """
    offset = int(1200 + 800 * np.random.rand())
    return np.roll(wav, -offset)


def shift_right(wav):
    """Circularly shift `wav` towards the end by a random 1200-1999 samples.

    Samples pushed off the end wrap around to the front (np.roll).
    """
    offset = int(1200 + 800 * np.random.rand())
    return np.roll(wav, offset)


def _fit_to_sample_rate(wav):
    """Cut or pad `wav` so that it has exactly SAMPLE_RATE samples."""
    if len(wav) > SAMPLE_RATE:
        return _cut_sample_to_rate(wav)
    if len(wav) < SAMPLE_RATE:
        return _pad_sample_to_rate(wav)
    return wav


def speed_up(wav, by=0.05):
    """Play `wav` faster by the fraction `by` and return a one-second clip.

    librosa's stretch rate is a speed factor: a rate above 1 speeds the signal
    up and shortens it. The earlier code passed `1 - by`, which slowed the
    clip down instead. The shorter result is padded back to SAMPLE_RATE
    samples.
    """
    faster_wav = librosa.effects.time_stretch(wav, rate=1 + by)
    return _fit_to_sample_rate(faster_wav)


def slow_down(wav, by=0.05):
    """Play `wav` slower by the fraction `by` and return a one-second clip.

    A stretch rate below 1 slows the signal down and lengthens it; the longer
    result is randomly cropped back to SAMPLE_RATE samples.
    """
    slower_wav = librosa.effects.time_stretch(wav, rate=1 - by)
    return _fit_to_sample_rate(slower_wav)


## Ładowanie danych

In [0]:
import os
import re
from glob import glob

import numpy as np
import pandas as pd

# Target classes; a label's position in this list is used as its numeric label_id.
LABELS = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'unknown']
# Captures (optional directory prefix, label directory, user id) from a recording path.
FILE_PATTERN = re.compile(r"(.+/)?(\w+)/([^_]+)_.+wav")  # prefix, label, user_id
# Column layout shared by the train and validation DataFrames.
COLUMNS = ['label', 'label_id', 'user_id', 'wav_file']
# Sampling rate in Hz; also the length of a one-second clip in samples.
SAMPLE_RATE = 16000


def get_labels():
    """Return the module-level LABELS list (the same object, not a copy)."""
    return LABELS


def load_data(data_dir=constants['data_dir'], squeeze_train_set=True):
    """Index the dataset on disk and split it into train/validation/test frames.

    Every recording under `<data_dir>/train/audio/<label>/` is classified by
    its directory name. `_background_noise_` files are collected separately,
    `silence` files are split between train and validation by position, files
    listed in `train/validation_list.txt` go to validation and everything
    else goes to train.

    Args:
        data_dir: dataset root containing `train/` and `test/`.
        squeeze_train_set: if True, randomly sample at most 2000 training
            rows per label to balance the classes.

    Returns:
        Tuple `(train_df, val_df, test_df, silence_df)`. The first two use
        COLUMNS; `test_df` and `silence_df` have a single `wav_file` column
        (test recordings and background-noise recordings respectively).
    """
    files = glob(os.path.join(data_dir, 'train/audio/*/*wav'))
    # Validation paths are built the same way as the glob paths so the
    # comparison below also works when `data_dir` ends with a slash. A set
    # makes each membership test constant-time instead of a list scan per file.
    with open(os.path.join(data_dir, 'train/validation_list.txt'), 'r') as f:
        validation_files = {os.path.join(data_dir, 'train/audio', line.strip()) for line in f}

    train_samples = []
    val_samples = []
    silence_samples = []
    noise_samples = []

    for file in files:
        _, label, user_id = _split_file_name(file)
        if label == '_background_noise_':
            noise_samples.append(file)
            continue

        label = _parse_label(label)
        sample = (label, LABELS.index(label), user_id, file)
        if label == 'silence':
            silence_samples.append(sample)
        elif file in validation_files:
            val_samples.append(sample)
        else:
            train_samples.append(sample)

    train_samples_for_class = 2000  # max is 2095
    train_df = pd.DataFrame(train_samples, columns=COLUMNS)
    if squeeze_train_set:
        # min() keeps a label with fewer rows than the cap from raising.
        train_df = train_df.groupby('label_id', as_index=False).apply(
            lambda x: x.sample(n=min(len(x), train_samples_for_class))
        )
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    train_df = pd.concat(
        [train_df, pd.DataFrame(silence_samples[:train_samples_for_class], columns=COLUMNS)]
    )

    val_df = pd.DataFrame(val_samples, columns=COLUMNS)
    val_df = pd.concat(
        [val_df, pd.DataFrame(silence_samples[train_samples_for_class:], columns=COLUMNS)]
    )

    test_files = glob(os.path.join(data_dir, 'test/audio/*wav'))
    test_df = pd.DataFrame(test_files, columns=['wav_file'])

    silence_df = pd.DataFrame(noise_samples, columns=['wav_file'])

    return train_df, val_df, test_df, silence_df


def _split_file_name(file):
    """Split a recording path into (prefix, label, user_id) using FILE_PATTERN.

    Raises:
        Exception: if the path does not match FILE_PATTERN.
    """
    parsed = FILE_PATTERN.match(file)
    if parsed is None:
        raise Exception(f"Wrong file name: {file}")
    prefix, label, user_id = parsed.groups()
    return prefix, label, user_id


def _parse_label(label):
    """Map a directory name to a target label; anything outside LABELS becomes 'unknown'."""
    return label if label in LABELS else 'unknown'


def _create_random_silence_df(silence_df, length_of_df):
    """Build a DataFrame of `length_of_df` random one-second 'silence' clips.

    An equal share of clips is cut from every recording in `silence_df`, and
    the remainder is taken from the first recording. The `wav_file` column of
    the result holds the sample arrays themselves, not file paths.

    Args:
        silence_df: DataFrame with a `wav_file` column of recording paths.
        length_of_df: total number of rows to produce.

    Returns:
        DataFrame with COLUMNS; every row is labelled 'silence'.
    """
    silence_label_id = LABELS.index('silence')  # replaces the hard-coded 10
    per_file = length_of_df // len(silence_df)

    def _as_frame(samples):
        rows = [('silence', silence_label_id, 'charlie-chaplin', sample) for sample in samples]
        return pd.DataFrame(rows, columns=COLUMNS)

    # Collect the pieces and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and appending in a loop copied the frame on every iteration.
    frames = []
    n_rows = 0
    for silence_wav in silence_df.wav_file:
        frame = _as_frame(_split_wav_into_samples(silence_wav, number_of_samples=per_file))
        n_rows += len(frame)
        frames.append(frame)

    # In case length_of_df isn't divisible by len(silence_df)
    frames.append(
        _as_frame(
            _split_wav_into_samples(silence_df.wav_file.iloc[0], number_of_samples=length_of_df - n_rows)
        )
    )

    return pd.concat(frames)


def _split_wav_into_samples(wav, number_of_samples):
    """Cut `number_of_samples` random one-second windows out of a WAV file.

    Args:
        wav: path of the recording; read with `read_wav_file`.
        number_of_samples: how many windows to draw (they may overlap).

    Returns:
        List of arrays, each SAMPLE_RATE samples long.
    """
    silence_wav_file = read_wav_file(wav)
    # Last valid start offset. randint's upper bound is exclusive, hence the +1.
    max_start = len(silence_wav_file) - SAMPLE_RATE
    samples = []
    for _ in range(number_of_samples):
        # Separate name for the offset instead of overwriting the loop variable.
        start = np.random.randint(0, max_start + 1)
        samples.append(silence_wav_file[start:start + SAMPLE_RATE])
    return samples


## Przetwarzanie nagrań ciszy

In [0]:
import os
from glob import glob

import numpy as np
from scipy.io import wavfile

# Sampling rate in Hz; the silence clips written below are this many samples long.
SAMPLE_RATE = 16000


def create_silence_recordings(data_dir=constants['data_dir'], n_samples=2250):
    """Generate one-second 'silence' WAV files from the background-noise recordings.

    Random one-second windows are cut from every file in
    `<data_dir>/train/audio/_background_noise_/` and written to
    `<data_dir>/train/audio/silence/` as `charlie-chaplin_nohash_<i>.wav`.

    Args:
        data_dir: dataset root; clips are read from and written below it.
        n_samples: total number of clips to write.

    Raises:
        FileNotFoundError: if no background-noise recordings are found.
    """
    files = glob(os.path.join(data_dir, 'train/audio/_background_noise_/*wav'))
    if not files:
        raise FileNotFoundError(
            f"No background noise recordings found under {data_dir}/train/audio/_background_noise_"
        )

    samples = []
    for wav_file in files:
        samples += _split_wav_into_samples(wav_file, number_of_samples=n_samples // len(files))

    # In case n_samples isn't divisible by len(files)
    samples += _split_wav_into_samples(files[0], number_of_samples=n_samples - len(samples))

    # Write below the same root the clips were read from. The previous code
    # used constants['train_data_dir'], which ignores a non-default `data_dir`.
    save_dir = os.path.join(data_dir, 'train/audio/silence')
    os.makedirs(save_dir, exist_ok=True)

    int16_max = np.iinfo(np.int16).max
    for i, sample in enumerate(samples):
        # `read_wav_file` returns floats; convert back to 16-bit PCM. Written
        # as float WAVs, the clips would be divided by the int16 maximum a
        # second time when read back.
        pcm = np.clip(np.rint(sample * int16_max), -int16_max - 1, int16_max).astype(np.int16)
        wavfile.write(f"{save_dir}/charlie-chaplin_nohash_{i}.wav", SAMPLE_RATE, pcm)


def _split_wav_into_samples(wav, number_of_samples):
    """Cut `number_of_samples` random one-second windows out of a WAV file.

    This repeats the definition from the data-loading cell and replaces it
    once this cell has run.

    Args:
        wav: path of the recording; read with `read_wav_file`.
        number_of_samples: how many windows to draw (they may overlap).

    Returns:
        List of arrays, each SAMPLE_RATE samples long.
    """
    silence_wav_file = read_wav_file(wav)
    # Last valid start offset. randint's upper bound is exclusive, hence the +1.
    max_start = len(silence_wav_file) - SAMPLE_RATE
    samples = []
    for _ in range(number_of_samples):
        # Separate name for the offset instead of overwriting the loop variable.
        start = np.random.randint(0, max_start + 1)
        samples.append(silence_wav_file[start:start + SAMPLE_RATE])
    return samples


# Write the generated silence clips to disk; load_data() globs train/audio/*/*wav,
# so it picks them up from the `silence` directory.
create_silence_recordings()

In [0]:
# Keep all four frames. `_get_random_silence_sample` reads the module-level
# `silence_df`, so discarding it makes padding fail with a NameError.
train_df, val_df, test_df, silence_df = load_data()

## Wykresy

In [0]:
import matplotlib.pyplot as plt

# Training loss per epoch; epochs are numbered from 1.
# NOTE(review): `last_epoch` and `loss_array` are not defined in the visible
# cells — presumably produced by the training loop; confirm.
epochs = range(1, last_epoch + 1)

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.set_xticks(np.arange(last_epoch + 1))

ax.scatter(epochs, loss_array)
ax.set_title("Wykres funkcji straty dla zbioru treningowego")
ax.set_xlabel("Numer epoki")
ax.set_ylabel("Wartość funkcji straty")
plt.show()

In [0]:
import matplotlib.pyplot as plt

# Validation accuracy per epoch; epochs are numbered from 1.
# NOTE(review): `last_epoch` and `acc_array` are not defined in the visible
# cells — presumably produced by the training loop; confirm.
epochs = range(1, last_epoch + 1)

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.set_xticks(np.arange(last_epoch + 1))

ax.scatter(epochs, acc_array)
ax.set_title("Wykres dokładności dla zbioru walidacyjnego")
ax.set_xlabel("Numer epoki")
ax.set_ylabel("Dokładność")
plt.show()

In [0]:
import copy
# Class names shown on the plots, in confusion-matrix index order. They come
# from LABELS (the 12 speech-command classes used by this project); the former
# tuple held the ten CIFAR-10 class names from a different task.
classes = tuple(LABELS)


def _misclassification_counts(classify_table):
    """Return a copy of the confusion matrix with the diagonal set to zero.

    The diagonal holds the correct classifications, so what remains are the
    errors only. The input table is left untouched.
    """
    errors = np.array(classify_table, copy=True)
    np.fill_diagonal(errors, 0)
    return errors


def visualise_accuracy_by_class(classify_table):
    """Bar chart of the fraction of correctly classified samples per class.

    Args:
        classify_table: square confusion matrix (np.ndarray) with one row and
            one column per entry of `classes`. Row i is normalized by its own
            sum, i.e. rows are treated as the true class.
    """
    results = [classify_table[i, i] / np.sum(classify_table[i, :]) for i in range(len(classes))]

    fig = plt.figure()
    fig.add_axes([0,0,1,1])

    plt.bar(classes, results, color = ['#7e57c2', '#ffc400'])
    plt.title("Frakcja poprawnych klasyfikacji dla poszczególnych klas")
    plt.xlabel('Klasa')
    plt.ylabel('Frakcja poprawnych klasyfikacji')
    plt.xticks(classes)
    plt.show()


def visualise_errors_by_class(classify_table):
    """Stacked bar chart of misclassifications.

    The x axis is the true class (matrix row); each stacked segment is one
    class returned by the network (matrix column).

    Args:
        classify_table: square confusion matrix, rows = true class.
    """
    errors = _misclassification_counts(classify_table)

    # Running top of each stack; every predicted class is drawn on top of the previous ones.
    bottoms = np.zeros(errors.shape[0])
    for predicted_index in range(len(classes)):
        counts = errors[:, predicted_index]
        plt.bar(classes, counts, bottom=bottoms)
        bottoms = bottoms + counts

    plt.title("Błędy klasyfikacji")
    plt.xticks(classes)
    plt.xlabel("Poprawna klasa")
    plt.ylabel("Liczba błędnych klasyfikacji")
    plt.legend(classes, title = "Klasa zwracana przez sieć", bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()


def visualise_errors_for_class(classify_table, class_index):
    """Bar chart of what the network returned for samples of one true class.

    Args:
        classify_table: square confusion matrix, rows = true class (the same
            convention the other two plots use).
        class_index: index into `classes` of the true class to inspect.
    """
    errors = _misclassification_counts(classify_table)

    fig = plt.figure()
    fig.add_axes([0,0,1,1])

    # Row `class_index` = samples whose true class is `class_index`, broken
    # down by predicted class. This matches the x-axis label; the former code
    # plotted the column instead.
    plt.bar(classes, errors[class_index, :])
    plt.xticks(classes)
    plt.title("Liczba błędnych klasyfikacji dla klasy: {}".format(classes[class_index]))
    plt.xlabel("Klasa zwracana przez sieć")
    plt.ylabel("Liczba błędnych klasyfikacji")
    plt.show()

In [0]:
# NOTE(review): `classify_table` is not defined in any visible cell — presumably
# the confusion matrix from the evaluation step; confirm before running.
visualise_accuracy_by_class(classify_table)

In [0]:
# Stacked overview of all misclassifications, one bar per true class.
visualise_errors_by_class(classify_table)

In [0]:
# Detailed error breakdown for a single class; 3 is an index into `classes`.
visualise_errors_for_class(classify_table, 3)