In [1]:
import sys

In [2]:
sys.path.append("../libs/basic_units/")

In [3]:
import numpy as np
import librosa
import python_speech_features
from basic_units import cm, inch
import matplotlib.pyplot as plt
from scipy.signal.windows import hann, hamming
import tensorflow as tf
import matplotlib.pyplot as plt

In [16]:
k = 2
max_iterations = 100

In [4]:
n_mfcc = 22
n_mels = 40
n_fft = 16384 
hop_length = 2205
fmin = 0
fmax = None
rate = 44000

In [5]:
def read_file(file):
    file_contents = tf.io.read_file(file)
    return file, file_contents

In [6]:
filenames = tf.io.match_filenames_once('../data/audio_dataset/*.wav')
filename_ds = tf.data.Dataset.from_tensor_slices(filenames)

In [7]:
filename_contents_ds = filename_ds.map(read_file)

In [8]:
def get_next_chromagram(audio_file):
    print('filename %s ' % (audio_file))
    y, sr = librosa.load(audio_file, sr=rate)
    winlen=n_fft / sr
    winstep=hop_length/sr    
    mfcc_speech = python_speech_features.mfcc(signal=y, samplerate=sr, winlen=winlen, winstep=winstep,
                                          numcep=n_mfcc, nfilt=n_mels, nfft=n_fft, lowfreq=fmin, highfreq=fmax,
                                          preemph=0.0, ceplifter=0, appendEnergy=False, winfunc=hamming)   
    return mfcc_speech

In [9]:
def extract_feature_vector(chroma_data):
    num_samples, num_features = np.shape(chroma_data)
    print("Num features %d num samples %d " % (num_features, num_samples))
    freq_vals = tf.argmax(chroma_data)
    hist, bins = np.histogram(freq_vals, bins=range(num_features + 1))
    return hist.astype(float) / num_samples

In [10]:
def get_dataset():
    filename_contents_ds_enum = filename_contents_ds.enumerate()
    xs = []
    for file_obj in filename_contents_ds_enum.as_numpy_iterator():
        chroma_data = get_next_chromagram(file_obj[1][0])
        x = [extract_feature_vector(chroma_data)]
        x = np.matrix(x)
        if len(xs) == 0:
            xs = x
        else:
            xs = np.vstack((xs, x))
    
    return xs

In [11]:
def initial_cluster_centroids(X, k):
    return X[0:k, :]

In [12]:
def assign_cluster(X, centroids):
    expanded_vectors = tf.expand_dims(X, 0) # 1, 5, 12
    expanded_centroids = tf.expand_dims(centroids, 1) #2, 1, 12
    distances = tf.reduce_sum(tf.square(tf.subtract(expanded_vectors, expanded_centroids)), 2) #2, 5
    mins = tf.argmin(distances, 0)
    return mins

In [18]:
def recompute_centroids(X, Y):
    sums = tf.math.unsorted_segment_sum(X, Y, k)
    counts = tf.math.unsorted_segment_sum(tf.ones_like(X), Y, k)
    return sums / counts

In [20]:
X = get_dataset()
print(X)
print(X.shape)
centroids = initial_cluster_centroids(X, k)
i, converged = 0, False
while not converged and i < max_iterations:
    i += 1
    Y = assign_cluster(X, centroids)
    centroids = recompute_centroids(X, Y)
print(centroids)

filename b'../data/audio_dataset/cough_1.wav' 
Num features 22 num samples 16 
filename b'../data/audio_dataset/cough_2.wav' 
Num features 22 num samples 25 
filename b'../data/audio_dataset/scream_1.wav' 
Num features 22 num samples 19 
filename b'../data/audio_dataset/scream_2.wav' 
Num features 22 num samples 43 
filename b'../data/audio_dataset/scream_3.wav' 
Num features 22 num samples 61 
tf.Tensor(
[[0.         0.0625     0.125      0.1875     0.         0.
  0.         0.125      0.0625     0.0625     0.125      0.1875
  0.1875     0.         0.0625     0.1875     0.         0.
  0.         0.         0.         0.        ]
 [0.01572627 0.01       0.03947368 0.03451251 0.01       0.00819672
  0.01       0.00409836 0.         0.01315789 0.         0.03041415
  0.0379437  0.03631579 0.03631579 0.03631579 0.04528764 0.02409836
  0.07357204 0.01581395 0.02409836 0.02409836]], shape=(2, 22), dtype=float64)
