In [1]:
import sys

In [2]:
sys.path.append("../libs/basic_units/")

In [3]:
import numpy as np
import librosa
import python_speech_features
from basic_units import cm, inch
import matplotlib.pyplot as plt
from scipy.signal.windows import hann, hamming
import tensorflow as tf
import matplotlib.pyplot as plt

In [4]:
k = 2
max_iterations = 100
segment_size = 50

In [5]:
n_mfcc = 22
n_mels = 40
n_fft = 16384 
hop_length = 2205
fmin = 0
fmax = None
rate = 44000

In [6]:
def read_file(file):
    file_contents = tf.io.read_file(file)
    return file, file_contents

In [7]:
filenames = tf.io.match_filenames_once('../data/audio_dataset/*.wav')
filename_ds = tf.data.Dataset.from_tensor_slices(filenames)

In [8]:
filename_contents_ds = filename_ds.map(read_file)

In [9]:
def get_chromagram(audio_file):
    print('filename %s ' % (audio_file))
    y, sr = librosa.load(audio_file, sr=rate)
    winlen=n_fft / sr
    winstep=hop_length/sr    
    mfcc_speech = python_speech_features.mfcc(signal=y, samplerate=sr, winlen=winlen, winstep=winstep,
                                          numcep=n_mfcc, nfilt=n_mels, nfft=n_fft, lowfreq=fmin, highfreq=fmax,
                                          preemph=0.0, ceplifter=0, appendEnergy=False, winfunc=hamming)   
    return mfcc_speech

In [10]:
def extract_feature_vector(chroma_data):
    num_samples, num_features = np.shape(chroma_data)
    print("Num features %d num samples %d " % (num_features, num_samples))
    freq_vals = tf.argmax(chroma_data)
    hist, bins = np.histogram(freq_vals, bins=range(num_features + 1))
    return hist.astype(float) / num_samples

In [32]:
def get_dataset(audio_file):
    chroma_data = get_chromagram(audio_file)
    print('chroma_data', np.shape(chroma_data))
    chroma_length = np.shape(chroma_data)[0]
    print('chroma_length', chroma_length)
    xs = []
    for i in range(chroma_length // segment_size):
        chroma_segment = chroma_data[i*segment_size:(i+1)*segment_size, :]
        x = extract_feature_vector(chroma_segment)
        if len(xs) == 0:
            xs = x
        else:
            xs = np.vstack((xs, x))
    return xs

In [33]:
def initial_cluster_centroids(X, k):
    return X[0:k, :]

In [34]:
def assign_cluster(X, centroids):
    expanded_vectors = tf.expand_dims(X, 0) # 1, 5, 12
    expanded_centroids = tf.expand_dims(centroids, 1) #2, 1, 12
    distances = tf.reduce_sum(tf.square(tf.subtract(expanded_vectors, expanded_centroids)), 2) #2, 5
    mins = tf.argmin(distances, 0)
    return mins

In [35]:
def recompute_centroids(X, Y):
    sums = tf.math.unsorted_segment_sum(X, Y, k)
    counts = tf.math.unsorted_segment_sum(tf.ones_like(X), Y, k)
    return sums / counts

In [37]:
X = get_dataset('../data/TalkingMachinesPodcast.wav')
print(np.shape(X))
centroids = initial_cluster_centroids(X, k)
i, converged = 0, False
while not converged and i < max_iterations:
    i += 1
    Y = assign_cluster(X, centroids)
    centroids = recompute_centroids(X, Y)
    if i % 50 == 0:
        print('iteration', i)

segments = Y
for i in range(len(segments)):
    seconds = (i * segment_size) / float(10)
    min, sec = divmod(seconds, 60)
    time_str = '{}m {}s'.format(min, sec)
    print(time_str, segments[i].numpy())

filename ../data/TalkingMachinesPodcast.wav 
chroma_data (626, 22)
chroma_length 626
Num features 22 num samples 50 
Num features 22 num samples 50 
Num features 22 num samples 50 
Num features 22 num samples 50 
Num features 22 num samples 50 
Num features 22 num samples 50 
Num features 22 num samples 50 
Num features 22 num samples 50 
Num features 22 num samples 50 
Num features 22 num samples 50 
Num features 22 num samples 50 
Num features 22 num samples 50 
(12, 22)
iteration 50
iteration 100
0.0m 0.0s 0
0.0m 5.0s 1
0.0m 10.0s 0
0.0m 15.0s 0
0.0m 20.0s 1
0.0m 25.0s 0
0.0m 30.0s 0
0.0m 35.0s 0
0.0m 40.0s 0
0.0m 45.0s 0
0.0m 50.0s 0
0.0m 55.0s 0
