# 2D t-SNE from Autoencoder bottleneck using mel spectrogram
### From the 15-second-from-the-middle audio dataset

#### Visualization using vega and altair

pip install vega

pip install altair vega_datasets

### Mounting Google Drive (when working in Google Colab)

In [None]:
# Mount Google Drive inside the Colab runtime; run this cell first.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

### Importing

In [None]:
import glob
import os
import sys

import altair as alt
import librosa
import numpy as np
import pandas as pd
import scipy.fftpack as fft
import scipy.signal
import vega
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Read audio

In [None]:
#File reading
def find_wav_files(root_dir):
    """Return the sorted paths of every ``.wav`` file found under *root_dir*.

    The directory tree is walked recursively. Only names that END in
    ``.wav`` are kept (a substring test would also accept e.g.
    ``take.wav.bak``). The result is sorted because ``os.walk`` yields
    entries in an arbitrary, filesystem-dependent order, and a stable order
    is needed for the seeded train/test split to be reproducible.
    """
    return sorted(
        os.path.join(root, name)
        for root, _dirs, files in os.walk(root_dir)
        for name in files
        if name.endswith('.wav')
    )


print('File reading...')
all_dirs = find_wav_files('/content/drive/My Drive/MaSC Research/Datasets/Test0')
file_no = len(all_dirs)
print('Number of files: ' + str(file_no))

### Compute mel spectrograms

In [None]:
#Feature Computation
print('Feature Computation...')

# Clip length in seconds, used both to filter files and to load them.
# The original loaded only 10 s although it kept 15 s files; the flattened
# length quoted below (82688 = 128 mel bands x 646 frames) corresponds to
# 15 s at librosa's default sample rate and hop length, so 15 s is used.
CLIP_SECONDS = 15.

# The three lists are parallel: entry k of each describes the same file.
all_db = []      # flattened mel spectrogram of each accepted file
all_mean = []    # mean STFT power (dB) of each accepted file
file_names = []  # path of each accepted file

for i, path in enumerate(all_dirs):
    #only consider audio that is exactly 15seconds
    if librosa.get_duration(filename=path) == CLIP_SECONDS:
        file_names.append(path)

        #Load file
        y, sr = librosa.core.load(path, duration=CLIP_SECONDS)

        #Features
        S = librosa.core.stft(y=y)
        S_db = librosa.core.power_to_db(np.abs(S)**2)
        all_mean.append(np.mean(S_db))

        S_mel = librosa.feature.melspectrogram(y=y, sr=sr)
        all_db.append(S_mel.flatten()) #length: 82688

    # Report how many files have been examined so far (1-based count).
    sys.stdout.write("\rLoading %i recordings." % (i + 1))
    sys.stdout.flush()

# Finish the carriage-return progress line before printing anything else.
sys.stdout.write("\n")

print('Number of files with a duration of 15 seconds: ' + str(len(file_names)))

if not all_db:
    # Fail with a clear message instead of an IndexError on all_db[0].
    raise RuntimeError('No recordings with a duration of exactly 15 seconds were found.')

feature_no = len(all_db[0])

### Train-Test Split

In [None]:
# Hold out 10% of the recordings; the file paths play the role of "labels"
# so that every feature row can be traced back to its recording later on.
X_train, X_test, y_train, y_test = train_test_split(
    np.asarray(all_db),
    np.asarray(file_names),
    test_size=0.1,
    random_state=42,
)
print(X_train.shape)

### Model

In [None]:
#Autoencoder (2000, 100, 50, 100, 2000, 82688)
# Fully connected autoencoder over the flattened mel spectrogram.
# The 50-unit layer is the bottleneck whose activations are visualised later.
input_db = Input(shape=(feature_no,))

# Encoder: feature_no -> 2000 -> 100 -> 50
encoded = Dense(2000, activation='relu')(input_db)
encoded = Dense(100, activation='relu')(encoded)
encoded = Dense(50, activation='relu')(encoded)

# Decoder: 50 -> 100 -> 2000 -> feature_no
decoded = Dense(100, activation='relu')(encoded)
decoded = Dense(2000, activation='relu')(decoded)
decoded = Dense(feature_no, activation='relu')(decoded)

autoencoder = Model(input_db, decoded)  # full reconstruction model
encoder = Model(input_db, encoded)      # shares weights; outputs the bottleneck

# Mean squared error instead of binary cross-entropy: the targets are raw
# mel power values and the output layer is ReLU, so neither side is confined
# to [0, 1] as binary cross-entropy requires.
autoencoder.compile(optimizer='adadelta', loss='mse')

### Training

In [None]:
# Train the autoencoder to reconstruct its own input.
# Only validation_data is passed: Keras ignores validation_split when
# validation_data is given, so the former `validation_split = 0.2` had no
# effect and merely suggested a split that never happened.
autoencoder.fit(X_train, X_train,
                epochs=50,
                batch_size=256,
                shuffle=True,
                validation_data=(X_test, X_test))

# Bottleneck activations of the training recordings (one row per file).
encoded_db = encoder.predict(X_train)

### Visualize

In [None]:
#Standardization
scl1 = StandardScaler()
all_db_scaled = scl1.fit_transform(encoded_db)

#TSNE
db_red2 = TSNE(n_components=2).fit_transform(all_db_scaled)

#KMeans
kmeans = KMeans(n_clusters=5, random_state=0).fit(db_red2)
clusters = kmeans.predict(db_red2)

#axis dataframes
# The two t-SNE coordinates, one value per training recording.
db1 = db_red2[:, 0]
db2 = db_red2[:, 1]

# encoded_db was computed from X_train only, so every column must be aligned
# with y_train. all_mean covers ALL accepted files (in file_names order), so
# look up the mean of each training file by its path.
train_files = np.asarray(y_train)
mean_by_file = dict(zip(file_names, all_mean))
train_mean = np.asarray([mean_by_file[name] for name in train_files])


def _embedding_frame(color_values):
    """Build the plotting table: t-SNE x/y, a colour value and the file path."""
    return pd.DataFrame({'x': db1, 'y': db2, 'color': color_values,
                         'path': train_files, 'filename': train_files})


def _scatter(frame, color_type, **mark_options):
    """Interactive scatter plot of *frame*; color_type is 'N' or 'Q'."""
    return (alt.Chart(frame)
            .mark_circle(**mark_options)
            .encode(x='x', y='y', color='color:' + color_type,
                    href='path', tooltip=['filename'])
            .interactive())


#Clusters
df1 = _embedding_frame(clusters)
chart1 = _scatter(df1, 'N', opacity=0.6, size=60)

#Also make an Intensity graph
df2 = _embedding_frame(train_mean)
chart2 = _scatter(df2, 'Q', opacity=0.6, size=60)

#Combined AE Clusters + Intensity
# Large dots coloured by cluster with small dots coloured by intensity on top.
df3 = _embedding_frame(clusters)
chart3 = _scatter(df3, 'N', size=80)
df4 = _embedding_frame(train_mean)
chart4 = _scatter(df4, 'Q', size=30)

display(chart1)
display(chart2)
# The two layers colour by different kinds of data (nominal vs quantitative),
# so give each layer its own colour scale and legend.
display((chart3 + chart4).resolve_scale(color='independent'))