# Audio Classification

In this notebook we will try to classift audio spectograms into one of four classes: Steve Jobs, Tim Cook, Bill Gates and Elon Musk. 

Tim Cook
 - https://www.youtube.com/watch?v=2C2VJwGBRRw
 - https://www.youtube.com/watch?v=Jr4LC1q1N_g
 - https://www.youtube.com/watch?v=5fJMBW-9LTI
 
Elon Musk
 - https://www.youtube.com/watch?v=BDIRabVP24o
 - https://www.youtube.com/watch?v=nTWkdhmTyVk
 - https://www.youtube.com/watch?v=wNxAAMJBWEk




## 1. Dependencies

In [1]:
import os, sys
from matplotlib import pyplot as plt
import tensorflow as tf 
import tensorflow_io as tfio
import numpy as np
import librosa
import librosa.display

2023-09-20 18:13:50.149185: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 2. Data

The __training__ data will be used to train our model on the different patterns in Steve Jobs, Tim Cook and Bill Gates. The __validation__ data is used by the model to track how well it's doing in the current epoch. It serves as a benchmark to ensure the model isn't overfitting or underfitting and helps us adjust hyperparameters for better performance. The __test__ data represents real-world scenarios or additional tests we may want to perform to assess the model's generalization and real-world applicability.

- Training (75%)
- Validation (20%)
- Test (5%)

In [76]:
def split_mp3_to_spectograms(input_mp3, output_directory, discard_last_seconds=10, segment_duration=5, skip_duration=20):
    
    # Load the MP3 file
    y, sr = librosa.load(input_mp3)  
    y = y[:-int(discard_last_seconds * sr)] #Discard last 10 seccons
    total_duration = librosa.get_duration(y=y, sr=sr) 
    num_segments = int(np.ceil((total_duration - skip_duration) / segment_duration))

    # Split the audio into segments and save the spectrograms
    for i in range(num_segments):
        start_time = skip_duration + i * segment_duration
        end_time = min(skip_duration + (i + 1) * segment_duration, total_duration)
        segment = y[int(start_time * sr):int(end_time * sr)]

        # Calculate the spectrogram for the segment
        spectrogram = librosa.feature.melspectrogram(y=segment, sr=sr)
        
        # Crop the spectrogram to remove whitespace
        nonzero_rows, nonzero_cols = np.where(spectrogram > 0)
        spectrogram = spectrogram[np.min(nonzero_rows):np.max(nonzero_rows)+1, np.min(nonzero_cols):np.max(nonzero_cols)+1]


        # Save the spectrogram as an image
        output_filename = os.path.join(output_directory, f'spectrogram_{i + 1}.png')
        plt.figure(figsize=(10, 4))
        librosa.display.specshow(librosa.power_to_db(spectrogram, ref=np.max), y_axis=None, x_axis=None)
        #plt.axis('off')
        #plt.colorbar(format='%+2.0f dB')
        #plt.title(f'Spectrogram of segment {i + 1}')
        plt.savefig(output_filename, bbox_inches='tight',  pad_inches=0)
        plt.close()

In [17]:
MUSK_DIR = os.path.join('data', 'Elon Musk')
COOK_DIR  = os.path.join('data', 'Tim Cook')

In [338]:
for file in os.listdir(MUSK_DIR):
    if file.endswith(".mp3"):
        mp3_file = os.path.join(MUSK_DIR, file)
        output_directory = MUSK_DIR + '/spectrograms'
        split_mp3_to_spectograms(mp3_file, output_directory)

In [295]:
for file in os.listdir(COOK_DIR):
    if file.endswith(".mp3"):
        mp3_file = os.path.join(COOK_DIR, file)
        output_directory = COOK_DIR + '/spectrograms'
        split_mp3_to_spectograms(mp3_file, output_directory)

## 3. Create Tensorflow dataset

In [222]:
musk = tf.data.Dataset.list_files(MUSK_DIR + '/spectrograms' + '/*.png')
cook = tf.data.Dataset.list_files(COOK_DIR + '/spectrograms' + '/*.png')

In [224]:
len(musk), len(cook)

(209, 251)

In [225]:
#Add labels and combine positive and negative samples
elon_musk = tf.data.Dataset.zip((musk, tf.data.Dataset.from_tensor_slices(tf.ones(len(musk))))) #1
tim_cook = tf.data.Dataset.zip((cook, tf.data.Dataset.from_tensor_slices(tf.zeros(len(cook))))) #2
data = elon_musk.concatenate(tim_cook)

In [226]:
data.as_numpy_iterator().next()

(b'data/Elon Musk/spectrograms/spectrogram_144.png', 1.0)

## 4. Preprocessing

In [11]:
from PIL import Image

In [227]:
@tf.function
def preprocess(file_path, label):
    # Decode the PNG image
    spectrogram_data = tf.io.read_file(file_path)
    spectrogram_image = tf.image.decode_png(spectrogram_data, channels=3)  # Adjust channels as needed
    
    # Redimensionar la imagen a las dimensiones deseadas (308x775)
    spectrogram_image = tf.image.resize(spectrogram_image, [308, 775])

    # Convert the image to a tensor
    spectrogram = tf.cast(spectrogram_image, tf.float32) / 255.0  # Scale to [0, 1]
    
    return spectrogram, label

In [228]:
data = data.map(preprocess)

In [229]:
sample, label = data.as_numpy_iterator().next()
label

1.0

In [230]:
data = data.cache()
data = data.shuffle(buffer_size=1000)
data = data.batch(batch_size=8)
data = data.prefetch(8)

In [231]:
sample, label = data.as_numpy_iterator().next()

In [232]:
sample

array([[[[0.        , 0.        , 0.        ],
         [0.        , 0.        , 0.        ],
         [0.        , 0.        , 0.        ],
         ...,
         [0.        , 0.        , 0.        ],
         [0.        , 0.        , 0.        ],
         [0.        , 0.        , 0.        ]],

        [[0.        , 0.        , 0.        ],
         [0.        , 0.        , 0.00784314],
         [0.        , 0.        , 0.01176471],
         ...,
         [0.33333334, 0.07843138, 0.46666667],
         [0.33333334, 0.07843138, 0.46666667],
         [0.3137255 , 0.07058824, 0.4392157 ]],

        [[0.        , 0.        , 0.        ],
         [0.        , 0.        , 0.01176471],
         [0.        , 0.        , 0.01568628],
         ...,
         [0.36078432, 0.08627451, 0.49803922],
         [0.36078432, 0.08627451, 0.49803922],
         [0.3372549 , 0.07843138, 0.47058824]],

        ...,

        [[0.        , 0.        , 0.        ],
         [0.2       , 0.05882353, 0.39607844]

In [233]:
label

array([0., 0., 0., 0., 1., 0., 1., 0.], dtype=float32)

## 5. Train, test and validation data

In [234]:
total_size = len(data)
train_size = int(len(data)*0.70)
val_size = int(len(data)*0.15)
test_size = int(len(data)*0.15)

total_size, train_size, val_size, test_size

(58, 40, 8, 8)

In [235]:
train = data.take(train_size+1)
test = data.skip(train_size+1).take(test_size)
val = data.skip(train_size+test_size+1).take(val_size)

In [236]:
samples, labels = train.as_numpy_iterator().next()

In [237]:
samples.shape #spectogram has the shape of (308,775,3)

(8, 308, 775, 3)

In [238]:
labels

array([1., 1., 0., 1., 0., 1., 1., 0.], dtype=float32)

In [239]:
model_input_shape = (samples.shape[1], samples.shape[2], samples.shape[3])
model_input_shape

(308, 775, 3)

In [240]:
train

<_TakeDataset element_spec=(TensorSpec(shape=(None, 308, 775, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.float32, name=None))>

The size is 308 x 755 with 3 channels RGB and 16 is the number of examples that is the batch size we have selected

## 5. Build, Compile and Fit the Model

In [38]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, Flatten, MaxPooling2D, Dropout

In [241]:
model = Sequential()

#Input layers
model.add(Conv2D(32, (3,3), activation='relu', input_shape=model_input_shape))
model.add(MaxPooling2D((2, 2)))

#1st Hidden layers
model.add(Conv2D(64, (3,3), activation='relu'))
model.add(MaxPooling2D((2, 2)))

#2nd Hidden layers
model.add(Conv2D(64, (3,3), activation='relu'))
model.add(MaxPooling2D((2, 2)))

#3rd Hideen layers group
model.add(Flatten())
model.add(Dense(128, activation='relu'))

#Output layer
#model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [242]:
model.compile('Adam', loss='BinaryCrossentropy', metrics=[tf.keras.metrics.Recall(),tf.keras.metrics.Precision()])

In [243]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_12 (Conv2D)          (None, 306, 773, 32)      896       
                                                                 
 max_pooling2d_12 (MaxPooli  (None, 153, 386, 32)      0         
 ng2D)                                                           
                                                                 
 conv2d_13 (Conv2D)          (None, 151, 384, 64)      18496     
                                                                 
 max_pooling2d_13 (MaxPooli  (None, 75, 192, 64)       0         
 ng2D)                                                           
                                                                 
 conv2d_14 (Conv2D)          (None, 73, 190, 64)       36928     
                                                                 
 max_pooling2d_14 (MaxPooli  (None, 36, 95, 64)       

In [244]:
hist = model.fit(train, epochs=10, validation_data=val)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
plt.title('Loss')
plt.plot(hist.history['loss'], 'r')
plt.plot(hist.history['val_loss'], 'b')
plt.show()

In [None]:
plt.title('Precision')
plt.plot(hist.history['precision_1'], 'r')
plt.plot(hist.history['val_precision_1'], 'b')
plt.show()

In [None]:
plt.title('Recall')
plt.plot(hist.history['recall_1'], 'r')
plt.plot(hist.history['val_recall_1'], 'b')
plt.show()

## 6. Save the model

In [None]:
model.save('model.h5')

## 7. Make predictions

In [None]:
stored_model = keras.models.load_model('model.h5')

## 7.1 Get one batch and make the prediction

In [245]:
X_test, y_test = test.as_numpy_iterator().next()

In [246]:
yhat = model.predict(X_test)



In [247]:
yhat = [1 if prediction > 0.50 else 0 for prediction in yhat]

In [248]:
yhat

[0, 1, 1, 0, 1, 0, 0, 0]

In [249]:
y_test.astype(int)

array([0, 1, 1, 0, 1, 0, 0, 0])

In [250]:
y_test.astype(int) == yhat

array([ True,  True,  True,  True,  True,  True,  True,  True])

## 8. Make predictions given an .mp3

- Tim Cook: https://www.youtube.com/watch?v=prxi0LYp8yc

- Elon Musk: https://www.youtube.com/watch?v=M-ZH3psUbfU

In [251]:
UNSEEN_MUSK_DIR = os.path.join('data', 'Unseen', 'Elon Musk')
UNSEEN_COOK_DIR = os.path.join('data', 'Unseen', 'Tim Cook')

In [252]:
UNSEEN_MUSK_DIR

'data/Unseen/Elon Musk'

In [253]:
for file in os.listdir(UNSEEN_MUSK_DIR):
    print(file)

.DS_Store
1.mp3
spectrograms


In [255]:
for file in os.listdir(UNSEEN_MUSK_DIR):
    if file.endswith(".mp3"):
        mp3_file = os.path.join(UNSEEN_MUSK_DIR, file)
        output_directory = UNSEEN_MUSK_DIR + '/spectrograms'
        split_mp3_to_spectograms(mp3_file, output_directory, 403, 5, 35) #to get from 0:35 to 1:35 (1 minute)

In [91]:
for file in os.listdir(UNSEEN_COOK_DIR):
    if file.endswith(".mp3"):
        mp3_file = os.path.join(UNSEEN_COOK_DIR, file)
        output_directory = UNSEEN_COOK_DIR + '/spectrograms'
        split_mp3_to_spectograms(mp3_file, output_directory, 48, 5, 21) #to get from 0:21 to 0:50

In [256]:
@tf.function
def preprocess_v2(file_path):
    # Decode the PNG image
    spectrogram_data = tf.io.read_file(file_path)
    spectrogram_image = tf.image.decode_png(spectrogram_data, channels=3)  # Adjust channels as needed
    
    # Redimensionar la imagen a las dimensiones deseadas (308x775)
    spectrogram_image = tf.image.resize(spectrogram_image, [308, 775])

    # Convert the image to a tensor
    spectrogram = tf.cast(spectrogram_image, tf.float32) / 255.0  # Scale to [0, 1]
    
    return spectrogram

In [257]:
musk = tf.data.Dataset.list_files(UNSEEN_MUSK_DIR + '/spectrograms' + '/*.png')
cook = tf.data.Dataset.list_files(UNSEEN_COOK_DIR + '/spectrograms' + '/*.png')

elon_musk = tf.data.Dataset.zip((musk)) 
tim_cook = tf.data.Dataset.zip((cook)) 

In [258]:
elon_musk.as_numpy_iterator().next()

b'data/Unseen/Elon Musk/spectrograms/spectrogram_11.png'

In [259]:
tim_cook.as_numpy_iterator().next()

b'data/Unseen/Tim Cook/spectrograms/spectrogram_8.png'

In [260]:
elon_musk = elon_musk.map(preprocess_v2)
tim_cook = tim_cook.map(preprocess_v2)

In [261]:
elon_musk = elon_musk.cache()
elon_musk = elon_musk.shuffle(buffer_size=1000)
elon_musk = elon_musk.batch(batch_size=8)
elon_musk = elon_musk.prefetch(8)

In [262]:
tim_cook = tim_cook.cache()
tim_cook = tim_cook.shuffle(buffer_size=1000)
tim_cook = tim_cook.batch(batch_size=8)
tim_cook = tim_cook.prefetch(8)

In [263]:
X_unseen_elon = elon_musk.as_numpy_iterator().next()
X_unseen_tim = tim_cook.as_numpy_iterator().next()

## 8.1 Predict Elon Musk audio

In [325]:
def predict(item):
    yhat_unseen = model.predict(item)
    print(yhat_unseen)
    yhat_unseen = [1 if prediction > 0.80 else 0 for prediction in yhat_unseen]
    yhat_unseen_prediction.append(yhat_unseen)

In [326]:
yhat_unseen_prediction = []
X_unseen = [predict(item) for item in elon_musk] #Iterate through elon_musk

[[0.9992862]
 [0.9999893]
 [0.9850564]
 [0.7956772]
 [0.9393171]
 [0.9911692]
 [0.9997157]
 [0.9982281]]
[[0.9920328 ]
 [0.5944643 ]
 [0.99997973]
 [0.8040521 ]
 [0.23208784]]


In [327]:
yhat_unseen_prediction

[[1, 1, 1, 0, 1, 1, 1, 1], [1, 0, 1, 1, 0]]

In [328]:
result = []
for sub in yhat_unseen_prediction:
    result.extend(sub)

In [329]:
result

[1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0]

In [330]:
n_ones_elon = np.count_nonzero(np.array(result) == 1)
n_ceros_elon = np.count_nonzero(np.array(result) == 0)
n_ones_elon, n_ceros_elon

(10, 3)

In [331]:
if n_ones_elon > n_ceros_elon:
    print("The audio is from Elon Musk")
elif n_ones_elon < n_ceros_elon:
    print("The audio is from Tim Cook")
else:
    print("It has not been possible to differentiate clearly")

The audio is from Elon Musk


## 8.2 Predict Tim Cook audio

In [332]:
yhat_unseen_prediction = []
X_unseen = [predict(item) for item in tim_cook] #Iterate through tim_cook

[[0.9943121 ]
 [0.9984041 ]
 [0.00307285]
 [0.1944849 ]
 [0.6511628 ]
 [0.10884279]
 [0.68166167]
 [0.8706842 ]]
[[0.69202095]
 [0.9992778 ]
 [0.16241477]
 [0.04044723]]


In [333]:
yhat_unseen_prediction

[[1, 1, 0, 0, 0, 0, 0, 1], [0, 1, 0, 0]]

In [334]:
result = []
for sub in yhat_unseen_prediction:
    result.extend(sub)

In [335]:
result

[1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0]

In [336]:
n_ones_tim = np.count_nonzero(np.array(result) == 1)
n_ceros_tim = np.count_nonzero(np.array(result) == 0)
n_ones_tim, n_ceros_tim

(4, 8)

In [337]:
if n_ones_tim > n_ceros_tim:
    print("The audio is from Elon Musk")
elif n_ones_tim < n_ceros_tim:
    print("The audio is from Tim Cook")
else:
    print("It has not been possible to differentiate clearly")

The audio is from Tim Cook
