# Libraries 





In [None]:
import librosa
import librosa.display
import numpy as np
import IPython.display as ipd
import pandas as pd
import pickle
import warnings 

from matplotlib import pyplot as plt
from IPython.display import Audio
from IPython.core.display import display
from librosa.core import istft
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix


# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Directories 

In [None]:
# Folders that hold the .wav files of the train and test splits.
# Point these at your own copies before running the notebook. Keep the trailing
# slash: file names are appended to these strings directly further down.
train_audio, test_audio = (
    '/Users/divyansh/Downloads/nsynth-train/audio/',
    '/Users/divyansh/Downloads/nsynth-test/audio/',
)


In [None]:
# Load the examples.json metadata of each split into a DataFrame.
# Replace the paths with your own before running. orient='index' turns every
# top-level JSON key into a row label.

# Metadata for the training split
df_train = pd.read_json(
    path_or_buf='/Users/divyansh/Downloads/nsynth-train/examples.json',
    orient='index',
)

# Metadata for the test split
df_test = pd.read_json(
    path_or_buf='/Users/divyansh/Downloads/nsynth-test/examples.json',
    orient='index',
)


# Preprocessing

In [None]:
# Preview the first five rows of the training metadata.
df_train.head(5)


In [None]:
# (rows, columns) of the training metadata.
df_train.shape

In [None]:
# Bar chart of the number of training examples per instrument family.
# The x-axis is pinned to family ids 0-10 so that families without any rows
# still get a (blank) slot, which makes the class imbalance easy to see.
ax = (
    df_train['instrument_family']
    .value_counts()
    .reindex(np.arange(0, 11, 1))
    .plot.bar()
)
ax.set_title('Instrument Family Distribution for Training')
ax.set_xlabel('Instrument')
ax.set_ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Number of training examples per instrument family, smallest family first.
classes = df_train.instrument_family.value_counts(ascending=True)
classes


In [None]:
# Balance the training metadata: keep at most 400 randomly chosen examples per
# instrument family.

# Instrument family 9 is excluded because of its negligible contribution.
# Dropping it before sampling avoids drawing rows that would be discarded
# anyway, and avoids a failure when that family has fewer than 400 examples.
df_train = df_train[df_train['instrument_family'] != 9]

# Sample each family separately and stitch the pieces back together.
# - min(...) keeps DataFrame.sample from raising ValueError when a family has
#   fewer than 400 rows (sampling is done without replacement).
# - Iterating over the groups keeps every column, including
#   'instrument_family', independent of how groupby.apply treats the grouping
#   column in the installed pandas version.
# NOTE(review): no random_state is set, so every run selects different files.
df_train = pd.concat(
    [
        family_rows.sample(min(len(family_rows), 400))
        for _, family_rows in df_train.groupby('instrument_family')
    ]
)




In [None]:
# Preview the last five rows of the training metadata after sampling.
df_train.tail(5)

In [None]:
# Bar chart of the number of training examples per instrument family after
# sampling.
# The x-axis is pinned to family ids 0-10, the same range the other
# distribution plots in this notebook use; ids without rows show up as gaps.
# Sizing the axis from len(classes) + 1 made this cell depend on a variable
# from another cell and produced an extra, non-existent id 11 whenever all
# eleven families were present.
df_train['instrument_family'].value_counts().reindex(np.arange(0, 11, 1)).plot.bar()
plt.title('Sampled Instrument Family Distribution for Training')
plt.xlabel('Instrument Family')
plt.ylabel('Number of samples')
plt.tight_layout()
plt.show()

In [None]:
# Per-family example counts of the training metadata after sampling,
# smallest family first.
df_train.instrument_family.value_counts(ascending=True)


In [None]:
# Row labels of the sampled training metadata as a plain list; these are the
# file names (without extension) used to locate the audio later on.
trainfile = df_train.index.to_list()

# Persist the list so the same selection of files can be reloaded later.
with open('/Users/divyansh/Downloads/Train-Test Data/trainfile.pickle', mode='wb') as handle:
    pickle.dump(trainfile, handle)



In [None]:
# Test DataFrame

# Bar chart of the number of test examples per instrument family (ids 0-10).
# No sampling is applied to the test metadata in this cell; the counts are
# those of df_test as loaded.
ax = (
    df_test['instrument_family']
    .value_counts()
    .reindex(np.arange(0, 11, 1))
    .plot.bar()
)
ax.set_title('Sampled Instrument Family Distribution for Testing')
ax.set_xlabel('Instrument Family')
ax.set_ylabel('Number of samples')
plt.tight_layout()
plt.show()







In [None]:
# Row labels of the test metadata as a plain list of file names (no extension).
testfile = df_test.index.to_list()



In [None]:
# Persist the list of test file names so it can be reloaded later.
with open('/Users/divyansh/Downloads/Train-Test Data/testfile.pickle', mode='wb') as handle:
    pickle.dump(testfile, handle)

# Feature Extraction

In [None]:
#Feature Extraction and Visualization

def features_extract(file):
    """
    Load one audio file and summarise it as a list of time-averaged features.

    The clip is decoded with librosa's default settings and leading/trailing
    silence is trimmed. Every time-varying feature is then averaged over its
    time axis, so each file yields fixed-length vectors.

    Parameters
    ----------
    file : str
        Path of the audio file to analyse.

    Returns
    -------
    list
        ``[harmonic, spectrogram, mfcc, chroma, contrast]`` where

        * ``harmonic`` is 1 if the mean of the harmonic component is greater
          than the mean of the percussive component, otherwise 0;
        * ``spectrogram`` is the time-averaged mel power spectrogram
          (128 mel bands);
        * ``mfcc`` holds the 13 time-averaged MFCCs;
        * ``chroma`` is the time-averaged CENS chroma vector;
        * ``contrast`` is the time-averaged spectral contrast.
    """
    # Decode the clip, then strip leading and trailing silence. The trim
    # indices returned alongside the signal are not needed.
    w, sr = librosa.load(file)
    w, _ = librosa.effects.trim(w)

    # Harmonic vs. percussive flag.
    # NOTE(review): this compares the plain means of two signed waveforms,
    # which may both sit close to zero; an energy comparison was presumably
    # intended. Left unchanged because it alters the stored feature — confirm.
    w_harmonic, w_percussive = librosa.effects.hpss(w)
    harmonic = 1 if np.mean(w_harmonic) > np.mean(w_percussive) else 0

    # Mel-scaled power spectrogram. The audio is passed as ``y=`` because
    # current librosa releases accept it by keyword only.
    mel_power = librosa.feature.melspectrogram(
        y=w, sr=sr, hop_length=512, n_mels=128, fmax=8000
    )

    # Log-power (dB) mel spectrogram, measured relative to the loudest bin.
    db_melscaled = librosa.power_to_db(mel_power, ref=np.max)

    # Averaging over time turns each (bands x frames) matrix into one vector.
    spectrogram = np.mean(mel_power, axis=1)

    # MFCCs are computed from the log-power mel spectrogram, so it is handed
    # over directly through ``S=``. Running an inverse STFT on the dB matrix
    # and re-analysing the result as audio does not yield the clip's MFCCs.
    mfcc = librosa.feature.mfcc(S=db_melscaled, sr=sr, n_mfcc=13)
    mfcc = np.mean(mfcc, axis=1)

    # Chroma energy (CENS), averaged over time.
    chroma = librosa.feature.chroma_cens(y=w, sr=sr)
    chroma = np.mean(chroma, axis=1)

    # Spectral contrast, averaged over time.
    contrast = librosa.feature.spectral_contrast(y=w, sr=sr)
    contrast = np.mean(contrast, axis=1)

    # The order of this list defines the feature columns of the train/test sets.
    return [harmonic, spectrogram, mfcc, chroma, contrast]

    
    



# Instrument Class 

In [None]:
def instrument_class(file):
    """
    Return the integer label of the instrument named in a file name.

    The label is the position, in the list below, of the first instrument name
    that occurs anywhere inside ``file``. ``None`` is returned when no
    instrument name occurs in it.
    """
    # The 11 instrument names used as labels; a name's position is its label.
    # NOTE(review): 'brass' comes before 'bass' here, so these labels may not
    # line up with the dataset's own instrument_family codes — confirm before
    # comparing the two.
    labels = ['brass', 'bass', 'flute', 'guitar', 'keyboard', 'mallet',
              'organ', 'reed', 'string', 'synth_lead', 'vocal']

    matches = (position for position, name in enumerate(labels) if name in file)
    return next(matches, None)

# Train Features

In [None]:
# Extract features for every training clip listed in trainfile.
#
# traindict maps each file name (no extension) to the feature list returned by
# features_extract. The audio path is built by joining the train_audio folder,
# the file name and the '.wav' extension.
traindict = {}

for file in trainfile:
    wav_path = ''.join([train_audio, file, '.wav'])
    traindict[file] = features_extract(wav_path)

# traindict now holds one entry per file name in trainfile.

In [None]:
# Turn the feature dictionary into a DataFrame: one row per file name, one
# column per feature returned by features_extract.
train_df = ftrain = pd.DataFrame.from_dict(
    traindict,
    orient='index',
    columns=['harmonic', 'spectrogram', 'mfcc', 'chroma', 'contrast'],
)
train_df

In [None]:
# The classifier rejects a DataFrame whose cells hold whole sequences, so every
# array-valued feature column is expanded into one scalar column per element.
#
# Each expanded frame is indexed by train_df.index, i.e. by the file names the
# features were actually computed for. Indexing by the metadata frame's index
# only works while both frames happen to have identical rows in identical order.

#Mel-Spectrogram
train_spectrogram = pd.DataFrame(train_df.spectrogram.values.tolist(), index=train_df.index)
train_spectrogram = train_spectrogram.add_prefix('mspec_')

#MFCC
train_cepstrum = pd.DataFrame(train_df.mfcc.values.tolist(), index=train_df.index)
train_cepstrum = train_cepstrum.add_prefix('mfcc_')

#ChromaEnergy
train_chromaenergy = pd.DataFrame(train_df.chroma.values.tolist(), index=train_df.index)
train_chromaenergy = train_chromaenergy.add_prefix('chroma_')

#Contrast
train_spectralcontrast = pd.DataFrame(train_df.contrast.values.tolist(), index=train_df.index)
train_spectralcontrast = train_spectralcontrast.add_prefix('contrast-')

# Drop the original array-valued columns; only 'harmonic' remains in train_df.
train_df = train_df.drop(labels=['spectrogram', 'mfcc', 'chroma', 'contrast'], axis=1)

# Join the scalar columns side by side, keeping only row labels present in
# every frame (join='inner').
train_features = pd.concat(
    [train_df, train_spectrogram, train_cepstrum, train_chromaenergy, train_spectralcontrast],
    axis=1,
    join='inner',
)
train_features



In [None]:
# Derive the target label of every training row from its file name (the row
# label of train_features).
targets_train = [
    instrument_class(name) for name in train_features.index.tolist()
]

# Store the labels next to the features.
train_features['targets'] = targets_train
train_features


In [None]:
# Persist the training features together with their targets.
with open('/Users/divyansh/Downloads/Train-Test Data/train_features.pickle', mode='wb') as handle:
    pickle.dump(train_features, handle)


# Test Features

In [None]:
# Extract features for every test clip listed in testfile.
#
# testdict maps each file name (no extension) to the feature list returned by
# features_extract. The audio path is built by joining the test_audio folder,
# the file name and the '.wav' extension.
testdict = {}

for file in testfile:
    wav_path = ''.join([test_audio, file, '.wav'])
    testdict[file] = features_extract(wav_path)

# testdict now holds one entry per file name in testfile.

In [None]:
# Turn the feature dictionary into a DataFrame: one row per file name, one
# column per feature returned by features_extract.
test_df = testfeatures = pd.DataFrame.from_dict(
    testdict,
    orient='index',
    columns=['harmonic', 'spectrogram', 'mfcc', 'chroma', 'contrast'],
)
test_df


In [None]:
# Same expansion as for the training features: the classifier rejects cells
# that hold whole sequences, so every array-valued column is spread into one
# scalar column per element, indexed by the test file names.

# Mel-spectrogram
spectrogram_test = pd.DataFrame(
    test_df['spectrogram'].values.tolist(), index=test_df.index
).add_prefix('mspec_')

# MFCC
mfcc_test = pd.DataFrame(
    test_df['mfcc'].values.tolist(), index=test_df.index
).add_prefix('mfcc_')

# Chroma energy
chroma_test = pd.DataFrame(
    test_df['chroma'].values.tolist(), index=test_df.index
).add_prefix('chroma_')

# Spectral contrast
contrast_test = pd.DataFrame(
    test_df['contrast'].values.tolist(), index=test_df.index
).add_prefix('contrast-')

# Drop the original array-valued columns; only 'harmonic' remains in test_df.
test_df = test_df.drop(columns=['spectrogram', 'mfcc', 'chroma', 'contrast'])

# Join the scalar columns side by side, keeping only row labels present in
# every frame (join='inner').
test_features = pd.concat(
    [test_df, spectrogram_test, mfcc_test, chroma_test, contrast_test],
    axis=1,
    join='inner',
)
test_features

In [None]:
# Derive the target label of every test row from its file name (the row label
# of test_features).
targets_test = [
    instrument_class(name) for name in test_features.index.tolist()
]

# Store the labels next to the features.
test_features['targets'] = targets_test
test_features


In [None]:
#Saving the features list for Test set in pickle file

with open('/Users/divyansh/Downloads/Train-Test Data/test_features.pickle','wb') as f:
    pickle.dump(test_features,f)


## References

### 1) [Enthought](https://www.youtube.com/watch?v=MhOdbtPhbLU) 

- Even though Librosa is quite an easy library to use, I followed this talk to understand how to work with such signals to get interesting representations and plots.

### 2) [Valerio Velardo - The Sound of AI](https://www.youtube.com/watch?v=WJI-17MNpdE)
- I had a tough time understanding how MFCCs work computationally, i.e. how to calculate them step by step in Python; this channel has really clear explanations of Mel spectrograms and MFCCs.












## Shortcomings of the Code









- The NSynth training set is about 25-26 GB in size, so the validation set was used as the training set to train the model instead.



- The model has not been tested on actual songs of reasonable length, where overlapping features would have to be separated to work out which of the multiple instruments are playing in the background.




- There is no live visualization of pitch, amplitude, etc. while an audio clip plays, similar to the equaliser display on a music system. [Not to adjust the frequency knobs, of course; just getting a feel for what is happening over the whole course of a song would have been interesting.]