# Data Preprocessing

### Import Libraries

In [115]:
import librosa
import librosa.display
from os import listdir
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

### DataGTZAN music/speech collection

The Dataset collected for the purposes of music/speech discrimination. 
The dataset consists of 120 tracks, each 30 seconds long. 
Each class (music/speech) has 60 examples. 
The tracks are all 22050Hz Mono 16-bit audio files in .wav format.

Source : http://marsyas.info/downloads/datasets.html

In [116]:
CorpusPath = 'Data/gtzan-musicspeech-collection/Corpus/'
Corpusfiles = listdir(CorpusPath)
fn =len(Corpusfiles)

### Explore a shorter audio sample

In [117]:
import random
selected = random.choice(Corpusfiles)
print('Playing the file: '+selected)
ipd.Audio(CorpusPath+selected) 

Playing the file: corea1.wav


### Take a look at its Spectrogram

In [118]:
x, Fs = librosa.load(CorpusPath+selected)
X = librosa.stft(x)
# Xdb = librosa.amplitude_to_db(abs(X))
# plt.figure(figsize=(14, 5))
# librosa.display.specshow(Xdb, sr=Fs, x_axis='time', y_axis='hz')

### Import Basic Labeling Data

This was manually labels by me. Each audio clip is classified into Music/Speech. A '1' represents Music and '0' represents Speech.

In [119]:
import pandas as pd
basicdata = pd.read_csv('Data/gtzan-musicspeech-collection/datalables.txt')
basicdata.head()
basicdata.shape

(128, 2)

In [120]:
n_mfcc = 120
%store n_mfcc
CHUNK = 1024
%store CHUNK
frame = CHUNK*4
mfcccol = ['Filename']+['MFC'+str(i) for i in range(n_mfcc)]
# melcol = ['Filename']+['Mel'+str(j) for j in range(128)]

mfccdata = pd.DataFrame(columns=mfcccol)
# meldata = pd.DataFrame(columns=melcol)

Stored 'n_mfcc' (int)
Stored 'CHUNK' (int)


### Compute the MFCC and Mel Coefficients

In [121]:
import numpy as np
i=0
for file in Corpusfiles:
    print(str(fn-i)+ ' to go', end='\r', flush=True)
    i+=1
    x, Fs = librosa.load(CorpusPath+file)
    nS = len(x)
    npads = int(np.ceil(nS/frame))*frame-nS
    x = np.append(x, np.zeros(npads))
    segments = np.array_split(x, int(np.ceil(nS/frame)))
    for segment in segments:
        MFCCS = np.mean(librosa.feature.mfcc(y=segment, sr=Fs, n_mfcc=n_mfcc).T,axis=0)
        #     Mels   = np.mean(librosa.feature.melspectrogram(x, sr=Fs).T,axis=0)
        mfcrow   = [file]+list(MFCCS)
        mfccdata.loc[len(mfccdata)] = mfcrow
        #     melrow   = [file]+list(Mels)
        #     meldata.loc[len(meldata)] = melrow

1 to gooo

In [122]:
mfccdata = mfccdata.sort_values(['Filename'])
# meldata = meldata.sort_values(['Filename'])

In [123]:
mfccdata

Unnamed: 0,Filename,MFC0,MFC1,MFC2,MFC3,MFC4,MFC5,MFC6,MFC7,MFC8,...,MFC110,MFC111,MFC112,MFC113,MFC114,MFC115,MFC116,MFC117,MFC118,MFC119
6803,acomic.wav,-452.485882,110.522108,-6.203091,20.967435,16.284636,-7.412637,1.791724,-1.739583,-11.940215,...,0.000116,0.395817,0.226921,-0.483730,-0.194326,0.096578,-0.289534,-0.474719,0.362852,-0.146699
6745,acomic.wav,-242.290167,110.106266,-6.801406,7.214739,4.288809,5.100033,-11.211418,-7.833298,-2.325941,...,-0.060992,1.193607,1.004774,2.215885,-0.059986,0.604049,0.622065,0.177017,0.210173,0.645801
6746,acomic.wav,-237.773114,113.623767,-7.418866,9.591682,1.988646,-5.007758,-16.411686,-6.502290,-6.333039,...,0.648633,1.547008,2.224440,1.922615,0.306202,0.009298,0.462914,0.593212,0.906632,1.586772
6747,acomic.wav,-260.876273,119.695303,-16.341958,4.427830,1.866616,5.348380,-14.091294,-9.127876,-6.810789,...,1.056412,-0.145238,0.768431,0.056520,0.072339,0.284874,0.775982,-0.170908,0.367726,-0.087221
6748,acomic.wav,-254.079495,91.705930,-2.881500,20.775856,-0.608526,6.490037,-13.768665,-15.861203,-5.963068,...,0.200843,1.035666,1.189794,0.527738,0.096710,0.562118,-0.490987,-0.539511,-1.600447,-0.598782
6749,acomic.wav,-252.956609,94.987202,-17.608565,17.260579,15.064234,6.300241,-5.689058,-21.549230,-19.216557,...,1.776851,-0.287922,0.146713,1.479158,1.013513,0.439250,-1.463482,-0.621927,1.305035,1.215725
6750,acomic.wav,-212.092400,107.015052,3.837267,46.570156,10.985804,-5.488696,0.758503,-0.173916,-13.719400,...,0.653539,1.195836,1.555315,-0.514917,0.317892,1.105558,0.963274,-0.646769,0.054224,0.463167
6751,acomic.wav,-167.413658,101.680840,-11.538782,61.341183,30.588260,-22.433113,-6.058849,12.598626,-35.860182,...,0.834084,0.914354,0.203354,1.395867,0.328442,-0.042207,-0.424008,0.182495,0.396845,1.129495
6744,acomic.wav,-274.793450,108.465842,-1.362439,1.251969,8.642633,5.099052,-6.883344,-5.610451,-5.830263,...,0.578846,1.641397,0.676454,2.221654,1.632727,0.619466,0.942246,-0.201386,-0.537063,-0.165074
6752,acomic.wav,-165.435864,156.398445,-34.353559,15.693189,-15.818027,-30.808551,12.318534,3.849497,-36.024786,...,1.616228,0.134113,-0.864676,0.294150,0.510420,-0.934454,0.166013,0.875790,0.080291,0.700391


In [124]:
# meldata.head()

### Combine All Features

In [125]:
masterdata = pd.merge(basicdata, mfccdata, on='Filename')
# masterdata = pd.merge(masterdata, meldata, on='Filename')

In [126]:
masterdata.head()

Unnamed: 0,Filename,Music,MFC0,MFC1,MFC2,MFC3,MFC4,MFC5,MFC6,MFC7,...,MFC110,MFC111,MFC112,MFC113,MFC114,MFC115,MFC116,MFC117,MFC118,MFC119
0,acomic2.wav,0,-381.37243,115.904473,-7.767529,17.272836,18.665398,8.481184,-15.127962,5.863559,...,1.789633,0.343668,0.147979,-0.159037,0.173033,-0.020804,-0.359621,-0.389042,1.333884,1.188089
1,acomic2.wav,0,-452.582453,85.434822,5.612017,23.293566,23.7351,16.140874,-1.193595,-0.852877,...,-0.337636,0.158877,1.022224,0.090063,-0.038862,0.076092,0.308868,1.923489,0.472901,-0.82265
2,acomic2.wav,0,-430.9396,73.014229,6.432748,19.797547,18.280169,15.870616,-1.76686,-1.235784,...,0.188351,-0.188445,0.968007,0.397724,0.674539,1.628203,1.612114,0.91652,0.68393,-1.450503
3,acomic2.wav,0,-395.489977,50.313917,8.722072,21.144728,14.666915,18.752717,-1.368907,0.340301,...,0.367812,-0.827691,0.591447,1.030701,-0.153112,0.436231,1.093701,0.5469,0.191788,-0.138417
4,acomic2.wav,0,-404.422077,79.546213,7.253936,30.497939,-1.232805,17.624707,-13.324674,-6.360743,...,0.621292,-0.611388,1.6347,0.136049,-0.258694,0.496949,0.322957,0.32756,0.787358,0.251089


### Normalise All Features

In [127]:
# from sklearn.preprocessing import StandardScaler

# x = masterdata.iloc[:,2:].values 
# x_scaled= StandardScaler().fit_transform(x)
# normfeatures = pd.DataFrame(x_scaled)

In [128]:
# normfeatures.columns = [x+'_n' for x in list(masterdata.columns[2:])]

In [129]:
# normfeatures.head()

In [130]:
# masterdata = pd.concat([basicdata, normfeatures], axis=1)

### Create A Master Dataset

In [131]:
masterdata.head()

Unnamed: 0,Filename,Music,MFC0,MFC1,MFC2,MFC3,MFC4,MFC5,MFC6,MFC7,...,MFC110,MFC111,MFC112,MFC113,MFC114,MFC115,MFC116,MFC117,MFC118,MFC119
0,acomic2.wav,0,-381.37243,115.904473,-7.767529,17.272836,18.665398,8.481184,-15.127962,5.863559,...,1.789633,0.343668,0.147979,-0.159037,0.173033,-0.020804,-0.359621,-0.389042,1.333884,1.188089
1,acomic2.wav,0,-452.582453,85.434822,5.612017,23.293566,23.7351,16.140874,-1.193595,-0.852877,...,-0.337636,0.158877,1.022224,0.090063,-0.038862,0.076092,0.308868,1.923489,0.472901,-0.82265
2,acomic2.wav,0,-430.9396,73.014229,6.432748,19.797547,18.280169,15.870616,-1.76686,-1.235784,...,0.188351,-0.188445,0.968007,0.397724,0.674539,1.628203,1.612114,0.91652,0.68393,-1.450503
3,acomic2.wav,0,-395.489977,50.313917,8.722072,21.144728,14.666915,18.752717,-1.368907,0.340301,...,0.367812,-0.827691,0.591447,1.030701,-0.153112,0.436231,1.093701,0.5469,0.191788,-0.138417
4,acomic2.wav,0,-404.422077,79.546213,7.253936,30.497939,-1.232805,17.624707,-13.324674,-6.360743,...,0.621292,-0.611388,1.6347,0.136049,-0.258694,0.496949,0.322957,0.32756,0.787358,0.251089


In [133]:
# plt.scatter(masterdata.MFC0_n, masterdata.MFC1_n, c = masterdata.Music)

In [134]:
masterdata.to_csv('Data/MasterData.csv', index=False)

In [135]:
# Separate Features and Label
x = masterdata.loc[:, normfeatures.columns].values
y = masterdata.loc[:,['Music']].values

KeyError: "None of [Index(['MFC0_n', 'MFC1_n', 'MFC2_n', 'MFC3_n', 'MFC4_n', 'MFC5_n', 'MFC6_n',\n       'MFC7_n', 'MFC8_n', 'MFC9_n',\n       ...\n       'MFC110_n', 'MFC111_n', 'MFC112_n', 'MFC113_n', 'MFC114_n', 'MFC115_n',\n       'MFC116_n', 'MFC117_n', 'MFC118_n', 'MFC119_n'],\n      dtype='object', length=120)] are in the [columns]"

### Perform PCA To Identfie Relevant Columns

In [None]:
from sklearn.decomposition import PCA

n_pca = 50
pca = PCA(n_components=n_pca)
PCs = pca.fit_transform(x)
principalDf = pd.DataFrame(data = PCs, columns = ['PCA'+str(i) for i in range(n_pca)])
PCDf = pd.concat([principalDf, masterdata[['Music']]], axis = 1)
PCDf['Filename'] = basicdata['Filename']

In [None]:
PCDf

### Explore the Variance Ration to Determine Relevant Features

In [None]:
plt.semilogy(pca.explained_variance_ratio_.cumsum(), '--o')
# pca.explained_variance_ratio_
pca.explained_variance_ratio_.cumsum()

In [None]:
import pickle
pickle.dump(pca, open('PickledModels/'+pca.__class__.__name__, 'wb'))

### The analysis shows how each component captures of all information

In [None]:
plt.scatter(PCDf.PCA0, PCDf.PCA1, c = PCDf.Music)

In [None]:
PCDf.to_csv('Data/PrincipalComponentData.csv', index=False)