In [None]:
%%capture
# Install librosa (audio analysis library used throughout this notebook);
# the %%capture cell magic above hides pip's output for this cell.
!pip install librosa

In [None]:
from google.colab import drive
# Mount Google Drive so that the files under /content/drive/MyDrive are reachable.
drive.mount('/content/drive')

# Root folder of the project dataset on Drive.
# NOTE(review): the later cells spell this path out again instead of using dataset_path.
dataset_path = '/content/drive/MyDrive/DataMan_Songs/Dataset/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import time
import pandas as pd
import os

import librosa
import librosa.display
# Record the installed librosa version in the cell output (the cells below call
# librosa.load, librosa.onset.onset_strength and librosa.beat.tempo).
print(librosa.__version__)


0.8.1


In [None]:
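## Example structure of data_dict (filled by the loading cell below).
## Top-level keys are the song folder names ('<Band>-<Song>', words separated by '_');
## second-level keys are the instrument names taken from the .wav file names.
# data_dict = {'Avenged_Sevenfold-So_Far_Away': {
#                  'guitar': {'LibrosaAudio': (samples, sample_rate),  # tuple returned by librosa.load
#                             'Energy': [],   # two summary statistics (mean and std) of the frame energy
#                             'BPM': []       # two summary statistics (mean and std) of the tempo
#                            },
#                  'bass':   {'LibrosaAudio': (samples, sample_rate),
#                             'Energy': [],
#                             'BPM': []
#                            },
#                  'drums':  {'LibrosaAudio': (samples, sample_rate),
#                             'Energy': [],
#                             'BPM': []
#                            },
#                  'piano':  {'LibrosaAudio': (samples, sample_rate),
#                             'Energy': [],
#                             'BPM': []
#                            }
#                  }
#              }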

In [None]:
## Load every instrument track of every song folder into data_dict and
## build df with one (Band, Song) row per loaded folder.
pathAudio = "/content/drive/MyDrive/DataMan_Songs/Dataset/Songs_Instruments_AudioWav"
data_dict = {}
band_df = []
song_df = []

## Sorted so that the row order of df (and the matching key order of data_dict)
## is the same on every run; os.listdir returns entries in arbitrary order.
for bandsong in sorted(os.listdir(pathAudio)):
  songpath = os.path.join(pathAudio, bandsong)
  name_parts = bandsong.split('-')
  ## Song folders are named '<Band>-<Song>'; anything else (stray files, folders
  ## without '-') would raise an IndexError below, so it is reported and skipped.
  if len(name_parts) < 2 or not os.path.isdir(songpath):
    print('Skipping unexpected entry:', bandsong)
    continue

  band_df.append(name_parts[0].replace('_', ' '))
  song_df.append(name_parts[1].replace('_', ' '))

  data_dict[bandsong] = {}
  files = librosa.util.find_files(songpath, ext=['wav'])
  for y in files:
    ## Instrument name = file name without folder and extension.
    ## (str.strip('.wav') removes any of the characters '.', 'w', 'a', 'v' from
    ## both ends, e.g. 'vocals.wav' -> 'ocals', so splitext is used instead.)
    name = os.path.splitext(os.path.basename(y))[0]
    ## Build a dictionary for each instrument; librosa.load returns (samples, sample_rate)
    data_dict[bandsong][name] = {'LibrosaAudio': librosa.load(y, sr=22050, mono=True)}

df = pd.DataFrame({'Band': band_df, 'Song': song_df})

## Energy: sum of the squared sample amplitudes inside each frame
### hop_length = number of samples between successive frames
### number of frames ≈ (seconds) * (sample rate) / (hop_length)

In [None]:
## Standard parameters: https://musicinformationretrieval.com/energy.html 

def energy(x, store=None):
  """Compute the short-time energy of a signal and record its mean and std.

  The signal is cut into frames of 512 samples taken every 512 samples; the
  energy of a frame is the sum of its squared sample magnitudes. The last
  frame may be shorter than 512 samples.

  Parameters
  ----------
  x : array-like
      One-dimensional audio signal.
  store : dict, optional
      Dictionary that receives the summary under the key 'Energy'. When it is
      omitted the function writes to data_dict[bandsong][key], i.e. it relies
      on the notebook globals set by the loop that calls it.

  Returns
  -------
  numpy.ndarray
      Energy of every frame.

  Side effects
  ------------
  store['Energy'] is set to [mean, std]. This is the order in which the
  aggregation cell reads it (index 0 as mean, index 1 as std) and the same
  order bpm() uses; the previous [std, mean] order swapped the two columns.
  """
  hop_length = 512  ## --> use 512 in order to have same energy array length and beat array length
  frame_length = 512

  x = np.asarray(x)
  frame_energy = np.array([
      np.sum(np.abs(x[i:i + frame_length]) ** 2)
      for i in range(0, len(x), hop_length)
  ])

  if store is None:
    ## Called from the notebook loop: current song / instrument come from globals
    store = data_dict[bandsong][key]
  store['Energy'] = [np.mean(frame_energy), np.std(frame_energy)]
  return frame_energy

def bpm(x, store=None, sr=22050):
  """Estimate the tempo of a signal over time and record its mean and std.

  Parameters
  ----------
  x : numpy.ndarray
      Audio signal passed to librosa.onset.onset_strength.
  store : dict, optional
      Dictionary that receives the summary under the key 'BPM'. When it is
      omitted the function writes to data_dict[bandsong][key], i.e. it relies
      on the notebook globals set by the loop that calls it.
  sr : int, optional
      Sample rate of x; defaults to 22050, the rate used when loading the audio.

  Returns
  -------
  numpy.ndarray
      Tempo estimates produced by librosa.beat.tempo with aggregate=None
      (previously an empty tuple was returned; callers in this notebook
      ignore the return value).

  Side effects
  ------------
  store['BPM'] is set to [mean, std] of the tempo estimates.
  """
  onset_env = librosa.onset.onset_strength(y=x, sr=sr)
  ## aggregate=None: no global aggregation, keep the individual tempo estimates
  dtempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr,
                              aggregate=None)

  if store is None:
    ## Called from the notebook loop: current song / instrument come from globals
    store = data_dict[bandsong][key]
  store['BPM'] = [np.mean(dtempo), np.std(dtempo)]
  return dtempo

In [None]:
## Compute the energy and BPM summaries for every instrument track of every song.
## The loop variables must be called bandsong and key: energy() and bpm() read
## them as globals to know where to store their results.
for bandsong, instruments in data_dict.items():
  for key in instruments:
    ## key == instrument name; element 0 of 'LibrosaAudio' is the sample array
    audio = instruments[key]['LibrosaAudio'][0]
    energy(audio)
    bpm(audio)

In [None]:
## Function for normalizing values
def normalize(arr):
    """Min-max normalize a sequence of numbers to the range [0, 1].

    Parameters
    ----------
    arr : array-like of numbers
        Values to rescale.

    Returns
    -------
    list of float
        (value - min) / (max - min) for every value, in the original order.
        An empty input gives an empty list. When all values are equal there
        is no spread to rescale, so every value maps to 0.0 instead of
        dividing by zero.
    """
    values = np.asarray(arr, dtype=float)
    if values.size == 0:
        return []

    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        return [0.0] * values.size

    return ((values - lowest) / spread).tolist()

In [None]:
## Take all mean and std (both for energy and bpm) of drums, piano and bass
## for every song and min-max normalize each list across the songs.
all_energy_mean_drums = []
all_energy_mean_piano = []
all_energy_mean_bass = []

all_energy_std_drums = []
all_energy_std_piano = []
all_energy_std_bass = []

all_bpm_mean_drums = []
all_bpm_mean_piano = []
all_bpm_mean_bass = []

all_bpm_std_drums = []
all_bpm_std_piano = []
all_bpm_std_bass = []

## (instrument, feature, position, destination list).
## Both 'Energy' and 'BPM' are read as [mean, std]: position 0 feeds the
## *_mean_* lists and position 1 the *_std_* lists, so the functions that fill
## data_dict must store the two statistics in that order.
feature_slots = [
    ('drums', 'Energy', 0, all_energy_mean_drums),
    ('piano', 'Energy', 0, all_energy_mean_piano),
    ('bass', 'Energy', 0, all_energy_mean_bass),
    ('drums', 'Energy', 1, all_energy_std_drums),
    ('piano', 'Energy', 1, all_energy_std_piano),
    ('bass', 'Energy', 1, all_energy_std_bass),
    ('drums', 'BPM', 0, all_bpm_mean_drums),
    ('piano', 'BPM', 0, all_bpm_mean_piano),
    ('bass', 'BPM', 0, all_bpm_mean_bass),
    ('drums', 'BPM', 1, all_bpm_std_drums),
    ('piano', 'BPM', 1, all_bpm_std_piano),
    ('bass', 'BPM', 1, all_bpm_std_bass),
]

incomplete_songs = []
for key in data_dict.keys():
  ## Read all twelve values first and append only if none is missing, so a
  ## song lacking one instrument can never leave the lists with unequal lengths.
  try:
    song_values = [data_dict[key][instrument][feature][position]
                   for instrument, feature, position, _ in feature_slots]
  except (KeyError, IndexError):
    incomplete_songs.append(key)
    continue
  for value, slot in zip(song_values, feature_slots):
    slot[3].append(value)

## Remove the incomplete songs from df. df stores band and song with '_'
## replaced by ' ', so the folder name has to be converted the same way
## before it is compared.
for key in incomplete_songs:
  key_parts = key.split('-')
  band_label = key_parts[0].replace('_', ' ')
  song_label = key_parts[1].replace('_', ' ')
  matching_rows = df[(df['Band'] == band_label) & (df['Song'] == song_label)].index
  ## No match means the row is already gone (e.g. the cell was run twice)
  if len(matching_rows) > 0:
    df = df.drop(index=matching_rows[0])

assert len(df) == len(all_energy_mean_drums), \
    'df rows and collected feature lists are out of sync'

all_energy_mean_drumsN = normalize(np.array(all_energy_mean_drums))
all_energy_mean_pianoN = normalize(np.array(all_energy_mean_piano))
all_energy_mean_bassN = normalize(np.array(all_energy_mean_bass))

all_energy_std_drumsN = normalize(np.array(all_energy_std_drums))
all_energy_std_pianoN = normalize(np.array(all_energy_std_piano))
all_energy_std_bassN = normalize(np.array(all_energy_std_bass))

all_bpm_mean_drumsN = normalize(np.array(all_bpm_mean_drums))
all_bpm_mean_pianoN = normalize(np.array(all_bpm_mean_piano))
all_bpm_mean_bassN = normalize(np.array(all_bpm_mean_bass))

all_bpm_std_drumsN = normalize(np.array(all_bpm_std_drums))
all_bpm_std_pianoN = normalize(np.array(all_bpm_std_piano))
all_bpm_std_bassN = normalize(np.array(all_bpm_std_bass))

In [None]:
## Attach every normalized list to df as its own column (column order is kept)
normalized_columns = {
    'Energy_Mean_Drums_N': all_energy_mean_drumsN,
    'Energy_Mean_Piano_N': all_energy_mean_pianoN,
    'Energy_Mean_Bass_N': all_energy_mean_bassN,
    'Energy_std_Drums_N': all_energy_std_drumsN,
    'Energy_std_Piano_N': all_energy_std_pianoN,
    'Energy_std_Bass_N': all_energy_std_bassN,
    'BPM_Mean_Drums_N': all_bpm_mean_drumsN,
    'BPM_Mean_Piano_N': all_bpm_mean_pianoN,
    'BPM_Mean_Bass_N': all_bpm_mean_bassN,
    'BPM_std_Drums_N': all_bpm_std_drumsN,
    'BPM_std_Piano_N': all_bpm_std_pianoN,
    'BPM_std_Bass_N': all_bpm_std_bassN,
}
for column_name, column_values in normalized_columns.items():
  df[column_name] = column_values

df.head()

In [None]:
## Convert values into categorical values.
def level(x):
  """Map a number to 'Low', 'Medium' or 'High'.

  Values up to and including 0.33 are 'Low', values above 0.66 are 'High'
  and everything else (NaN included, since both comparisons are false for it)
  is 'Medium'.
  """
  if x <= 0.33:
    return 'Low'
  return 'High' if x > 0.66 else 'Medium'

In [None]:
## Turn the *_Mean_*_N columns of every instrument into Low/Medium/High labels
## (all Energy columns first, then all BPM columns)
for feature in ('Energy', 'BPM'):
  for instrument in ('Drums', 'Piano', 'Bass'):
    source_column = feature + '_Mean_' + instrument + '_N'
    df[instrument + '_' + feature + '_Level'] = df[source_column].apply(level)


In [None]:
## Build the difficulty level from the combined 'Energy-BPM' label.
## The BPM level alone decides the difficulty: rhythm, and therefore speed, weighs more on
## difficulty than energy (playing fast is harder than playing 'energetic' notes).
## Energy-BPM:
## Low-Low, Medium-Low, High-Low          = Beginner
## Low-Medium, Medium-Medium, High-Medium = Intermidiate  (spelled as in the diz_label values)
## Low-High, Medium-High, High-High       = Advanced

def join_EB_classDrums(x):
  """Return '<energy level>-<BPM level>' for the drums of row x."""
  energy_level = str(x['Drums_Energy_Level'])
  bpm_level = str(x['Drums_BPM_Level'])
  return '-'.join((energy_level, bpm_level))
def join_EB_classPiano(x):
  """Return '<energy level>-<BPM level>' for the piano of row x."""
  energy_level = str(x['Piano_Energy_Level'])
  bpm_level = str(x['Piano_BPM_Level'])
  return '-'.join((energy_level, bpm_level))
def join_EB_classBass(x):
  """Return '<energy level>-<BPM level>' for the bass of row x."""
  energy_level = str(x['Bass_Energy_Level'])
  bpm_level = str(x['Bass_BPM_Level'])
  return '-'.join((energy_level, bpm_level))

## Combined 'Energy-BPM' label of every instrument
combine_funcs = {
    'Drums': join_EB_classDrums,
    'Piano': join_EB_classPiano,
    'Bass': join_EB_classBass,
}
for instrument, join_func in combine_funcs.items():
  df['Combine_Energy_BPM_' + instrument] = df.apply(join_func, axis=1)

## Combined label -> difficulty; one row per BPM level
diz_label = {
    'Low-Low': 'Beginner', 'Medium-Low': 'Beginner', 'High-Low': 'Beginner',
    'Low-Medium': 'Intermidiate', 'Medium-Medium': 'Intermidiate', 'High-Medium': 'Intermidiate',
    'Low-High': 'Advanced', 'Medium-High': 'Advanced', 'High-High': 'Advanced',
}

## Difficulty of every instrument (an unknown combined label raises KeyError)
for instrument in combine_funcs:
  combined = df['Combine_Energy_BPM_' + instrument]
  df['Difficulty_Level_' + instrument] = combined.apply(lambda label: diz_label[label])
df.head(5)

In [None]:
## Import the Spotify features / band info table, rename the 'Player' column
## to 'Guitarist' and discard the track id, URL and market columns
dt = (
    pd.read_csv('/content/drive/MyDrive/DataMan_Songs/Dataset/2_Dataset_SpotifyFeatures&InfoBand.csv')
      .rename(columns={'Player': 'Guitarist'})
      .drop(columns=['ID_TRACK', 'EXTERNAL_URLS', 'SEARCH_URL', 'AVAILABLE_MARKETS'])
)
# dt.head(3)
# dt.columns

In [None]:
## Inner-join dt with df on (Band, Song); rows without a partner on the
## other side are not kept
dc = pd.merge(dt, df, how='inner', on=['Band', 'Song'])

In [None]:
## Build the relevance metric
## Relevance = how important the instrument is within the whole song
## Relevance = Energy(instrument) / Energy(song)

def relevance_Drums(x):
  """Return Energy_Mean_Drums_N divided by the song's 'energy' for row x.

  Returns NaN when the song energy is 0 (the ratio is undefined) instead of
  raising ZeroDivisionError.
  """
  song_energy = float(x['energy'])
  if song_energy == 0:
    return float('nan')
  return float(x['Energy_Mean_Drums_N']) / song_energy
def relevance_Bass(x):
  """Return Energy_Mean_Bass_N divided by the song's 'energy' for row x.

  Returns NaN when the song energy is 0 (the ratio is undefined) instead of
  raising ZeroDivisionError.
  """
  song_energy = float(x['energy'])
  if song_energy == 0:
    return float('nan')
  return float(x['Energy_Mean_Bass_N']) / song_energy
def relevance_Piano(x):
  """Return Energy_Mean_Piano_N divided by the song's 'energy' for row x.

  Returns NaN when the song energy is 0 (the ratio is undefined) instead of
  raising ZeroDivisionError.
  """
  song_energy = float(x['energy'])
  if song_energy == 0:
    return float('nan')
  return float(x['Energy_Mean_Piano_N']) / song_energy


## Raw relevance ratio of every instrument, computed row by row
relevance_funcs = {
    'Drums': relevance_Drums,
    'Bass': relevance_Bass,
    'Piano': relevance_Piano,
}
for instrument, relevance_func in relevance_funcs.items():
  dc['Quantity_Relevance_' + instrument] = dc.apply(relevance_func, axis=1)

In [None]:
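## Map the relevance ratio onto a 0-10 scale (ratio rounded to one decimal, times 10).
## Ratios outside the interval (0, 1] (i.e. value <= 0 or value > 1) --> np.nan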

def relevance_range(x):
  """Scale a ratio in (0, 1] to round(x, 1) * 10; anything else gives NaN.

  Ratios below 0.05 round to 0.0, so the result can be 0 as well as 1..10.
  """
  inside_unit_interval = 0 < x <= 1
  if not inside_unit_interval:
    return np.nan
  return round(x, 1) * 10

## Scaled relevance of every instrument
for instrument in ('Drums', 'Bass', 'Piano'):
  dc['Relevance_' + instrument] = dc['Quantity_Relevance_' + instrument].apply(relevance_range)


In [None]:
def join_Diff_instr(x):
  """Return the four '<instrument>_<difficulty>' strings of row x as a list.

  The order is drums, guitar, piano, bass; the guitar difficulty is read
  from the 'Difficulty' column, the others from 'Difficulty_Level_<Instrument>'.
  """
  difficulty_by_instrument = (
      ('drums', x['Difficulty_Level_Drums']),
      ('guitar', x['Difficulty']),
      ('piano', x['Difficulty_Level_Piano']),
      ('bass', x['Difficulty_Level_Bass']),
  )
  return [instrument + '_' + str(difficulty)
          for instrument, difficulty in difficulty_by_instrument]
# Add a column whose cells hold the list of '<instrument>_<difficulty>' strings
# that join_Diff_instr builds for each row.
dc['Instruments_Difficulty'] = dc.apply(join_Diff_instr, axis = 1)

In [None]:
# Save the complete merged table (all columns of dc) to Drive, without the index column.
dc.to_csv('/content/drive/MyDrive/DataMan_Songs/Dataset/DatasetPlotAll.csv', index=False)

In [None]:
## Columns exported to DatasetGrafo.csv
graph_columns = [
    'Band', 'Song', 'Guitarist', 'Technique',
    'POPULARITY', 'RELEASE_DATE', 'danceability', 'energy', 'loudness',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'duration_ms', 'Singer', 'Bassist', 'Drummer',
    'Pianist',
    'Drums_Energy_Level', 'Piano_Energy_Level', 'Bass_Energy_Level',
    'Drums_BPM_Level', 'Piano_BPM_Level', 'Bass_BPM_Level',
    'Relevance_Drums', 'Relevance_Bass', 'Relevance_Piano',
    'Instruments_Difficulty',
]
## Independent copy of the selected columns, written without the index column
dk = dc[graph_columns].copy()
dk.to_csv('/content/drive/MyDrive/DataMan_Songs/Dataset/DatasetGrafo.csv', index=False)