<a href="https://colab.research.google.com/github/curtiscu/LYIT/blob/master/BulkVisualisations_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Visualisations across bulk loaded data

# Setup env


In [1]:
# Display the result of every top-level expression in a cell, not only the
# last one. Note this also echoes bare string literals placed in a cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


## Google drive access

In [2]:
# Mount Google Drive under /content/drive so the dataset and the custom
# modules stored there can be accessed as ordinary files.
# force_remount=True asks for a fresh mount even if the drive is already mounted.

from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [3]:
# Sanity check: list the MIDI files of one eval session to confirm the
# dataset is reachable through the mounted drive.
! ls -al '/content/drive/My Drive/groove-v1.0.0-midionly/groove/drummer1/eval_session/'

# Sanity check: list the custom modules, which originate from the local
# folder 'E:\Google Drive\LYIT\Dissertation\modules'.
! ls -al '/content/drive/My Drive/LYIT/Dissertation/modules/'

total 35
-rw------- 1 root root 2589 Apr 27 12:01 10_soul-groove10_102_beat_4-4.mid
-rw------- 1 root root 4793 Apr 27 12:01 1_funk-groove1_138_beat_4-4.mid
-rw------- 1 root root 3243 Apr 27 12:01 2_funk-groove2_105_beat_4-4.mid
-rw------- 1 root root 4466 Apr 27 12:01 3_soul-groove3_86_beat_4-4.mid
-rw------- 1 root root 2551 Apr 27 12:01 4_soul-groove4_80_beat_4-4.mid
-rw------- 1 root root 3798 Apr 27 12:01 5_funk-groove5_84_beat_4-4.mid
-rw------- 1 root root 3760 Apr 27 12:01 6_hiphop-groove6_87_beat_4-4.mid
-rw------- 1 root root 1894 Apr 27 12:01 7_pop-groove7_138_beat_4-4.mid
-rw------- 1 root root 2437 Apr 27 12:01 8_rock-groove8_65_beat_4-4.mid
-rw------- 1 root root 3448 Apr 27 12:01 9_soul-groove9_105_beat_4-4.mid
total 43
-rw------- 1 root root 22877 Jul  2 21:06 data_prep.py
drwx------ 2 root root  4096 May 10 13:31 __pycache__
-rw------- 1 root root  7276 Jul  6 14:14 stats_n_features.py
-rw------- 1 root root  8416 Jun  2 21:24 visualizations.py


## Auto reload module

This notebook uses library code I've written and saved to Google Drive, which is automatically synced to the cloud and made available to the Colab environment. The autoreload configuration below should make imported modules reload automatically, so that changes to the library code are picked up.

Autoreload is not the quickest or most reliable mechanism, so if in a hurry, force the changes to load by restarting the runtime.

In [4]:
# Load IPython's autoreload extension.
%load_ext autoreload

# Mode 2: reload all imported modules before each cell is executed, so edits
# to the custom modules are picked up without restarting the runtime.
%autoreload 2

## Imports and accessing lib functions

In [5]:
# Install mido. It is not imported directly in this part of the notebook;
# presumably it is required by the custom data_prep module (TODO confirm).
# NOTE(review): the version is not pinned, so a rerun may pick up a newer release.
!pip install mido



In [6]:
# Make the custom modules stored on Google Drive importable by adding their
# folder to the module search path, then import the data-preparation module.
import sys
sys.path.append('/content/drive/My Drive/LYIT/Dissertation/modules/')
import data_prep as dp

>> LOADING custom module, when: 2020-07-08 19:34:59.765871, module name: data_prep


In [7]:
# imports
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np


# object that provides colours for charts
from itertools import cycle



In [8]:
# Smoke test: call a trivial function in data_prep to confirm the module
# imported correctly (and, after editing it, that autoreload picked up the change).
dp.test_function_call('bling')

Test function in data_prep.py called and worked! when: 2020-07-08 19:34:59.819207,  param:bling


In [9]:
import stats_n_features as sf

>> LOADING custom module, when: 2020-07-08 19:34:59.844594, module name: stats_n_features


In [10]:
# Smoke test: confirm the stats_n_features module imported correctly.
sf.test_function_call('hello')

Test function in stats_n_features called and worked! when: 2020-07-08 19:34:59.871641,  param:hello


## Pandas display options

In [11]:
def set_pandas_display_options() -> None:
    """Raise pandas' display limits so wide/long frames are shown without truncation.

    Sets the maximum number of displayed columns and rows, the maximum
    column width, and the overall display width (None).

    Ref: https://stackoverflow.com/a/52432757/
    """
    wanted_options = {
        "display.max_columns": 1000,
        "display.max_rows": 2000,
        "display.max_colwidth": 1000,
        "display.width": None,
        # "display.precision": 2,  # set as needed
    }
    for option_name, option_value in wanted_options.items():
        pd.set_option(option_name, option_value)

set_pandas_display_options()
#pd.reset_option('all')


# Bulk Load Data

### Setup

In [12]:
metafile = '/content/drive/My Drive/groove-v1.0.0-midionly/groove/info.csv'

# Columns of info.csv to load with pandas' "string" dtype; any column not
# listed here is left to read_csv's own type inference.
string_columns = [
    "drummer",
    "session",
    "id",
    "style",
    "beat_type",
    "time_signature",
    "midi_filename",
    "audio_filename",
    "split",
]

meta_df = pd.read_csv(metafile, dtype=dict.fromkeys(string_columns, "string"))

# show count of test/ train/ validation split specified in file...
meta_df.groupby(['split', 'drummer']).size()


split       drummer  
test        drummer1      48
            drummer3       8
            drummer5      13
            drummer7      40
            drummer8      17
            drummer9       3
train       drummer1     386
            drummer10     10
            drummer2      17
            drummer3      83
            drummer4       7
            drummer5      36
            drummer6      11
            drummer7     263
            drummer8      58
            drummer9      26
validation  drummer1      60
            drummer2       1
            drummer3       7
            drummer4       2
            drummer5       5
            drummer6       4
            drummer7      39
            drummer8       5
            drummer9       1
dtype: int64

### Filter to just the 'eval_session' songs

In [13]:
# Keep only the rows whose session name contains 'eval_session'; .copy()
# gives eval_df its own data so later changes to it do not touch meta_df.
in_eval_session = meta_df['session'].str.contains('eval_session')
eval_df = meta_df[in_eval_session].copy()
eval_df.info()

# display(eval_df)
eval_df['style'].unique()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40 entries, 0 to 1033
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   drummer         40 non-null     string 
 1   session         40 non-null     string 
 2   id              40 non-null     string 
 3   style           40 non-null     string 
 4   bpm             40 non-null     int64  
 5   beat_type       40 non-null     string 
 6   time_signature  40 non-null     string 
 7   midi_filename   40 non-null     string 
 8   audio_filename  40 non-null     string 
 9   duration        40 non-null     float64
 10  split           40 non-null     string 
dtypes: float64(1), int64(1), string(9)
memory usage: 3.8 KB


<StringArray>
[  'funk/groove1',  'soul/groove10',   'funk/groove2',   'soul/groove3',
   'soul/groove4',   'funk/groove5', 'hiphop/groove6',    'pop/groove7',
   'rock/groove8',   'soul/groove9']
Length: 10, dtype: string

In [14]:
# Drop the columns that are not needed for this analysis.
# Re-assign rather than use inplace=True, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
eval_df = eval_df.drop(columns=['audio_filename', 'duration', 'beat_type'], errors='ignore')

# Row counts per drummer, then the distinct drummer IDs left in the selection.
eval_df.groupby('drummer', as_index=False).count()
print('list of unique drummer IDs: {}'.format(eval_df['drummer'].unique()))

Unnamed: 0,drummer,session,id,style,bpm,time_signature,midi_filename,split
0,drummer1,10,10,10,10,10,10,10
1,drummer5,10,10,10,10,10,10,10
2,drummer7,10,10,10,10,10,10,10
3,drummer8,10,10,10,10,10,10,10


list of unique drummer IDs: <StringArray>
['drummer1', 'drummer5', 'drummer7', 'drummer8']
Length: 4, dtype: string


In [15]:
file_prefix = '/content/drive/My Drive/groove-v1.0.0-midionly/groove/'

# Prefix every relative MIDI path with the dataset root so each file can be
# opened directly from the mounted drive.
eval_df['long_midi_filename'] = eval_df['midi_filename'].apply(
    lambda relative_name: f"{file_prefix}{relative_name}"
)

# eval_df.head(3)
eval_df.head(10)
eval_df.info()

Unnamed: 0,drummer,session,id,style,bpm,time_signature,midi_filename,split,long_midi_filename
0,drummer1,drummer1/eval_session,drummer1/eval_session/1,funk/groove1,138,4-4,drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid,test,/content/drive/My Drive/groove-v1.0.0-midionly/groove/drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid
1,drummer1,drummer1/eval_session,drummer1/eval_session/10,soul/groove10,102,4-4,drummer1/eval_session/10_soul-groove10_102_beat_4-4.mid,test,/content/drive/My Drive/groove-v1.0.0-midionly/groove/drummer1/eval_session/10_soul-groove10_102_beat_4-4.mid
2,drummer1,drummer1/eval_session,drummer1/eval_session/2,funk/groove2,105,4-4,drummer1/eval_session/2_funk-groove2_105_beat_4-4.mid,test,/content/drive/My Drive/groove-v1.0.0-midionly/groove/drummer1/eval_session/2_funk-groove2_105_beat_4-4.mid
3,drummer1,drummer1/eval_session,drummer1/eval_session/3,soul/groove3,86,4-4,drummer1/eval_session/3_soul-groove3_86_beat_4-4.mid,test,/content/drive/My Drive/groove-v1.0.0-midionly/groove/drummer1/eval_session/3_soul-groove3_86_beat_4-4.mid
4,drummer1,drummer1/eval_session,drummer1/eval_session/4,soul/groove4,80,4-4,drummer1/eval_session/4_soul-groove4_80_beat_4-4.mid,test,/content/drive/My Drive/groove-v1.0.0-midionly/groove/drummer1/eval_session/4_soul-groove4_80_beat_4-4.mid
5,drummer1,drummer1/eval_session,drummer1/eval_session/5,funk/groove5,84,4-4,drummer1/eval_session/5_funk-groove5_84_beat_4-4.mid,test,/content/drive/My Drive/groove-v1.0.0-midionly/groove/drummer1/eval_session/5_funk-groove5_84_beat_4-4.mid
6,drummer1,drummer1/eval_session,drummer1/eval_session/6,hiphop/groove6,87,4-4,drummer1/eval_session/6_hiphop-groove6_87_beat_4-4.mid,test,/content/drive/My Drive/groove-v1.0.0-midionly/groove/drummer1/eval_session/6_hiphop-groove6_87_beat_4-4.mid
7,drummer1,drummer1/eval_session,drummer1/eval_session/7,pop/groove7,138,4-4,drummer1/eval_session/7_pop-groove7_138_beat_4-4.mid,test,/content/drive/My Drive/groove-v1.0.0-midionly/groove/drummer1/eval_session/7_pop-groove7_138_beat_4-4.mid
8,drummer1,drummer1/eval_session,drummer1/eval_session/8,rock/groove8,65,4-4,drummer1/eval_session/8_rock-groove8_65_beat_4-4.mid,test,/content/drive/My Drive/groove-v1.0.0-midionly/groove/drummer1/eval_session/8_rock-groove8_65_beat_4-4.mid
9,drummer1,drummer1/eval_session,drummer1/eval_session/9,soul/groove9,105,4-4,drummer1/eval_session/9_soul-groove9_105_beat_4-4.mid,test,/content/drive/My Drive/groove-v1.0.0-midionly/groove/drummer1/eval_session/9_soul-groove9_105_beat_4-4.mid


<class 'pandas.core.frame.DataFrame'>
Int64Index: 40 entries, 0 to 1033
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   drummer             40 non-null     string
 1   session             40 non-null     string
 2   id                  40 non-null     string
 3   style               40 non-null     string
 4   bpm                 40 non-null     int64 
 5   time_signature      40 non-null     string
 6   midi_filename       40 non-null     string
 7   split               40 non-null     string
 8   long_midi_filename  40 non-null     object
dtypes: int64(1), object(1), string(7)
memory usage: 3.1+ KB


### Filter song styles

In [16]:
# Restrict the data to the selected song styles.
# NOTE: these are the labels of the 'styles' (1-10) the drummers were asked
# to play; after review, the ones listed below were deemed the most
# suitable/ usable as data for the project.
song_styles = [
    'funk/groove1',
    'soul/groove3',
    'soul/groove4',
    'hiphop/groove6',
    'rock/groove8',
]
is_selected_style = eval_df['style'].isin(song_styles)
eval_df = eval_df[is_selected_style]
eval_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 0 to 1032
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   drummer             20 non-null     string
 1   session             20 non-null     string
 2   id                  20 non-null     string
 3   style               20 non-null     string
 4   bpm                 20 non-null     int64 
 5   time_signature      20 non-null     string
 6   midi_filename       20 non-null     string
 7   split               20 non-null     string
 8   long_midi_filename  20 non-null     object
dtypes: int64(1), object(1), string(7)
memory usage: 1.6+ KB


In [17]:
# display(eval_df)

### Created NamedTuple container

for more info, see...
* http://zetcode.com/python/namedtuple/
* https://dbader.org/blog/writing-clean-python-with-namedtuples

In [18]:

from collections import namedtuple

# Container holding, in a single place, everything related to one performance
# loaded from a MIDI file.
PerformanceData = namedtuple('PerformanceData' , 'drummer_id file_df file_wrapper tools stats_df')

# Attach the description to the type itself (so help()/? show it). A bare
# string literal in the cell documents nothing and, with
# ast_node_interactivity = "all", is echoed as cell output.
PerformanceData.__doc__ = '''
  Collection object to hold in a single place all information
  related to a performance loaded from a MIDI file.

  drummer_id = String
  file_df = complete DataFrame of data, unfiltered
  file_wrapper = MidiFileWrapper instance
  tools = MidiTimingTools instance
  stats_df = DataFrame fleshed out with additional features/ metrics
'''


'\n  Collection object to hold in a single place all information\n  related to a performance loaded from a MIDI file.\n\n  drummer_id = String\n  file_df = complete DataFrame of data, unfiltered\n  file_wrapper = MidiFileWrapper instance\n  tools = MidiTimingTools instance\n  stats_df = DataFrame fleshed out with additional features/ metrics\n'

## Load and initial filtering of data

This loads the files and purges any quantise positions (buckets) that contain more than one hit for the same instrument, i.e. notes happening more rapidly than we cater for; we only handle detail down to 16th notes.

In [19]:
# Maps each full MIDI file path to the PerformanceData tuple built for that
# file. Re-created empty every time this cell runs.
all_drummer_data = {}

# Process one selected performance (one row of eval_df) per iteration.
# `index` is not used inside the loop.
for index, row in eval_df.iterrows():

  # Pull the fields needed for this performance out of the row.
  next_drummer = row['drummer']
  long_name = row['long_midi_filename']
  short_name = row['midi_filename']

  # dp.load_file returns three objects for the file: the DataFrame of its
  # data (file_df), the file wrapper (file_wrapper) and the associated
  # timing tools (mtt). The FILE/loaded-file log lines in the cell output
  # appear to be printed by load_file itself (TODO confirm).
  file_df, file_wrapper, mtt = dp.load_file(long_name)

  print('    > checking for errs: {}'.format(short_name))

  #### review data, see if errors to be removed...

  err_buckets = sf.get_error_buckets(file_df) # rows flagged as problem beats
  if err_buckets.size == 0:
    print('    ...no errors to see here')
  else: # some buckets hold more than one hit for an instrument
    #display(err_buckets)
    print('    __ file_df before: {}'.format(file_df.shape))
    print('    __ err_buckets removed: {}'.format(err_buckets.shape))
    file_df.drop(err_buckets.index, inplace=True) # drop flagged rows by index label; mutates file_df in place
    print('    __ file_df after: {}'.format(file_df.shape))

  # Compute the stats from the cleaned-up file_df, i.e. after any flagged
  # rows have been removed.
  stats_df = sf.gather_stats(file_df) # parse to gather stats

  # Store everything for this performance, keyed by the full file path.
  all_drummer_data[long_name] = PerformanceData(next_drummer, file_df, file_wrapper, mtt, stats_df)


FILE name: /content/drive/My Drive/groove-v1.0.0-midionly/groove/drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid
    loaded file: <midi file '/content/drive/My Drive/groove-v1.0.0-midionly/groove/drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid' type 0, 1 tracks, 1300 messages>
    time sig: <meta message time_signature numerator=4 denominator=4 clocks_per_click=24 notated_32nd_notes_per_beat=8 time=0>
    tempo: <meta message set_tempo tempo=434783 time=0>
    track count: 1, tracks: [<midi track 'MIDI' 1300 messages>]
    MIDI file type: 0
    > processing track: <midi track 'MIDI' 1300 messages>
    __notes pre-filter: [36. 37. 38. 40. 43. 44. 51. 52. 53. 55.]
    __applying filter: [44]
    __notes post filter: [36. 37. 38. 40. 43. 51. 52. 53. 55.]
    note_on span - first tick: 3 , last tick: 30658 
    good instruments: 5, {36.0: 'Bass Drum 1 (36)', 38.0: 'Acoustic Snare (38)', 43.0: 'High Floor Tom (43)', 49.0: 'Crash Cymbal 1 (49)', 51.0: 'Ride Cymbal 1 (51)'}
    ba