# Dig-That-Lick

- Author: geoffroy.peeters@telecom-paris.fr
- Date: 2021-12-20

Set of tools to merge and convert all the files into a single json file

![](./IMG_0921.jpg)

# Configuration

In [None]:
# --- global switches
do_verbose = True  # print / pretty-print intermediate results after each step

do_localaudio = False  # if True, attach the local audio path of each file under audio_TKN
do_pitchcsv = False  # if True, attach the per-solo pitch csv content under pitch_TKN
do_fingerprint = False  # if False, the 'fingerprint' field of originalfile_FILE is dropped

# --- targetkeyname TKN
# naming convention used below:
#   *_FILE / *_DIR : location of an input (or output) file / directory
#   *_TKN          : key under which the corresponding data is stored in each entry

ROOT = '/Users/peeters/Dropbox/_work/_dtl/DTL1000/'
# ---------------------
style_FILE = ROOT + 'styles-20210503.csv'
style_TKN = 'metadata_#styles_csv'


# ---------------------
# glob patterns of the local audio files (.aiff and .wav)
audio_DIR1 = '/Volumes/peeters/_2020-05_dig-that-lick/IUC-audio-files/*.aiff'
audio_DIR2 = '/Volumes/peeters/_2020-05_dig-that-lick/JE-audio-files/*.wav'
audio_TKN = 'filename_#local'

# ---------------------
# json cache of the audio durations (written or read by the "get audio file duration" cell)
audioduration_FILE = ROOT + '2021-06-29_file_duration.json'

# ---------------------
# segmentation
# ---------------------
segment_FILE = ROOT + 'DTL_1000_segmentations-20211213.csv'
#segment_TKN = 'segment'
segment_segment_TKN = 'segment_#DTL_1000_segmentations_csv'
segment_metadata_TKN = 'metadata_#DTL_1000_segmentations_csv'

# csv giving, for a 'file' name, its 'dtl_id_short' identifier
mapfilename_FILE = ROOT + 'DTL_file_map.csv'

# ---------------------
# solo segment and metadata
# ---------------------
metadataANDsolo_FILE = ROOT + 'metadata_full_compressed_v12-SD-20211217.csv'
metadataANDsolo_solo_TKN = 'solo_#metadata_full_compressed_csv'
metadataANDsolo_metadata_TKN = 'metadata_#metadata_full_compressed_csv'

# ---------------------
# solo transcription
# ---------------------
pitch_DIR = ROOT + 'with_phrase_info_20210522/'
pitch_TKN = 'solo_transcription'


# ---------------------
# simon meta-data
# ---------------------
# one json file per decade (1960s ... 2010s)
csv1960_FILE_l = [ROOT + 'DTL1000-metadata-20211216/1960s.csv_110_musinstr.json',
         ROOT + 'DTL1000-metadata-20211216/1970s.csv_110_musinstr.json',
         ROOT + 'DTL1000-metadata-20211216/1980s.csv_110_musinstr.json', 
         ROOT + 'DTL1000-metadata-20211216/1990s.csv_110_musinstr.json',
         ROOT + 'DTL1000-metadata-20211216/2000s.csv_110_musinstr.json',
         ROOT + 'DTL1000-metadata-20211216/2010s.csv_110_musinstr.json']
csv1960_TKN = 'metadata_#19**s.csv_110_musinstr_json'

originalfile_FILE = ROOT + 'id_dtl1000-20210625.csv'
originalfile_TKN = 'filename_original_#id_dtl1000-csv'

mpal_FILE = ROOT + 'MPAL_CD_List_20210829.csv'
mpal_TKN = 'metadata_#MPAL_CD_List_csv'

#JeCompleteIndex_FILE = '/Users/peeters/_work/_projet/_2020_DigThatLick/_data/_new_simon2/DTL1000-metadata-20210411/JECompleteIndex_cleaned.csv'
JeCompleteIndex_FILE = ROOT + 'JECompleteIndex_20210719.csv'
JeCompleteIndex_TKN = 'metadata_#JE_Complete_Index_csv'

musicbrainz_FILE = ROOT + 'MBquery-20210625.csv'
musicbrainz_TKN = 'metadata_#musicbrainz'

# json list of entries with 'name' and 'id' fields, used to build musician_map
lord_sql_musiciansID_FILE = ROOT + 'musicians.json'
# excel sheet used to build instrument_acronym_map
instrument_acronym_mapping_FILE = ROOT + 'UIUC-instruments-20211216.xlsx'

# NOTE(review): presumably the path of the merged json written at the end of the
# notebook; the writing code is not in this part of the file -- confirm.
output_FILE = './dtl_1000-2021-12-17a.json'

# import

In [None]:
import glob
import numpy as np
import pprint as pp

import csv
import json

from tqdm.notebook import tqdm, trange
import time
import librosa
import os
import pandas as pd

# define set of tools

In [None]:
def read_csv(file, separator=';'):
    """Read a delimited text file and return all its rows.

    Args:
        file: path of the file to read.
        separator: field delimiter (default ';'); fields may be quoted with '"'.

    Returns:
        A list with one entry per row (header row included), each entry being
        the list of the row's fields as strings.

    A one-line summary (file name and number of rows) is printed.
    """
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires; otherwise line breaks embedded in quoted fields
    # are altered by the universal-newline translation of open().
    with open(file, newline='') as csvfile:
        myreader = csv.reader(csvfile, delimiter=separator, quotechar='"')
        csv_l = list(myreader)

    print('>>> reading: {} -> {} lines'.format(file, len(csv_l)))
    return csv_l

In [None]:
def convert_dict(csv_l):
    """Convert a table (list of rows, first row = header) to a list of dicts.

    Args:
        csv_l: list of rows as returned by read_csv(); csv_l[0] holds the
            column names.

    Returns:
        One dict per data row, mapping each column name to the row's value.
        Rows with at most one field (e.g. blank lines) are skipped. An empty
        table, or a table holding only a header, gives an empty list.

    Raises:
        AssertionError: if a kept row does not have as many fields as the header.
    """
    if not csv_l:
        # nothing to convert, not even a header
        print('>>> converted to 0 entries (empty input)')
        return []

    header_l = csv_l[0]
    nb_col = len(header_l)

    dict_l = []
    for row, value_l in enumerate(csv_l[1:], start=1):
        if len(value_l) <= 1:
            continue
        assert len(value_l)==nb_col, 'wrong number of column for entry {}: requiring {} but get {}'.format(row, nb_col, len(value_l))
        dict_l.append(dict(zip(header_l, value_l)))

    if dict_l:
        print('>>> converted to {} entries with keys {}'.format(len(dict_l), dict_l[0].keys()))
    else:
        # header only: there is no first entry to take the keys from
        print('>>> converted to 0 entries with keys {}'.format(header_l))
    return dict_l

In [None]:
def convert_to_string(start_sec):
    """Format a time in seconds as the string 'h.mm.ss.ffffff'.

    The last field is the fractional part of the second multiplied by 100000
    (floored) and zero-padded to 6 digits; convert_string_to_time() uses the
    same 100000 scale for the fraction.
    """
    whole_sec = np.floor(start_sec)
    fraction = int(np.floor(100000*(start_sec - whole_sec)))
    total_min, sec = divmod(whole_sec, 60)
    hour, minute = divmod(total_min, 60)
    return '%01d.%02d.%02d.%06d' % (hour, minute, sec, fraction)

In [None]:
def convert_string_to_time(string):
    """Convert a time string 'h:mm:ss.fffff' to a number of seconds.

    Args:
        string: time formatted as hours:minutes:seconds.fraction.

    Returns:
        The time in seconds as a float. The hours field is included in the
        result (it used to be parsed and then ignored).

    Raises:
        ValueError: if the string does not have exactly three ':'-separated
            fields, or if the last one has no single '.' separator.
    """
    hour_str, minute_str, second_str = string.split(':')
    whole_str, fraction_str = second_str.split('.')
    # NOTE(review): the fraction is read as a count of 1e-5 seconds, i.e. it is
    # only exact when written with 5 digits -- confirm against the csv format.
    return 3600*float(hour_str) + 60*float(minute_str) + float(whole_str) + float(fraction_str)/100000

In [None]:
def remove_key(data_d, key_l):
    """Delete from data_d (in place) every key of key_l that it contains.

    Keys that are absent are ignored. The same dict is returned.
    """
    for name in key_l:
        data_d.pop(name, None)
    return data_d

In [None]:
def copy_key(input_d, output_d, key_l):
    """Copy the keys of key_l that exist in input_d into output_d (in place).

    If output_d already holds a key, the existing value has to agree with the
    one being copied: equal for non-float values, closer than 0.1 (in absolute
    value) for floats.

    Args:
        input_d: dict to read the values from.
        output_d: dict to update.
        key_l: keys to copy; keys missing from input_d are skipped.

    Returns:
        output_d.

    Raises:
        AssertionError: if output_d already holds a different value for a key.
    """
    for key in key_l:
        if key not in input_d:
            continue
        if key in output_d:
            message = 'output_d[{}] have already a value "{}" which is different from "{}"'.format(key, output_d[key], input_d[key])
            if isinstance(input_d[key], float):
                # abs(): the difference has to be small in both directions
                assert abs(output_d[key]-input_d[key])<0.1, message
            else:
                assert output_d[key] == input_d[key], message
        output_d[key] = input_d[key]
    return output_d

In [None]:
def get_unique(data_dl, key):
    """Return the sorted list of the distinct values found under key in the dicts of data_dl."""
    distinct_s = {entry_d[key] for entry_d in data_dl}
    return sorted(distinct_s)

In [None]:
# --- from https://stackoverflow.com/questions/32815640/how-to-get-the-difference-between-two-dictionaries-in-python
def flatten_it(d):
    """Recursively turn nested dicts / lists / tuples into nested tuples.

    A dict becomes a tuple of (key, value) pairs taken in sorted item order,
    a list or a tuple becomes a tuple of its converted items, and any other
    value is returned unchanged.
    """
    if isinstance(d, dict):
        return tuple((flatten_it(name), flatten_it(value)) for name, value in sorted(d.items()))
    if isinstance(d, (list, tuple)):
        return tuple(flatten_it(element) for element in d)
    return d

### map function

In [None]:
def map_data(data1_dl, key1, data2_dl, key2, key1add, do_replace=False):
    """Attach to each entry of data1_dl the entries of data2_dl sharing its key value.

    An entry data2_d is attached to data1_d when data1_d[key1] == data2_d[key2].
    The data2 entries are attached by reference (not copied).

    Matching is tracked with local bookkeeping only: no temporary marker field
    is written into the entries, so unmatched data2 entries are left untouched
    and an existing 'ok' field of the data is preserved.

    Args:
        data1_dl: list of dicts to enrich (modified in place).
        key1: key of the data1 entries holding the value to match.
        data2_dl: list of dicts to attach.
        key2: key of the data2 entries holding the value to match.
        key1add: key of the data1 entries under which the matches are stored.
        do_replace: if False, data1_d[key1add] is a list to which every match is
            appended (in data2_dl order); if True, data1_d[key1add] is set to
            the matching entry itself (the last one when there are several).
            Before a replacement, the items of the previous value that are not
            in the new one are printed.

    Returns:
        data1_dl.

    A summary of the mapping (number of mapped / unique / total entries on
    both sides) is printed.
    """
    # --- index the positions of the data2 entries by key value (order is kept)
    position2_d = {}
    for pos2, data2_d in enumerate(data2_dl):
        position2_d.setdefault(data2_d[key2], []).append(pos2)

    L1mapped = 0
    mapped2_s = set()
    for data1_d in data1_dl:
        pos2_l = position2_d.get(data1_d[key1], [])
        if pos2_l:
            L1mapped += 1

        for pos2 in pos2_l:
            data2_d = data2_dl[pos2]
            mapped2_s.add(pos2)

            if do_replace:
                # --- before replacing check that the value is the same
                if key1add in data1_d:
                    diff = set(flatten_it(data1_d[key1add])) - set(flatten_it(data2_d))
                    if len(diff): print(diff)
                data1_d[key1add] = data2_d
            else:
                data1_d.setdefault(key1add, []).append(data2_d)

    # --- display INFO
    L1 = len(data1_dl)
    L1unique = len({data1_d[key1] for data1_d in data1_dl})
    L2 = len(data2_dl)
    L2unique = len({data2_d[key2] for data2_d in data2_dl})
    L2mapped = len(mapped2_s)
    print('---------------------------------')
    print('do_replace: {}'.format(do_replace))
    print('mapping data1[{}] to data2[{}] -> storing in data1[{}]'.format(key1, key2, key1add))
    print('data1 len-mapped:{}/ len(unique()):{}/ len:{}'.format(L1mapped, L1unique, L1))
    print('data2 len-mapped:{}/ len(unique()):{}/ len:{}'.format(L2mapped, L2unique, L2))
    print('---------------------------------')

    return data1_dl

In [None]:
def convert_to_map(data_dl, key):
    """Index the dicts of data_dl by their value under key.

    Returns a dict {entry[key]: entry}; when several entries share the same
    value, the last one is kept.
    """
    return {entry_d[key]: entry_d for entry_d in data_dl}

# Create instrument_acronym mapping

In [None]:
# --- 2021/07/16 based on .xls
# Build instrument_acronym_map from the excel sheet: every value of column
# 'Unnamed: 2' maps to itself and every (space-stripped) value of column
# 'Unnamed: 1' maps to the value of column 'Unnamed: 2' on the same row.
# The first row of both columns is skipped.
# The assertion message now carries the path (print() returns None, so the
# message used to be empty).
assert os.path.exists(instrument_acronym_mapping_FILE), 'file not found: {}'.format(instrument_acronym_mapping_FILE)

df = pd.read_excel(instrument_acronym_mapping_FILE)
value1 = df['Unnamed: 1'].values[1:]
value2 = df['Unnamed: 2'].values[1:]
instrument_acronym_map = {}
for source_name, target_name in zip(value1, value2):
    instrument_acronym_map[target_name] = target_name
    instrument_acronym_map[source_name.strip(' ')] = target_name
instrument_acronym_map

In [None]:
# Disabled alternative (never executed because of `if False`): build
# instrument_acronym_map from a csv file instead of the excel sheet.
if False:
    # --- 2021/07/19: based on Polina csv
    instrument_acronym_map = {}
    with open(ROOT + 'orig2DTL_instruments.csv') as csvfile:
            myreader = csv.reader(csvfile, delimiter=',', quotechar='"')
            for row in myreader:
                # every non-empty cell from the second column on maps to the
                # value of the second column (row[1]) of the same row
                for num_col in range(1, len(row)):
                    if len(row[num_col]):
                        instrument_acronym_map[row[num_col]]  = row[1]
    instrument_acronym_map

### parse_performer_name

In [None]:
def parse_performer_names(performer_names_str, musician_map=None, instrument_map=None, acronym_map=None):
    """Split a performer string into (performer, instrument) entries.

    The input is a sequence of groups "name, name, ... (instr, instr, ...)"
    separated by commas, e.g.
    "Anthony Braxton (cl, bcl, bs, ss, as), Muhal Richard Abrams (p)".
    Each group gives one entry per (name, instrument) pair. Whatever follows
    the last ')' is ignored.

    Args:
        performer_names_str: the string to parse.
        musician_map: optional dict {performer name: Lord performer ID}.
        instrument_map: optional dict {instrument: Lord instrument ID}.
        acronym_map: optional dict used to normalise the instrument names;
            defaults to the notebook-level instrument_acronym_map.

    Returns:
        A tuple (musician_instrument_dl, not_correct_l):
        - musician_instrument_dl: list of dicts with the keys 'performer_name'
          and 'instrument', plus 'Lord_performer_ID' and 'Lord_instrument_ID'
          (0 when unknown) when both musician_map and instrument_map are dicts;
        - not_correct_l: the instrument names missing from acronym_map (one
          occurrence per performer they were listed for); these names are kept
          unchanged in the entries.

    Raises:
        ValueError: if a group does not contain exactly one '('.
    """
    if acronym_map is None:
        acronym_map = instrument_acronym_map
    with_lord_ids = isinstance(musician_map, dict) and isinstance(instrument_map, dict)

    musician_instrument_dl = []
    not_correct_l = []

    # --- every group ends with ')': the text after the last ')' is dropped
    for group_str in performer_names_str.split(')')[:-1]:
        musicians_str, instruments_str = group_str.lstrip(',').split('(')
        musician_str_l = [name.strip(' ') for name in musicians_str.split(',')]
        # --- strip the instruments too so that "cl,ts" or "cl , ts" are split correctly
        instrument_str_l = [name.strip() for name in instruments_str.split(',')]

        for musician_str in musician_str_l:
            for instrument_str in instrument_str_l:

                # --- NEW 2021/07/16/ instrument acronym mapping
                if instrument_str in acronym_map:
                    instrument_str = acronym_map[instrument_str]
                else:
                    print('key "{}" not in instrument map'.format(instrument_str))
                    not_correct_l.append(instrument_str)

                entry_d = {'performer_name': musician_str, 'instrument': instrument_str}
                if with_lord_ids:
                    entry_d['Lord_performer_ID'] = musician_map.get(musician_str, 0)
                    entry_d['Lord_instrument_ID'] = instrument_map.get(instrument_str, 0)
                musician_instrument_dl.append(entry_d)

    return musician_instrument_dl, not_correct_l

In [None]:
# quick check: one performer with several instruments, then one with a single instrument
parse_performer_names("Anthony Braxton (cl, bcl, bs, ss, as), Muhal Richard Abrams (p)")

In [None]:
# quick check: groups of several performers sharing the same instruments, with a trailing '.'
parse_performer_names("Shelton Hemphill, Wardell Jones, Ed Anderson (t), Harry White, Henry Hicks (tb), Castor McCord (cl, ts), Ted McCord (as, cl), Crawford Wethington (as, bar, cl), Edgar Hayes (p), Benny James (bj), Hayes Alvis (sb), Willie Lynch (d), George Morton (v), Nat Leslie (a).")

# Read style file

**Load**

- 1060 unique files
    - 400 from JE (Jazz Encyclopedia)
    - 660 AQA (from Illinois) 

In [None]:
# style_dl is the top-level structure: one dict per row of the style csv
# (',' separated); the following cells attach the other sources to its entries.
style_dl = convert_dict(read_csv(style_FILE, separator=','))

# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    print('number of entry: ', len(style_dl))
    print('number of unique key: ', len(get_unique(style_dl, 'file')))
    pp.pprint( style_dl[0:5] )

**Clean and move fields**

In [None]:
# Move the top-level 'style' field into a sub-dict stored under style_TKN.
for style_d in style_dl:
    style_d[style_TKN] = {'style': style_d['style']}
    remove_key(style_d, ['style'])

# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    pp.pprint( style_dl[0:5] )

# get audio file duration

In [None]:
# --- 2021/06/29: new to get the exact audio duration directly from the files
# When True, the durations are recomputed from the audio files and saved to
# audioduration_FILE; when False, they are read back from that json file.
do_parse_all_audio_files = False

if do_parse_all_audio_files:
    
    def get_audio_duration():
        # Return a list of {'file': full path, 'duration': seconds} for every
        # audio file matching audio_DIR1 or audio_DIR2.
        dict_l = []
        fullpathaudio_l = glob.glob(audio_DIR1) + glob.glob(audio_DIR2)
        for idx, fullpathaudio in enumerate(tqdm(fullpathaudio_l)):
            # sr=None: load at the file's own sampling rate
            y, sr = librosa.load(fullpathaudio, sr=None)
            duration = len(y)/sr
            dict_d = {'file':fullpathaudio, 'duration':duration}
            dict_l.append(dict_d)
        return dict_l

    audio_duration_dl = get_audio_duration()

    with open(audioduration_FILE, 'w') as fid:
        json.dump(audio_duration_dl, fid, indent=4)

else:
    with open(audioduration_FILE) as fid:
        audio_duration_dl = json.load(fid)

# --- CREATE A MAP
# audio ID = file name without directory, cut at its first '.'
map_duration_d = {}
for audio_duration_d in audio_duration_dl:
    audio_ID = audio_duration_d['file'].split('/')[-1].split('.')[0]
    map_duration_d[audio_ID] = audio_duration_d['duration']
    # print the durations below 100 (visual check of suspiciously short files)
    if audio_duration_d['duration']<100: print(audio_duration_d['duration'])

# --- DO THE MAPPING
# raises KeyError if a 'file' of style_dl has no entry in the duration map
for style_d in style_dl:
    if not 'metadata' in style_d.keys(): style_d['metadata'] = {}
    style_d['metadata']['track_duration'] = map_duration_d[ style_d['file'] ]
    
# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    pp.pprint(style_dl[0:5])

# list audio files

In [None]:
# Optional step: store, under audio_TKN, the full local path of the audio file
# of each entry.
if do_localaudio:
    # --- Disk seagate metal
    fullpathaudio_l = glob.glob(audio_DIR1) + glob.glob(audio_DIR2)
    
    # --- Get root
    # key = file name without directory, cut at its first '.'
    rootaudio_map = {}
    for fullpathaudio in fullpathaudio_l:
        rootaudio_map[ fullpathaudio.split('/')[-1].split('.')[0] ] = fullpathaudio
        
    # --- Map
    # raises KeyError if an entry has no local audio file
    for style_d in style_dl:
        style_d[audio_TKN] = rootaudio_map[style_d['file']]
    
    # +++++++++++++++++++++++++++++++++++++++
    if do_verbose:
        print('number of audio files: ', len(fullpathaudio_l) )
        pp.pprint( style_dl[0] )

# segmentation file

File
- DTL_1000_segmentations.csv

Each entry is a segment (annotated in melody or not).

- 7084 segments (all the segments, only some of them have a melody annotation)
- coming from 1060 unique files

In [None]:
# One dict per row of the segmentation csv (';' separated).
segment_dl = convert_dict( read_csv(segment_FILE, separator=';') )

# --- convert to float or int
for segment_d in segment_dl:
    # drop the '.csv' part of the 'file' value
    segment_d['file'] = segment_d['file'].replace('.csv', '')
    segment_d['onset'] = float(segment_d['onset'])
    segment_d['duration'] = float(segment_d['duration'])
    segment_d['total_duration'] = float(segment_d['total_duration'])
    segment_d['number_segments'] = int(segment_d['number_segments'])
    
# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    print('number of entry: ', len(segment_dl))
    print('number of unique key: ', len(get_unique(segment_dl, 'file')))
    pp.pprint(segment_dl[0])

## Mapping filename JE*** -> AQA***

**Problem:** segment_dl['file'] refers to both
- `AQAYpouYjFF0ZFWPdtmR8AI84pmCw8dD` and 
- `JE-2-047-06` files 

$\rightarrow$ we need to map the 'JE-2-047-06' to their equivalent 'AQA***'.

In [None]:
# display the 'file' value of one hard-coded entry
# NOTE(review): presumably index 6223 is one of the 'JE-...' entries -- confirm
segment_dl[6223]['file']

**We do this using the following filename-to-filename map**

In [None]:
# --- read map filename
mapfilename_dl = convert_dict( read_csv(mapfilename_FILE, separator=',') )

# --- create a map
# 'file' -> 'dtl_id_short'
mapfilename_map = {}
for mapfilename_d in mapfilename_dl:
    mapfilename_map[ mapfilename_d['file']] = mapfilename_d['dtl_id_short']

# --- convert
# replace the 'file' of every segment by its 'dtl_id_short'; segments whose
# 'file' is not in the map are reported and left unchanged
for segment_d in segment_dl:
    if segment_d['file'] in mapfilename_map.keys():
        print('converted file:', segment_d['file'] )   
        segment_d['file'] = mapfilename_map[segment_d['file']]
    else:
        print('X unable to map file "%s":' % (segment_d['file']) )   
        
# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    pp.pprint(mapfilename_dl[0])

## Mapping segment_dl -> style_dl

We attach segment_dl (the segments) to the style_dl (the top structure).

In [None]:
# do_replace=False: each entry gets, under segment_segment_TKN, the list of all
# the segments having the same 'file' value
style_dl = map_data(style_dl, 'file', segment_dl, 'file', segment_segment_TKN, do_replace=False)

# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    pp.pprint( style_dl[1] )

**Clean and move fields**

In [None]:
# Move the per-file fields ('file_id', 'total_duration', 'number_segments')
# from the segments of each entry to a per-file dict stored under
# segment_metadata_TKN.
for idx, style_d in enumerate(style_dl):
    # print() returns None, so the message is built as a string instead
    assert segment_segment_TKN in style_d.keys(), 'entry {} has no segment: {}'.format(idx, style_d)
    # --- created once per file (not once per segment) so that copy_key() does
    #     check that all the segments of a file agree on the per-file fields
    style_d[segment_metadata_TKN] = {}
    for segment_d in style_d[segment_segment_TKN]:
        copy_key(segment_d, style_d[segment_metadata_TKN], ['file_id', 'total_duration', 'number_segments'])
        #remove_key(segment_d, ['file', 'file_id', 'total_duration', 'number_segments'])
        remove_key(segment_d, ['file_id', 'total_duration', 'number_segments'])
        
# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    pp.pprint(style_dl[2])

# metadataANDsolo file

This file contains the meta-data (artist title, album title, and the link to the melody annotation file).

- 1685 segments with meta-data
- HOWEVER there are 1732 .csv files of melody annotation -> (1732-1685) of them do not have meta-data

In [None]:
# One dict per row of the metadata-and-solo csv (';' separated).
metadataANDsolo_dl = convert_dict( read_csv(metadataANDsolo_FILE, separator=';') )

# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    print('number of entry: ', len(metadataANDsolo_dl))
    pp.pprint(metadataANDsolo_dl[0])

## Add pitch-csv content to metadata

In [None]:
import os.path

# Optional step: attach to each solo, under pitch_TKN, the content of its
# pitch csv file (pitch_DIR + solo_id + '.csv'), one dict per row.
if do_pitchcsv:
    for metadataANDsolo_d in metadataANDsolo_dl:
        filename = pitch_DIR + metadataANDsolo_d['solo_id']  + '.csv'

        if os.path.exists(filename):
            pitch_dl = convert_dict( read_csv(filename, separator=',') )

            for pitch_d in pitch_dl:
                # typed copies of the fields under new names ...
                pitch_d['not_onset'] = float(pitch_d['onset'])
                pitch_d['not_duration'] = float(pitch_d['duration'])
                pitch_d['not_pitch'] = int(pitch_d['pitch'])
                pitch_d['is_phrase_start'] = int(pitch_d['phrasbeg'])
                # ... then removal of the original (string) fields
                pitch_d = remove_key(pitch_d, ['dtl_id_short', 'base_file', 'start', 'end', 'onset', 'duration', 'pitch', 'phrasbeg'])

            metadataANDsolo_d[pitch_TKN] = pitch_dl
        
        else:
            print('!!!', filename, 'does not exist')
            
    # +++++++++++++++++++++++++++++++++++++++
    if do_verbose:
        # no-op placeholder: the display is commented out
        True #pp.pprint(metadataANDsolo_dl[0])

## Mapping metadataANDsolo_dl -> style_dl

In [None]:
# --- the mapping is based on a short version of 'solo_id' 
#    -> temporarily store it as 'solo_id_short' (the first 32 characters of 'solo_id')
for metadataANDsolo_d in metadataANDsolo_dl: 
    metadataANDsolo_d['solo_id_short'] = metadataANDsolo_d['solo_id'][:32]
#    -> map (do_replace=False: each entry gets the list of its solos)
style_dl = map_data(style_dl, 'file', metadataANDsolo_dl, 'solo_id_short', metadataANDsolo_solo_TKN, do_replace=False)
#    -> then remove
for metadataANDsolo_d in metadataANDsolo_dl: 
    metadataANDsolo_d.pop('solo_id_short')
    
# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    pp.pprint(style_dl[1])

**Clean and move fields**

In [None]:
# For every entry having solos: move the per-file fields of the solos to a
# per-file dict stored under metadataANDsolo_metadata_TKN, and replace the
# 'solo_start' / 'solo_end' strings of each solo by numeric 'onset' / 'duration'.
for style_d in style_dl:
    
    
    if metadataANDsolo_solo_TKN in style_d.keys():
        style_d[metadataANDsolo_metadata_TKN] = {}
        for metadataANDsolo_d in style_d[metadataANDsolo_solo_TKN]:
            # NOTE: 'session_date' is listed twice; the second occurrence has no effect
            to_move_l = ['area','band_name','disk_title','session_date','leader_name','medium_record_number','medium_title','performer_names','session_date','track_title']
            # copy_key() asserts that all the solos of a file agree on these fields
            copy_key(metadataANDsolo_d, style_d[metadataANDsolo_metadata_TKN], to_move_l)
            remove_key(metadataANDsolo_d, to_move_l)

            onset = convert_string_to_time(metadataANDsolo_d['solo_start'])
            offset = convert_string_to_time(metadataANDsolo_d['solo_end'])
            metadataANDsolo_d['onset'] = onset
            metadataANDsolo_d['duration'] = offset - onset
            remove_key(metadataANDsolo_d, ['solo_start', 'solo_end'])

        
# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    pp.pprint(style_dl[1])

# 2020/01/26 Add files from Simon 

## Files coming from Illinois (AQA***) come with an extra set of meta-data stored as a set of json files.
This is therefore the case for 660/1060 files

In [None]:
# Gather the records of all the per-decade json files into one list: each file
# holds a list whose first element is a dict, and every value of that dict is
# appended as one record.
csv1960_dl = []
for csv1960_FILE in csv1960_FILE_l:
    with open(csv1960_FILE) as fid: 
        tmp_dl = json.load(fid)
    for key in tmp_dl[0].keys(): 
        csv1960_dl.append( tmp_dl[0][key] )

# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    # NOTE: the first two lines print the same value
    print('total nb files: ', len(csv1960_dl) )
    print('number of entry: ', len(csv1960_dl))
    print('number of unique key: ', len(get_unique(csv1960_dl, 'audioid')))
    pp.pprint(csv1960_dl[0])

In [None]:
def assign_key_value(input, key, value):
    """Set input[key] = value unless the key is already present.

    An existing value is never overwritten; if it differs from value, a
    message is printed.
    """
    if key not in input:
        input[key] = value
        return
    current = input[key]
    if not current==value:
        print('> key "{}" is already assigned to "{}", cannot assign it to "{}"'.format(key, current, value))

## create_musician_instrument_map

In [None]:
# --- 2021/06/29
# --- we need to get the dictionary

# --- 2021/07/15
# /Users/peeters/_work/_projet/_2020_DigThatLick/_dtl_sqlite/musicians.json

def create_musician_instrument_map(csv1960_dl):
    """Build the name -> ID maps of the musicians and of the instruments.

    Every record of csv1960_dl holds, under 'musician_instrument', a list of
    dicts with the keys 'musician_name', 'musician_id', 'instrument_name' and
    'instrument_id'. The first ID met for a name is kept (assign_key_value()
    reports later conflicting IDs).

    Returns:
        A tuple (musician name -> ID dict, instrument name -> ID dict).
    """
    name_to_musician_id = {}
    name_to_instrument_id = {}
    for record_d in csv1960_dl:
        for pair_d in record_d['musician_instrument']:
            assign_key_value(name_to_musician_id, pair_d['musician_name'], pair_d['musician_id'])
            assign_key_value(name_to_instrument_id, pair_d['instrument_name'], pair_d['instrument_id'])
    return name_to_musician_id, name_to_instrument_id


musician_map, instrument_map = create_musician_instrument_map(csv1960_dl)

In [None]:
# Rebuild musician_map from the json list of {'name': ..., 'id': ...} entries
# (this replaces the musician_map returned by create_musician_instrument_map()).
# A name met once maps to its id; a name met several times maps to the list of
# its ids.
with open(lord_sql_musiciansID_FILE) as fid:
    entry_dl = json.load(fid)

musician_map = {}
for entry_d in entry_dl:
    name = entry_d['name']
    if name not in musician_map:
        musician_map[name] = entry_d['id']
        continue
    # --- if already exist -> convert to list (whatever the type of the id,
    #     so that the append below never hits a bare, non-int id)
    if not isinstance(musician_map[name], list):
        musician_map[name] = [musician_map[name]]
    musician_map[name].append(entry_d['id'])
    

#key_l = [key for key in musician_map.keys()]
#for key in key_l:
#    pp.pprint( musician_map[ key ] )

**Map it**

In [None]:
# do_replace=True: each entry gets, under csv1960_TKN, the single record whose
# 'audioid' equals its 'file'; entries without a match do not get the key
style_dl = map_data(style_dl, 'file', csv1960_dl, 'audioid', csv1960_TKN, do_replace=True)

# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    pp.pprint(style_dl[0])

## *** NEW parse 'performer_names'

In [None]:
# --- 2021/06/29
# --- we need to split performer_names
# parse_performer_names() returns (entries, unknown instruments): only the
# entries are stored under 'performers_instruments' (the whole tuple used to be
# stored). Entries that cannot be parsed are reported with the error and
# left without a 'performers_instruments' field.
count=0
for idx, style_d in enumerate(style_dl):
    if metadataANDsolo_metadata_TKN in style_d.keys():
        metadata_d = style_d[metadataANDsolo_metadata_TKN]
        try:
            performer_dl, unknown_instrument_l = parse_performer_names(metadata_d['performer_names'], musician_map, instrument_map)
        except Exception as err:
            # best-effort: report and go on (KeyboardInterrupt is no longer swallowed)
            print(idx, metadata_d.get('performer_names'), '->', repr(err))
        else:
            metadata_d['performers_instruments'] = performer_dl
            

In [None]:
# --- to check the format
# print the metadata sub-dict of every entry that has one
for idx, style_l in enumerate(style_dl):
    if metadataANDsolo_metadata_TKN in style_l.keys():
        print(idx, '---------------------------------------------')
        pp.pprint(style_l[metadataANDsolo_metadata_TKN])

## NEW get origin and track_number_on_disk

In [None]:
# --- 2021/06/29
# Entries having a csv1960 record: 'origin' is the second '/'-separated part of
# the record's 'filename' and 'CD_track_number' is the text before the first
# space of its last part. The other entries get the origin 'jazz-encyclopedia'.
# style_d['metadata'] has to exist already (it is created by the audio duration cell).
# NOTE: origin_l and track_number_on_disk_l are not used in this cell.
origin_l = []
track_number_on_disk_l = []
for style_d in style_dl:
    if csv1960_TKN in style_d.keys():
        origin = style_d[csv1960_TKN]['filename'].split('/')[1]
        style_d['metadata']['origin'] = origin
        track_number_on_disk = style_d[csv1960_TKN]['filename'].split('/')[-1].split(' ')[0]
        style_d['metadata']['CD_track_number'] = track_number_on_disk
    else:
        style_d['metadata']['origin'] = 'jazz-encyclopedia'

# Original filepath

## NEW keep fingerprint field

In [None]:
# One dict per row of the original-file csv (',' separated).
new_dl = convert_dict( read_csv(originalfile_FILE, separator=',') )

# the 'fingerprint' field is dropped unless do_fingerprint is set
if not do_fingerprint:
    for new_d in new_dl:
        new_d = remove_key(new_d, ['fingerprint'])

# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    pp.pprint(new_dl)

In [None]:
# do_replace=True: each entry gets, under originalfile_TKN, the single row whose
# 'audio_id' equals its 'file'
style_dl = map_data(style_dl, 'file', new_dl, 'audio_id', originalfile_TKN, do_replace=True)

# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    pp.pprint(style_dl[1][originalfile_TKN])

## NEW get decade

In [None]:
# --- 2021/06/29: get decade
# the decade is the name of the directory holding the original file (the part
# before the last one of its '/'-separated 'filename'); every entry has to
# have an originalfile_TKN field, otherwise a KeyError is raised
# NOTE: decade_l is not used in this cell.
decade_l = []
for style_d in style_dl:
    decade = style_d[originalfile_TKN]['filename'].split('/')[-2]
    if not 'metadata' in style_d.keys(): style_d['metadata'] = {}
    style_d['metadata']['decade'] = decade

# MPAL: raw disk-info

**Load**

In [None]:
# One dict per row of the MPAL csv (',' separated).
mpal_dl = convert_dict( read_csv(mpal_FILE, separator=',') )

# remove the column whose header is empty (raises KeyError if there is none)
for mpal_d in mpal_dl: 
    mpal_d.pop('')

# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    print('number of entry: ', len(mpal_dl))
    print('number of unique key: ', len(get_unique(mpal_dl, 'Catalog #')))
    pp.pprint(mpal_dl[0:2])

**Map it**

In [None]:
# index the MPAL rows by their 'Catalog #' value (the last row wins for a repeated value)
mpal_map = convert_to_map(mpal_dl, 'Catalog #')

In [None]:
# Attach to each entry, under mpal_TKN, the MPAL row whose 'Catalog #' equals
# the 'labelid' of the entry's csv1960 record.
count1 = 0  # entries with a csv1960 record but no matching MPAL row
count2 = 0  # entries without a csv1960 record
count3 = 0  # entries with a matching MPAL row
for style_d in style_dl: 
    if csv1960_TKN in style_d.keys():
        # an empty csv1960 record is skipped and not counted
        if len(style_d[csv1960_TKN]):
            lookfor_key = style_d[csv1960_TKN]['labelid']
            if lookfor_key in mpal_map.keys():
                style_d[mpal_TKN] = mpal_map[lookfor_key]
                count3 = count3+1
            else:
                count1 = count1+1
    else:
        count2 = count2 + 1

# +++++++++++++++++++++++++++++++++++++++
if do_verbose:
    print('number of files w/o metadata19_targetkeyname: ', count2)
    print('number of files with metadata19_targetkeyname but no mpal: ', count1)
    print('number of matches', count3)

In [None]:
# visual check of one entry after the MPAL mapping
pp.pprint(style_dl[1])

# Meta-data for JE (< 1960)

2021/05/18


In [None]:
# Parse the JE index csv into JE_d: 'JE-<collection>-<cd>-<track>.wav' -> dict
# of the track's fields. The file mixes blank lines, collection lines, CD lines
# and track lines; the collection / CD / track numbers are not read from the
# file but counted from the order of the lines.
data_l = read_csv(JeCompleteIndex_FILE, separator=',')
header_l = data_l[0]

# csv column name -> key of the output entry; the keys starting with '-' are
# removed from each entry after the translation.
# NOTE(review): 'Title' and 'Track' both map to 'track_title', so the column
# that comes last in the header overwrites the other one -- confirm this is intended.
map_d = {'Aufnahmeort':'area', 
         'Artist':'band_name', 
         'Aufnahmedatum':'session_date', 
         'Title':'track_title', 
         'Besetzung':'performer_names',
         'CD': 'medium_record_number',
         'Track': 'track_title',
         'Boxnumber': '-Boxnumber',
         'Composer': '-Composer',
         '': '-'
        }
JE_d = {}
count_collection = 0
count_cd = 0
count_track = 0

# the kind of a line is decided from which of its columns 0, 1 and 3 are empty
for l in range(1, len(data_l)):
    if len(data_l[l][0])==0 and len(data_l[l][1])==0:
        # --- Blank line
        True
    elif len(data_l[l][0])>0 and len(data_l[l][3])==0:
        # --- Collection line
        # a new collection restarts the CD and track counters
        collection_name = data_l[l][0]
        count_collection += 1
        count_cd = 0
        count_track = 0
        #print('Collection', data_l[l])
    elif len(data_l[l][0])>0 and len(data_l[l][1])==0 and len(data_l[l][3])>0:
        # --- CD line
        # a new CD restarts the track counter
        #print('CD', data_l[l])
        cd_id = data_l[l][0]
        cd_name = data_l[l][3]
        count_cd += 1
        count_track = 0
    else:
        # --- Track
        # relies on collection_name and cd_name set by the preceding
        # collection / CD lines
        #print('Track', data_l[l])
        entry_d = {}
        for ll in range(len(header_l)):
            #entry_d[header_l[ll]] = data_l[l][ll]   # --- Original
            entry_d[map_d[header_l[ll]]] = data_l[l][ll] # --- Translated to English
        
        entry_d['medium_title'] = cd_name
        entry_d['disk_title'] = 'The Encyclopedia of Jazz, ' + collection_name
        
        # a date made of at least three '/'-separated parts a/b/c is rewritten as c-b-a
        tmp = entry_d['session_date'].split('/')
        if len(tmp)>2:
            entry_d['session_date'] = tmp[2] + '-' + tmp[1] + '-' + tmp[0]
        #entry_d['performer_names'] = ', '.join(sorted(entry_d['performer_names'].strip('.').split(', ')))
        
        
        entry_d.pop('-Boxnumber')
        entry_d.pop('-Composer')
        entry_d.pop('-')
        
        count_track += 1
        entry_d['CD_track_number'] = count_track
        entry_d['CD_disk_number'] = count_cd
        entry_d['collection_number'] = count_collection

        # zero-padded numbers: 3 digits for the CD, 2 digits for the track
        str_cd = '{0:3d}'.format(count_cd).replace(' ', '0')
        str_track = '{0:2d}'.format(count_track).replace(' ', '0')
        key = 'JE-{}-{}-{}.wav'.format(count_collection, str_cd, str_track)
        JE_d[key] = entry_d

        # NOTE(review): count_track has just been incremented from a value >= 0,
        # so this condition looks unreachable -- confirm.
        if count_track==0: print(key, '|', collection_name, '|', cd_id, cd_name)

In [None]:
# visual check: one JE entry (hard-coded position 100) and the list of all the keys
key_l = [key for key in JE_d.keys()]
idx = 100
print(key_l[idx], JE_d[key_l[idx]])
print(key_l)

In [None]:
# Attach the JE index entry, under JeCompleteIndex_TKN, to every entry whose
# original file name (last part of its 'filename') contains 'JE-'.
# Raises KeyError if that file name is not a key of JE_d.
for idx, style_d in enumerate(style_dl):
    tmp = style_d[originalfile_TKN]['filename'].split('/')[-1]
    if 'JE-' in tmp:
        style_d[JeCompleteIndex_TKN] = JE_d[tmp]

In [None]:
# visual comparison, for one hard-coded entry, of the JE index fields and of
# the fields coming from the metadata-and-solo csv
# NOTE(review): presumably entry 1058 is a JE file having solos -- confirm
pp.pprint(style_dl[1058][JeCompleteIndex_TKN])
pp.pprint(style_dl[1058][metadataANDsolo_metadata_TKN])

## *** NEW parse 'performer_names'

In [None]:
# --- 2021/06/29
# --- we need to split performer_names
# For every entry having a JE index record: store the parsed performers under
# 'performers_instruments' and collect, in not_correct_l, one list of unknown
# instrument names per entry. Records that cannot be parsed are reported with
# the error and left without a 'performers_instruments' field.
not_correct_l = []
count=0
for idx, style_d in enumerate(style_dl):
    if JeCompleteIndex_TKN in style_d.keys():
        je_d = style_d[JeCompleteIndex_TKN]
        try:
            performer_dl, unknown_instrument_l = parse_performer_names(je_d['performer_names'], musician_map, instrument_map)
        except Exception as err:
            # best-effort: report and go on (KeyboardInterrupt is no longer swallowed)
            print(idx, '----', je_d.get('performer_names'), '->', repr(err))
        else:
            je_d['performers_instruments'] = performer_dl
            not_correct_l.append(unknown_instrument_l)
            
# --- to check the format
style_dl[19][JeCompleteIndex_TKN]

In [None]:
def flatten(t):
    """Concatenate the items of all the sub-lists of t into a single list (one level only)."""
    flat_l = []
    for sublist in t:
        flat_l.extend(sublist)
    return flat_l
# display the distinct instrument names that were not found in the instrument map
set(flatten(not_correct_l))

# NEW Music-Brainz

In [None]:
# One dict per row of the MusicBrainz csv (',' separated); do_replace=True: each
# entry gets, under musicbrainz_TKN, the single row whose 'audio_id' equals its 'file'
new_dl = convert_dict( read_csv(musicbrainz_FILE, separator=',') )
style_dl = map_data(style_dl, 'file', new_dl, 'audio_id', musicbrainz_TKN, do_replace=True)

In [None]:
# visual check; raises KeyError if an entry did not get a MusicBrainz row
for style_d in style_dl:
    pp.pprint(style_d[musicbrainz_TKN])

# NEW renaming

In [None]:
def my_rename_fields(entry_l, old_field, new_field):
    """Rename (or delete) a field of one dict or of every dict of a list, in place.

    For each dict holding old_field:
    - new_field is '': old_field is deleted;
    - new_field is not yet present: the value is moved from old_field to new_field;
    - new_field is already present: the items of the old_field dict are added
      to the new_field dict (overwriting the items with the same key), then
      old_field is deleted.
    When new_field equals old_field, the field is kept.

    Args:
        entry_l: a dict, or a list of dicts.
        old_field: name of the field to rename.
        new_field: new name, or '' to delete the field.

    Returns:
        None.
    """
    entries = [entry_l] if type(entry_l) is not list else entry_l

    for entry in entries:
        if old_field not in entry.keys():
            continue

        if len(new_field):
            if new_field not in entry.keys():
                # --- new_field does not exist -> we create it
                entry[new_field] = entry[old_field]
            else:
                # --- there is already something in new_field -> we add
                for sub_key, sub_value in entry[old_field].items():
                    entry[new_field][sub_key] = sub_value

        if new_field != old_field:
            entry.pop(old_field)

In [None]:
# Index in style_dl of one example entry for each (origin, with / without solo) case:
# --- IL / solo      1
# --- IL / w/o solo  0
# --- JE / solo      19
# --- JE / w/o solo  103
# the loop body is a no-op placeholder: the print used to find these indices is commented out
for num in range(1200):
    True #print(num, style_dl[num]['metadata']['origin'], 'solo_#metadata_full_compressed_v10_csv' in style_dl[num].keys())

In [None]:
#style_dl = [style_dl[idx] for idx in [1, 0, 19, 103]]

In [None]:
# Rename (or drop) fields so that every entry follows the final naming scheme.
# my_rename_fields(d, old, new) moves d[old] to d[new]; an empty `new` only removes d[old].
# Categories are addressed through the *_TKN constants of the configuration instead of
# hard-coded, versioned strings ('..._segmentations3_csv', '..._v10_csv'), which no longer
# matched the configured keys and made the corresponding branches silently do nothing.
for style_d in style_dl:
    my_rename_fields(style_d, 'file', 'audio-ID')

    my_rename_fields(style_d, style_TKN, '1_metadata')

    # --- MusicBrainz metadata (guarded like every other category)
    category = musicbrainz_TKN
    if category in style_d:
        my_rename_fields(style_d[category], 'audio_id', '')
        my_rename_fields(style_d, category, '2_metadata')

    ###style_d['metadata']['fingerprint'] = style_d['filename_original_#id_dtl1000-csv']['fingerprint']
    my_rename_fields(style_d, 'filename_original_#id_dtl1000-csv', '')

    # --- segmentation: segments
    category = segment_segment_TKN
    if category in style_d:
        my_rename_fields(style_d, category, 'structure')
        my_rename_fields(style_d['structure'], 'file', '')
        my_rename_fields(style_d['structure'], 'onset', 'segment_start')
        my_rename_fields(style_d['structure'], 'duration', 'segment_duration')
        my_rename_fields(style_d['structure'], 'instrument', 'solo_instrument')
        my_rename_fields(style_d['structure'], 'num_instruments', 'solo_instrument_count')
        my_rename_fields(style_d['structure'], 'main_type', 'segment_category')
        my_rename_fields(style_d['structure'], 'has_improvised', 'is_improvised')
        my_rename_fields(style_d['structure'], 'usable', 'is_usable')
        my_rename_fields(style_d['structure'], 'segment_pos', 'segment_number')
        my_rename_fields(style_d['structure'], 'segment_id', 'segment_ID')

    # --- segmentation: metadata
    category = segment_metadata_TKN
    if category in style_d:
        my_rename_fields(style_d[category], 'number_segments', 'segment_count')
        my_rename_fields(style_d[category], 'total_duration', '')
        my_rename_fields(style_d[category], 'file_id', 'dtl1000_file_number')
        my_rename_fields(style_d, category, 'metadata_#from-structure')

    # --- only for entries with a solo
    category = metadataANDsolo_solo_TKN
    if category in style_d:
        my_rename_fields(style_d, category, 'solo')
        my_rename_fields(style_d['solo'], 'solo_id', 'solo_ID')
        my_rename_fields(style_d['solo'], 'possible_solo_performer_names', '')
        my_rename_fields(style_d['solo'], 'instrument_label', 'solo_instrument')
        my_rename_fields(style_d['solo'], 'onset', 'solo_start')
        my_rename_fields(style_d['solo'], 'duration', 'solo_duration')
        my_rename_fields(style_d['solo'], 'solo_transcription', '') ###

    # --- only for entries with a solo
    category = metadataANDsolo_metadata_TKN
    if category in style_d:
        my_rename_fields(style_d[category], 'area', 'session_location')
        my_rename_fields(style_d[category], 'disk_title', 'album_title')
        my_rename_fields(style_d[category], 'leader_name', 'band_leader')
        my_rename_fields(style_d[category], 'medium_record_number', 'CD_disc_number')
        my_rename_fields(style_d[category], 'medium_title', 'CD_disc_title')
        my_rename_fields(style_d[category], 'performer_names', 'performers_instruments_unparsed')
        my_rename_fields(style_d, category, 'metadata_#from-solo')

    # --- only for Illinois
    category = 'metadata_#19**s.csv_110_musinstr_json'
    if category in style_d:
        my_rename_fields(style_d[category], 'album', '')
        my_rename_fields(style_d[category], 'labelid', '')
        #my_rename_fields(style_d[category], 'trackname', '')
        my_rename_fields(style_d[category], 'release_id', 'Lord_release_ID')
        my_rename_fields(style_d[category], 'session_full_id', 'Lord_session_ID')
        my_rename_fields(style_d[category], 'session_id', '')
        my_rename_fields(style_d[category], 'tune_id', 'Lord_tune_ID')
        my_rename_fields(style_d[category], 'filename', '')
        #my_rename_fields(style_d[category], 'title', '')
        my_rename_fields(style_d[category], 'segmentid', '')
        my_rename_fields(style_d[category], 'audioid', '')
        my_rename_fields(style_d[category], 'musician_instrument', 'performers_instruments')
        my_rename_fields(style_d[category], 'time_location', 'session_location_date')
        my_rename_fields(style_d[category], 'track_id', 'track_ID')
        my_rename_fields(style_d, category, 'metadata_#from-1960csv')

    # --- only for Illinois
    category = 'metadata_#MPAL_CD_List_csv'
    if category in style_d:
        my_rename_fields(style_d[category], 'Catalog #', 'CD_ID')
        my_rename_fields(style_d[category], 'Artist', 'CD_artist')
        my_rename_fields(style_d[category], 'Album Title', 'CD_title')
        my_rename_fields(style_d[category], '# Discs', 'CD_disc_count')
        my_rename_fields(style_d[category], '2621', '')
        my_rename_fields(style_d[category], '30377', '')
        my_rename_fields(style_d, category, 'metadata_#from-MPAL')

    # --- only for Jazz-Encyclopedia
    category = JeCompleteIndex_TKN
    if category in style_d:
        #my_rename_fields(style_d[category], 'track_title', '')
        my_rename_fields(style_d[category], 'medium_record_number', '')
        #my_rename_fields(style_d[category], 'band_name', '')
        # 'session_location' (was misspelled 'sesson_location'): same name as in the solo metadata
        my_rename_fields(style_d[category], 'area', 'session_location')
        #my_rename_fields(style_d[category], 'session_date', '')
        my_rename_fields(style_d[category], 'performer_names', 'performers_instruments_unparsed')
        my_rename_fields(style_d[category], 'medium_title', 'CD_disc_title')
        my_rename_fields(style_d[category], 'disk_title', '')
        my_rename_fields(style_d[category], 'collection_number', '')
        my_rename_fields(style_d, category, 'metadata_#from-JE-complete')

In [None]:
# Pretty-print the entry currently bound to style_d (the last one visited by the most
# recently executed loop) to check the result of the renaming.
pp.pprint(style_d)

# >> Save the main file

Sort the fields of each entry in alphabetical order

In [None]:
from collections import OrderedDict

# Rebuild every entry with its top-level keys in alphabetical order. The list is updated
# in place (slice assignment), so other references to style_dl see the sorted entries too.
style_dl[:] = [OrderedDict(sorted(style_d.items())) for style_d in style_dl]

In [None]:
from datetime import datetime

# Timestamp of this run, formatted like "2021-12-20-14h-03m-59s"
now = datetime.now()
dt_string = "{:%Y-%m-%d-%Hh-%Mm-%Ss}".format(now)

In [None]:
# Bundle the collection with its version stamp and write everything to output_FILE as JSON
store_d = {'version': dt_string, 'collection': style_dl}
with open(output_FILE, 'w') as out_fid:
    json.dump(store_d, out_fid, indent=4)

In [None]:
# Number of entries in style_dl
len(style_dl)

In [None]:
# NOTE(review): undefined name - evaluating it raises NameError. Presumably a deliberate
# stopper so that "Run All" halts here, before the exploratory cells below - confirm.
zadazdazdz

# Some stats

In [None]:
# Keep only the entries whose original filename contains '/JE-', then display how many there are.
# NOTE(review): the renaming cell removes 'filename_original_#id_dtl1000-csv' from every
# entry, so this lookup raises KeyError once that cell has been run - confirm the intended order.
sub_dl = [style_d for style_d in style_dl if '/JE-' in style_d['filename_original_#id_dtl1000-csv']['filename']]
len(sub_dl)

In [None]:
# Count how many entries of sub_dl contain a given category key.
# `key` is re-assigned several times: only the LAST uncommented assignment takes effect;
# move or comment lines to test another key.
# The trailing "x/400 - y/660" figures look like hand-recorded counts from earlier runs - TODO confirm.
key = 'filename_original_#id_dtl1000-csv' # 400/400 - 660/660

key = 'metadata_#19**s.csv_110_musinstr_json'   # 0/400 - 660/660
key = 'metadata_#MPAL_CD_List_csv' # 0/400 - 660/660

key = 'segment_#DTL_1000_segmentations_csv' # 400/400 - 660/660

key = 'metadata_#metadata_full_compressed_v9_csv'
#key = 'segment_solo_#metadata_full_compressed_v7_csv'

# Entries of sub_dl that have `key`, and how many they are
select_dl = [style_d for style_d in sub_dl if key in style_d.keys()]
len(select_dl)
#select_dl[0]

# Generate description

In [None]:
fillin = '-----'
def describe(data, fillin_tot = ''):
    """Print an indented outline of the structure of nested lists / dicts.

    Args:
        data: the object to describe.
            - list: prints '[:]' and describes only its first element (if any);
            - dict (including subclasses such as OrderedDict): prints one line per key,
              followed by the value when it is a str / float / int (bool included),
              and recurses into any other value;
            - anything else: prints nothing.
        fillin_tot: indentation prefix of the current nesting level; one `fillin`
            is appended for every level of recursion.

    Returns:
        None
    """
    # isinstance (not `type(...) is ...`) so that the OrderedDict entries produced when
    # the fields are sorted are still recognised as dicts.
    if isinstance(data, list):
        print("{}{}".format(fillin_tot, '[:]'))
        if data:
            describe(data[0], fillin_tot + fillin)
    elif isinstance(data, dict):
        for key, value in data.items():
            if isinstance(value, (str, float, int)):
                print("{}'{}'\t{}".format(fillin_tot, key, value))
            else:
                print("{}'{}'".format(fillin_tot, key))
                describe(value, fillin_tot + fillin)

In [None]:
# Describe every entry whose first 'segment_solo_#metadata_full_compressed_v7_csv' item
# has a non-empty 'solo_performer_name'
for idx, style_d in enumerate(style_dl):
    if 'segment_solo_#metadata_full_compressed_v7_csv' not in style_d:
        continue
    first_solo_d = style_d['segment_solo_#metadata_full_compressed_v7_csv'][0]
    if len(first_solo_d['solo_performer_name']) == 0:
        continue
    print('')
    print(idx)
    describe(style_d)

In [None]:
# Print the structure outline of entry 29
describe(style_dl[29])

In [None]:
# Display entry 1 for inspection
style_dl[1]

In [None]:
# Scratch check: displays the type of the integer literal 61
type(61)