In [None]:
from __future__ import division

import json, pickle
from glob import glob
from os.path import exists, expanduser
from scipy.stats import variation, skew, kurtosis

In [None]:
%matplotlib tk
rcParams['figure.figsize'] = (16.0, 9.0)

home_dir = expanduser("~")
features_dir = home_dir + '/data/datasets/20-raagas/features/'
dataset = json.load(file('{0}/data/datasets/20-raagas/dataset.json'.format(home_dir)))

# Svara positions

In [None]:
# Make the local evaluation helpers importable before importing them.
# NOTE(review): `sys` is not imported in any cell of this notebook;
# presumably it is provided by the kernel's start-up namespace -- confirm.
sys.path.append(home_dir+"/workspace/PhD/intonation/evaluation")
import svara_positions as sp
# Reload so that edits made to svara_positions.py since the first import take effect.
reload(sp)

In [None]:
# Read both lookup tables; the context managers close the files right after
# parsing instead of leaking the handles.
with open('{0}/data/raaga-svara.json'.format(home_dir)) as raaga_svara_file:
    raaga_svara = json.load(raaga_svara_file)
with open('{0}/data/svara-cents.json'.format(home_dir)) as svara_cents_file:
    svara_cents = json.load(svara_cents_file)

# Raaga id -> proper name and the cent positions of the svaras it uses.
ground_truth = {}
for raaga in raaga_svara:
    # A svara that occurs in both the arohana and the avarohana is counted once.
    svaras = set(raaga['arohana']).union(raaga['avarohana'])
    ground_truth[raaga['rid']] = {'name': raaga['proper_name'],
                                  'svara_positions': [svara_cents[svara] for svara in svaras]}

## Histograms

In [None]:
histogram_truth = {}

for rid, rdata in dataset.items():
    histogram_truth[rid] = {}
    for work_id, work_info in rdata['works'].items():
        for rec in work_info['recordings']:
            mbid = rec['mbid']
            try:
                params = pickle.load(file('{0}/histograms/{1}-params.pickle'.format(features_dir, mbid)))
            except IOError:
                print mbid
            positions = [i for i in params.keys() if 0 <= i < 1200]
            histogram_truth[rid][mbid] = sorted(positions)

In [None]:
histogram_eval = sp.compare_to_groundtruth(histogram_truth, ground_truth)
[p, r, f] = sp.eval_measures(histogram_eval)
print p, r, f

### Debugging

**Notes**:

* Tonic errors are shifting all the peaks to lower/higher octaves
 - Non-octave error
   * 0a33478e-d028-4f04-83a0-a9363adae895
   * 64a4af23-ad15-4768-a9f1-252145b04dae
 - f2dc7763-0166-4fe9-830c-53b7953c2998 (very weird)
 - e151896b-73ff-402f-a1bd-f64d5d99d3a6 (also weird)
 - d42249e4-8889-4b34-a963-c99a674ddcf0
 - d2029d27-2e5f-42d1-9718-debf165f74fd
 - 43737160-f73a-4cc9-bfd6-be3a77600663
 - f70ca71a-af1f-400e-a597-1a409500a946
 - e27d0605-1426-45a4-b34f-a9520742cf53
 - and more...
* Unidentified bumps, *a limitation*
 - d2387cfd-6996-4201-a4c0-06c37043e5a3 (at 315 cents)
 - caa8efc0-cc5d-4cf6-9a83-96ca53721ffe (315)
* Error peaks due to liberal thresholds. *This needs more fine-tuning: redo the grid search from JNMR 2014. A value of 1e-04 for both thresholds seems to work better, though a few valid peaks go missing (which can be compensated for by choosing one peak per svara across octaves).*
 - A change in how we normalize has a significant impact on this, try various ways to normalize.
 - 70c49f3d-c339-4c4d-b29c-e38120711322 (both peak_amp_thresh and valley_thresh)
 - ecb3c485-ee24-4035-8a8a-3629b0a87673 (many peaks)
* In Atana (rid: 6), there are peaks at Anya svaras (G2, N2) and often the peaks at svaras G3 and R3 are missing
 - This is one possible conclusion that can be drawn from the recurrent false positives
 - Asaveri (rid: 5) might be the same
* Raaga mislabels
* Why doesn't the second bump get detected as a peak, when it has a better valley_thresh and peak_amp_thresh than the other one, which is recognized? See 89f21abd-ef27-4856-9553-dc0d194b0c7e

In [None]:
%matplotlib inline
rcParams['figure.figsize'] = (16,8)

ji_intervals = json.load(file("{0}/data/svara-cents.json".format(home_dir)))
ji_intervals = unique(sorted(ji_intervals.values()))
ji_intervals = append(ji_intervals, ji_intervals[-12:]+1200)
ji_intervals = append(ji_intervals, ji_intervals[-12:]+1200)
ji_intervals = append(ji_intervals[:12]-1200, ji_intervals)

In [None]:
# Load the stored histogram object of one recording and plot it against the
# interval grid. Binary mode plus a context manager reads the pickle safely
# and closes the file afterwards.
with open(features_dir + 'histograms/cb280397-4e13-448e-9bd7-97105b2347dc.pickle', 'rb') as histogram_file:
    data = pickle.load(histogram_file)
data.plot(intervals=ji_intervals)

## Context-based

In [None]:
context_based_truth = {}
positives = []
negatives = []

for rid, rdata in dataset.items():
    rid = int(rid)
    context_based_truth[rid] = {}
    for work_id, work_info in rdata['works'].items():
        for rec in work_info['recordings']:
            mbid = rec['mbid']
            pitchclass_counts = sp.get_counts(mbid)
            if pitchclass_counts is None:
                print mbid
                continue
            #Bound based on the significance of reletive counts
            pos, neg = sp.segregate(pitchclass_counts, ground_truth[rid]['svara_positions'])
            positives.extend(pos)
            negatives.extend(neg)
            
            #The bound is somewhere around 0.25 normalized by max count.
            positions = sp.get_legit_svaras(pitchclass_counts, norm_count_thresh=0.25)
            context_based_truth[rid][mbid] = positions

In [None]:
context_based_eval = sp.compare_to_groundtruth(context_based_truth, ground_truth)
[p, r, f] = sp.eval_measures(context_based_eval)
print p, r, f

### Debugging

In [None]:
rid = 2
print ground_truth[rid]['name']

tps = []
fps = []
fns = []

for rec in histogram_eval[rid]:
    print rec[0], '\t',
    for s in rec[1]:
        print len(s), '\t',
    tps.append(len(rec[1][0]))
    fps.append(len(rec[1][1]))
    fns.append(len(rec[1][2]))
    print

tps = sum(tps)
fps = sum(fps)
fns = sum(fns)
p = tps/(tps+fps)
r = tps/(tps+fns)
print 'Precision: ', p
print 'Recall: ', r
print 'F1-score: ', 2*p*r/(p+r)

# Raaga classification

In [None]:
def get_svara_data(pickle_file, pitch_data):
    """Collect, for every svara, the pitch values covered by its contours.

    pickle_file: path of a pickled dict that maps a svara key to a sequence of
        (start, end) row-index pairs into pitch_data.
    pitch_data: 2-D array; the pitch values are read from column 1.

    Returns a dict mapping each svara key to a one-element list that holds the
    concatenation of all its contour segments with infinite values removed.
    Returns False when pickle_file cannot be opened (IOError).
    """
    try:
        # Binary mode is the safe way to read pickles and the context manager
        # closes the handle even when unpickling fails.
        # NOTE(review): pickle.load executes arbitrary code from the file --
        # only use this on files produced by this project.
        with open(pickle_file, 'rb') as contour_file:
            contours = pickle.load(contour_file)
    except IOError:
        #print pickle_file + 'not found!'
        return False

    svara_data = {}
    for svara, segments in contours.items():
        pitch_contours = []
        for segment in segments:
            values = pitch_data[segment[0]:segment[1], 1]
            # Drop +/-inf entries before pooling the segment.
            pitch_contours.append(values[invert(isinf(values))])
        # Wrapped in a list: get_parameters calls the scipy.stats functions
        # with axis=1, i.e. on a single-row 2-D input.
        svara_data[svara] = [concatenate(pitch_contours)]

    return svara_data

In [None]:
def get_parameters(svara_data):
    """Summarise the pitch distribution of every svara with a few statistics.

    svara_data: dict mapping a svara key to its pitch values. The scipy.stats
        functions are called with axis=1 and their first result is used, so
        each value is assumed to be a one-element list holding a 1-D array
        (the shape get_svara_data produces) -- TODO confirm for other callers.

    Returns a dict mapping each svara key to a dict with the keys 'position',
    'mean', 'amplitude', 'variance', 'skew1', 'skew2' and 'kurtosis'. Svaras
    whose key is below 0 or not below 1200 are left out.

    NOTE(review): 'variance' is computed with scipy.stats.variation, which is
    the coefficient of variation rather than the variance, and 'skew2' divides
    by the square root of that value -- confirm this is intended.
    """
    parameters = {}
    for svara, distribution in svara_data.items():
        if svara >= 1200 or svara < 0:
            continue

        # 1200-bin histogram over the value range; bin centres are the
        # midpoints of neighbouring edges.
        counts, edges = np.histogram(distribution, bins=1200)
        centers = (edges[:-1] + edges[1:])/2.0

        # Centre of the fullest bin.
        mode_position = centers[argmax(counts)]
        mean_value = float(mean(distribution))
        variation_value = float(variation(distribution, axis=1)[0])
        skew_value = float(skew(distribution, axis=1)[0])
        kurtosis_value = float(kurtosis(distribution, axis=1)[0])

        scaled_offset = 3.0 * (mean_value - mode_position)
        pearson_value = float(scaled_offset / np.sqrt(abs(variation_value)))

        stats = {}
        stats["position"] = float(mode_position)
        stats["mean"] = mean_value
        stats["amplitude"] = float(max(counts))
        stats["variance"] = variation_value
        stats["skew1"] = skew_value
        stats["skew2"] = pearson_value
        stats["kurtosis"] = kurtosis_value
        parameters[svara] = stats

    # Rescale the amplitudes so that they add up to one over the kept svaras.
    amplitude_total = sum([stats["amplitude"] for stats in parameters.values()])
    for stats in parameters.values():
        stats["amplitude"] = stats["amplitude"]/amplitude_total

    return parameters

## Compute parameters for Varnam dataset

In [None]:
# Root directory of the Varnam dataset; the commented-out line is an
# alternative location of the same data.
# NOTE(review): both are absolute, machine-specific paths.
#data_dir = '/homedtic/gkoduri/data/intonation/varnam-analysis/recorded/audio/
data_dir = '/home/gkoduri/Dropbox/UPF-Work/PhD/Varnam Analysis/data/audioScoreAlignment/'
# Raagas to process; the cells below enter each one as a sub-directory of data_dir.
raagas = ['abhogi', 'begada', 'kalyani', 'mohanam', 'sahana', 'saveri', 'shree']

In [None]:
chdir(data_dir)
methods = ['semiautomatic', 'context_based']

for raaga in raagas:
    print raaga
    chdir(raaga)
    
    artists = listdir('.')
    artists = [a for a in artists if isdir(a)]
    
    # For each artist, for each svara, aggregate all the pitch values corresponding to the contour indices
    for artist in artists:
        print artist, 
        
        try:
            pitch_data = loadtxt(data_dir + raaga + '/' + artist + '/' + artist + '-cents.txt')
        except(IOError):
            continue
        
        for method in methods:
            pickle_file = data_dir+raaga+'/'+artist+'/contours_'+method+'.pickle'
            svara_data = get_svara_data(pickle_file, pitch_data)
            if not svara_data: continue
            
            parameters = get_parameters(svara_data)
            
            if not exists(data_dir + '../parameters/' + method):
                mkdir(data_dir + '../parameters/' + method)    
            pickle.dump(parameters, file(data_dir + '../parameters/' + method + '/' + raaga + '_' + artist + '.pickle', 'w'))
    print
    chdir('..')

In [None]:
chdir(data_dir)
method = 'phrase_aligned'

for raaga in raagas:
    print raaga
    chdir(raaga)
    
    artists = listdir('.')
    artists = [a for a in artists if isdir(a)]
    
    # For each artist, for each svara, aggregate all the pitch values corresponding to the contour indices
    for artist in artists:
        print artist,
        try:
            chdir('{0}/contours_phrase_aligned'.format(artist))
        except(OSError):
            print ' .. does not have contours!'
            continue

        try:
            pitch_data = loadtxt(data_dir + raaga + '/' + artist + '/' + artist + '-cents.txt')
        except(IOError):
            print '.. does not have pitch file!'
            continue
            
        pickled_files = glob('*')
        
        for f in pickled_files:
            pickle_file = data_dir+raaga+'/'+artist+'/contours_phrase_aligned/'+f
            svara_data = get_svara_data(pickle_file, pitch_data)
            if not svara_data: 
                print ' .. wrong!'
                continue
            
            parameters = get_parameters(svara_data)
            
            if not exists(data_dir + '../parameters/' + method):
                mkdir(data_dir + '../parameters/' + method)
            if not exists(data_dir + '../parameters/' + method + '/' + f[:-7]):
                mkdir(data_dir + '../parameters/' + method + '/' + f[:-7])
            pickle.dump(parameters, file(data_dir + '../parameters/' + method + '/' + f[:-7] + '/' + raaga + '_' + artist + '.pickle', 'w'))
        
        chdir('../..')
        
    print
    chdir('..')

In [None]:
def toWeka(pathGiven, svara_labels_file=None):
    """Write the parameter pickles found in a directory as one Weka ARFF file.

    pathGiven: directory with '<label>_<anything>.pickle' files, each holding a
        dict that maps a svara key to a dict of parameter values. The ARFF
        file is written next to that directory as '<directory name>.arff'.
    svara_labels_file: pickled dict whose keys are the svaras to export;
        defaults to the cents_to_svara_labels.pickle of the Varnam data.

    Every row holds the seven parameters of each svara (in sorted svara order)
    followed by the class label, which is the part of the file name before the
    first underscore. Svaras missing from a file are written as -10000.

    Note: a later cell of this notebook defines another toWeka with a
    different signature, which replaces this one once that cell has run.
    """
    if svara_labels_file is None:
        svara_labels_file = '/home/gkoduri/Dropbox/UPF-Work/PhD/Varnam Analysis/data/cents_to_svara_labels.pickle'

    # With a single argument the parenthesised form prints exactly what the
    # plain print statement prints.
    print(pathGiven)
    parts = pathGiven.rstrip("/").split("/")
    parentDir = "/".join(parts[:-1])
    arffFilename = "/".join([parentDir, parts[-1] + ".arff"])

    all_labels = ['abhogi', 'begada', 'kalyani', 'mohanam', 'sahana', 'saveri', 'shree']
    with open(svara_labels_file, 'rb') as labels_file:
        svaras = pickle.load(labels_file)
    svaras = sort(list(svaras.keys()))
    params = ['position', 'mean', 'amplitude', 'variance', 'skew1', 'skew2', 'kurtosis']

    # Header: one REAL attribute per (svara, parameter) pair, then the class.
    header = "@RELATION raagaID\n\n"
    header += "".join("@ATTRIBUTE " + p + '_' + str(s) + " REAL\n"
                      for s in svaras for p in params)
    header += "\n@ATTRIBUTE segment {" + ", ".join(all_labels) + "}\n\n@DATA\n"

    missing_values = ['-10000'] * len(params)
    extension = ".pickle"

    # The context managers guarantee that the ARFF file is flushed and that
    # every pickle handle is closed, also when a file cannot be read.
    with open(arffFilename, "w") as wekafile:
        wekafile.write(header)

        for fname in listdir(pathGiven):
            if not fname.endswith(extension):
                continue
            class_label = fname[:-len(extension)].split('_')[0]
            with open(pathGiven + "/" + fname, 'rb') as params_file:
                parameters = pickle.load(params_file)

            #write the data points
            values = []
            for s in svaras:
                if s in parameters:
                    values.extend(str(parameters[s][p]) for p in params)
                else:
                    values.extend(missing_values)
            values.append(class_label)
            wekafile.write(", ".join(values) + "\n")

In [None]:
# Export the context-based Varnam parameters to ../parameters/context_based.arff.
toWeka(data_dir + '../parameters/context_based')

In [None]:
# Export the semiautomatic Varnam parameters to ../parameters/semiautomatic.arff.
toWeka(data_dir + '../parameters/semiautomatic')

In [None]:
# Every entry below parameters/phrase_aligned is one alignment configuration
# (one sub-directory per contour file stem, created by the cell that computes
# the phrase-aligned parameters).
configs = glob('{0}/../parameters/phrase_aligned/*'.format(data_dir))

# Export each configuration to its own ARFF file.
for config in configs:
    toWeka(config)

## Compute parameters for Kriti dataset

In [None]:
from os.path import basename

In [None]:
# Locations of the Kriti dataset features.
# NOTE(review): absolute, machine-specific paths.
# feat_dir is the feature root; the pitch files are read from its pitch/ sub-directory.
feat_dir = '/homedtic/ssenturk/experiments/20-raagas/features/'
# Directory that is searched for the contour pickles.
contour_dir = '/homedtic/ssenturk/experiments/20-raagas/features/phrase_aligned/'
# Not referenced by any other cell of this notebook.
align_dir = '/homedtic/ssenturk/experiments/20-raagas/features/noteAlignments/dtw_100centBinarization_kmeans/'

In [None]:
# Full paths of all pickle files in contour_dir.
contour_files = glob('{0}/*.pickle'.format(contour_dir))

In [None]:
for f in contour_files:
    if exists('{0}/{1}'.format(contour_dir, f.replace('contours', 'params'))):
        continue
        
    mbid = basename(f)[:-16]
    try:
        pitch_data = loadtxt('{0}/pitch/{1}.txt'.format(feat_dir, mbid))
    except(IOError):
        print mbid, 'does not have pitch file!'
        continue
    svara_data = get_svara_data(f, pitch_data)
    if not svara_data: 
        print f, 'cannot be read!'
        continue

    parameters = get_parameters(svara_data)
    pickle.dump(parameters, file(f.replace('contours', 'params'), 'w'))
    print f, 'is successful'

In [None]:
from os import listdir
from os.path import basename
from glob import glob

In [None]:
def toWeka(pathGiven, mbids, mbid_raagas, svara_cents_file=None):
    """Write the parameter pickles of the given recordings as one Weka ARFF file.

    pathGiven: directory holding one '<mbid>-params.pickle' per recording, each
        a dict that maps a svara key to a dict of parameter values. The ARFF
        file is written next to that directory as '<directory name>.arff'.
    mbids: recording ids to export, one row each, in the given order.
    mbid_raagas: dict mapping a recording id to its class label.
    svara_cents_file: JSON file mapping svara names to cent values; defaults to
        /homedtic/gkoduri/data/svara-cents.json. The distinct values within
        [0, 1200) define the exported svaras.

    Every row holds the seven parameters of each svara (in ascending order)
    followed by the class label. Svaras missing from a file are written as
    -10000. A missing parameter file raises IOError.

    Note: this definition replaces the single-argument toWeka defined in an
    earlier cell of this notebook.
    """
    if svara_cents_file is None:
        svara_cents_file = '/homedtic/gkoduri/data/svara-cents.json'

    # With a single argument the parenthesised form prints exactly what the
    # plain print statement prints.
    print(pathGiven)
    parts = pathGiven.rstrip("/").split("/")
    parentDir = "/".join(parts[:-1])
    arffFilename = "/".join([parentDir, parts[-1] + ".arff"])

    # Class labels: the distinct labels of the exported recordings, sorted.
    all_labels = sorted(unique([mbid_raagas[mbid] for mbid in mbids]))

    with open(svara_cents_file) as cents_file:
        svaras = array(list(json.load(cents_file).values()))
    svaras = svaras[svaras >= 0]
    svaras = svaras[svaras < 1200]
    svaras = sort(unique(svaras))
    params = ['position', 'mean', 'amplitude', 'variance', 'skew1', 'skew2', 'kurtosis']

    # Header: one REAL attribute per (svara, parameter) pair, then the class.
    header = "@RELATION raagaID\n\n"
    header += "".join("@ATTRIBUTE " + p + '_' + str(s) + " REAL\n"
                      for s in svaras for p in params)
    header += "\n@ATTRIBUTE segment {" + ", ".join(all_labels) + "}\n\n@DATA\n"

    missing_values = ['-10000'] * len(params)
    extension = "-params.pickle"

    # The context managers guarantee that the ARFF file is flushed and that
    # every pickle handle is closed, also when a file cannot be read.
    with open(arffFilename, "w") as wekafile:
        wekafile.write(header)

        for mbid in mbids:
            fname = '{0}/{1}{2}'.format(pathGiven, mbid, extension)
            class_label = mbid_raagas[mbid]
            with open(fname, 'rb') as params_file:
                parameters = pickle.load(params_file)

            #write the data points
            values = []
            for s in svaras:
                if s in parameters:
                    values.extend(str(parameters[s][p]) for p in params)
                else:
                    values.extend(missing_values)
            values.append(class_label)
            wekafile.write(", ".join(values) + "\n")

In [None]:
# Load the dataset description used for the Kriti experiments. This rebinds
# `dataset`, which an earlier cell loaded from the home directory. The context
# manager closes the file right after parsing instead of leaking the handle.
with open('/homedtic/ssenturk/experiments/20-raagas/dataset.json') as dataset_file:
    dataset = json.load(dataset_file)

In [None]:
from unidecode import unidecode

### All MBIDS

In [None]:
# Recording id -> raaga name, with the name transliterated by unidecode.
# dict() keeps the last value for a repeated key, exactly like repeated assignment.
mbid_raagas = dict(
    (rec['mbid'], unidecode(rdata['name']))
    for rdata in dataset.values()
    for wdata in rdata['works'].values()
    for rec in wdata['recordings']
)

In [None]:
path = '/homedtic/ssenturk/experiments/20-raagas/features/phrase_aligned/'
# Recording ids: the names of the parameter files without their
# '-params.pickle' suffix and without the directory part.
suffix_length = len('-params.pickle')
mbids = [basename(i)[:-suffix_length] for i in glob('{0}/*-params.pickle'.format(path))]

In [None]:
# Export the phrase-aligned parameters of all recordings to one ARFF file.
toWeka(path, mbids, mbid_raagas)

In [None]:
# Export the context-based parameters for the same recording ids.
# NOTE(review): mbids was collected from the phrase_aligned directory; toWeka
# opens '<mbid>-params.pickle' for each id, so every id needs such a file in
# this directory too -- confirm.
path = '/homedtic/ssenturk/experiments/20-raagas/features/context_based/'
toWeka(path, mbids, mbid_raagas)

### Selected MBIDS

In [None]:
# Number of recordings for which a parameter file was found.
len(mbids)

In [None]:
# Group the recording ids by raaga name: raaga_mbids[raaga] -> list of ids.
raaga_mbids = {}
for mbid in mbids:
    raaga = mbid_raagas[mbid]
    # setdefault creates the list on the first id of a raaga.
    raaga_mbids.setdefault(raaga, []).append(mbid)

In [None]:
# Keep only the recordings of raagas that have at least n_thresh recordings.
n_thresh = 5
selected_mbids = [selected
                  for r, rmbids in raaga_mbids.items()
                  if len(rmbids) >= n_thresh
                  for selected in rmbids]

In [None]:
# Store the selected recording ids as plain text.
savetxt('/homedtic/gkoduri/ismir-mbids.txt', selected_mbids, fmt='%s', delimiter='\n')

In [None]:
# Export the phrase-aligned parameters again, restricted to the selected recordings.
# This writes to the same ARFF file as the export of all recordings above.
path = '/homedtic/ssenturk/experiments/20-raagas/features/phrase_aligned/'
toWeka(path, selected_mbids, mbid_raagas)

In [None]:
# Export the context-based parameters again, restricted to the selected recordings.
# This writes to the same ARFF file as the export of all recordings above.
path = '/homedtic/ssenturk/experiments/20-raagas/features/context_based/'
toWeka(path, selected_mbids, mbid_raagas)