In [1]:
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import glob
import os.path
import pretty_midi
import pickle
import random

### Evaluate system performance

In [3]:
def readGroundTruthLabels(gtfile):
    d={}
    d1={}
    with open(gtfile, "r") as f:
        for line in f:
            data = line.split(',')
            key = data[0]
            d[key] = []
            d1[key] = key
            for idx,item in enumerate(data):
                if idx != 0 and item != "x":
                    try:
                        pieceNum = int(item)
                        d[key].append(pieceNum)
                    except:
                        pass
                    try:
                        pieceNum = int(item)
                        d1[str(pieceNum)] = key
                    except:
                        pass
    return d, d1

In [4]:
def readHypothesisFiles(hypdir, benchmark):
    l = []
    if benchmark == 1:
        for hypfile in sorted(glob.glob("{}/*.pkl".format(hypdir))):
            with open(hypfile, "rb") as f:
                l.append([os.path.splitext(os.path.basename(hypfile))[0],pickle.load(f)])
    elif benchmark == 0 or benchmark == 2:
        for hypfile in sorted(glob.glob("{}/*.hyp".format(hypdir))):
            print(hypfile)
            with open(hypfile, "rb") as f:
                l.append(pickle.load(f))
    elif benchmark == 3:
        for hypfile in sorted(glob.glob("{}/*.hyp".format(hypdir))):
            print(hypfile)
            with open(hypfile, "rb") as f:
                data = pickle.load(f)
                pieceScores = [(x[1],x[0]) for x in data[1]]
                l.append((data[0],pieceScores))
    return l

In [5]:
s = set([])
def collapseIds(pieceScores):
    pieceScores = list(dict.fromkeys(pieceScores))
    return pieceScores
        

In [6]:
def getRank(pieceScores,gt,idt,queryid,benchmark, condition):
    query = queryid.split('_')[0]
    l = []
    count = 0
    if benchmark:
        pieceScores = collapseIds(pieceScores)
    rank = 300000
    pklfile = 'piece_to_num.pkl'
    with open (pklfile,'rb')as f:
        piece_to_num = pickle.load(f)
    for i in range(len(pieceScores)):
        if benchmark == 1 or benchmark == 2:
            num = pieceScores[i].split("_")[0]
            if 'p' in num:
                if condition == 2:
                    continue
                if query.split("_")[0] == num:
                    rank = count+1
                    return rank
            elif int(num) in gt[query]:
                rank = count+1
                return rank
            count = count+1            
        else:
            pieceName = piece_to_num[pieceScores[i][0]]
            if pieceName[0]=='p':
                if condition == 2:
                        continue
                if(pieceName==queryid.split('_')[0]):
                    rank=count+1
                    return rank
                if not pieceName in idt.keys():
                    count+=1
                elif not idt[pieceName] in l:
                    count+=1
                    l.append(idt[idt[pieceName]])
            else:
                try:
                    if (int(pieceScores[i][0].split("**")[-1]) in gt[query]):
                        rank = count+1
                        return rank

                    if not str(int(pieceScores[i][0].split("**")[-1])) in idt.keys():
                        count+=1
                    elif not idt[str(int(pieceScores[i][0].split("**")[-1]))] in l:
                        count+=1
                        l.append(idt[str(int(pieceScores[i][0].split("**")[-1]))])
                except:
                    pass
    return rank

In [7]:
def calcPrecisionRecall(hypdir, gtfile, benchmark = False, condition = 1):
    hyps = readHypothesisFiles(hypdir, benchmark)
    gt,idt = readGroundTruthLabels(gtfile)
    MRR = 0
    runtimes = []
    MRRs = []
    if condition == 2:
        valid = []
        with open('condition2.txt','r') as f:
            for line in f:
                valid.append(line.strip())
                
    hyps = sorted(hyps, key = lambda x: int(x[0].split("_")[0][1:])*100+int(x[0].split("_")[1][1:]))
    count = 0
    if np.shape(hyps)[0] == 2:
        for queryid, pieceScores in hyps:
            if condition == 2:
                if not queryid.split('_')[0].strip() in valid:
                    count+=1
                    continue
            rank = getRank(pieceScores,gt,idt,queryid,benchmark, condition)
            MRRs.append((queryid,rank))
            MRR=MRR+1/(rank)
    else:
        for queryid, pieceScores, runtime in hyps:
            if condition == 2:
                if not queryid.split('_')[0].strip() in valid:
                    count+=1
                    continue
            rank = getRank(pieceScores,gt,idt,queryid,benchmark, condition)
            MRRs.append((queryid,rank))
            runtimes.append(runtime)
            MRR=MRR+1/(rank)
    return MRR/len(MRRs),MRRs, runtimes

In [8]:
def calcOverlap(seg1, seg2):
    overlap_lb = max(seg1[0], seg2[0])
    overlap_ub = min(seg1[1], seg2[1])
    overlap = np.clip(overlap_ub - overlap_lb, 0, None)
    return overlap    

In [9]:
hypdir = 'experiments/Analysis_2_GRAM/train'
condition2 = "condition2.txt"

#0 is normal
#1 is old paper
#2 is new paper
#3 is weird test
benchmark = 0
queryGTFile = 'piece_To_id.csv'
gt, idt = readGroundTruthLabels(queryGTFile)
MRR, MRRs, runtimes = calcPrecisionRecall(hypdir, queryGTFile, benchmark, condition = 1)

experiments/Analysis_2_GRAM/train/p101_q1.hyp
experiments/Analysis_2_GRAM/train/p101_q10.hyp
experiments/Analysis_2_GRAM/train/p101_q2.hyp
experiments/Analysis_2_GRAM/train/p101_q3.hyp
experiments/Analysis_2_GRAM/train/p101_q4.hyp
experiments/Analysis_2_GRAM/train/p101_q5.hyp
experiments/Analysis_2_GRAM/train/p101_q6.hyp
experiments/Analysis_2_GRAM/train/p101_q7.hyp
experiments/Analysis_2_GRAM/train/p101_q8.hyp
experiments/Analysis_2_GRAM/train/p101_q9.hyp
experiments/Analysis_2_GRAM/train/p105_q1.hyp
experiments/Analysis_2_GRAM/train/p105_q10.hyp
experiments/Analysis_2_GRAM/train/p105_q2.hyp
experiments/Analysis_2_GRAM/train/p105_q3.hyp
experiments/Analysis_2_GRAM/train/p105_q4.hyp
experiments/Analysis_2_GRAM/train/p105_q5.hyp
experiments/Analysis_2_GRAM/train/p105_q6.hyp
experiments/Analysis_2_GRAM/train/p105_q7.hyp
experiments/Analysis_2_GRAM/train/p105_q8.hyp
experiments/Analysis_2_GRAM/train/p105_q9.hyp
experiments/Analysis_2_GRAM/train/p111_q1.hyp
experiments/Analysis_2_GRAM/trai

  return array(a, dtype, copy=False, order=order)


In [None]:
keys = list(idt.keys())
print(keys)

In [10]:
MRR

0.9202252044126267

### Investigate Errors

In [11]:
def printDebuggingInfo(MRRs):
    for i, (queryid, rank) in enumerate(MRRs):
        print(queryid, rank)

In [12]:
printDebuggingInfo(MRRs)

p2_q1 1
p2_q2 1
p2_q3 1
p2_q4 1
p2_q5 1
p2_q6 1
p2_q7 1
p2_q8 1
p2_q9 1
p2_q10 1
p3_q1 1
p3_q2 1
p3_q3 1
p3_q4 1
p3_q5 1
p3_q6 1
p3_q7 1
p3_q8 1
p3_q9 1
p3_q10 1
p4_q1 1
p4_q2 1
p4_q3 1
p4_q4 1
p4_q5 1
p4_q6 1
p4_q7 1
p4_q8 1
p4_q9 1
p4_q10 1
p6_q1 1
p6_q2 1
p6_q3 1
p6_q4 1
p6_q5 1
p6_q6 1
p6_q7 1
p6_q8 1
p6_q9 1
p6_q10 1
p7_q1 1
p7_q2 1
p7_q3 1
p7_q4 1
p7_q5 1
p7_q6 1
p7_q7 1
p7_q8 1
p7_q9 1
p7_q10 1
p8_q1 1
p8_q2 1
p8_q3 1
p8_q4 1
p8_q5 1
p8_q6 1
p8_q7 1
p8_q8 1
p8_q9 1
p8_q10 1
p9_q1 1
p9_q2 1
p9_q3 1
p9_q4 1
p9_q5 1
p9_q6 1
p9_q7 1
p9_q8 1
p9_q9 1
p9_q10 1
p10_q1 1
p10_q2 1
p10_q3 1
p10_q4 1
p10_q5 1
p10_q6 1
p10_q7 1
p10_q8 1
p10_q9 1
p10_q10 1
p12_q1 1
p12_q2 1
p12_q3 1
p12_q4 1
p12_q5 1
p12_q6 1
p12_q7 1
p12_q8 1
p12_q9 1
p12_q10 300000
p13_q1 1
p13_q2 1
p13_q3 1
p13_q4 1
p13_q5 1
p13_q6 1
p13_q7 1
p13_q8 1
p13_q9 1
p13_q10 1
p14_q1 1
p14_q2 1
p14_q3 300000
p14_q4 1
p14_q5 300000
p14_q6 1
p14_q7 2
p14_q8 1
p14_q9 1
p14_q10 1
p16_q1 1
p16_q2 1
p16_q3 1
p16_q4 1
p16_q5 1
p16_q6 1


### Measure Runtime

In [11]:
def showRuntimeStats(durs):
    durs = []
    durs = np.array(durs)
    avgDur = np.mean(durs)
    minDur = np.min(durs)
    maxDur = np.max(durs)
    stdDur = np.std(durs)
    print('Avg Duration: {:.2f} sec'.format(avgDur))
    print('Std Duration: {:.2f} sec'.format(stdDur))
    print('Min Duration: {:.2f} sec'.format(minDur))
    print('Max Duration: {:.2f} sec'.format(maxDur))
    plt.hist(durs, bins=np.arange(0,2,.1))
    plt.xlabel('Runtime (sec)')
    plt.ylabel('Count')

In [11]:
print(np.mean(runtimes))

3.4768030458688735


In [12]:
showRuntimeStats(runtimes)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


ValueError: zero-size array to reduction operation minimum which has no identity

In [None]:
print(sorted(glob.glob("*.jpg")))

In [None]:
import os
list = ""
dir = 'data/queries'
for i in range(1,201):
    for j in range(1,11):
        list+=dir+'/p'+str(i)+'_q'+str(j)+'.jpg\n'
with open('cfg_files/query.total.list','w')as f:
    f.write(list)

## 新的测试程序

In [None]:
from Singel_Query import *

In [None]:
def getAverage(list):
    sum = 0
    for number in list:
        sum+=number
    return sum/len(list)

In [None]:
testlist = 'cfg_files/query.train.list'
pickle_file = 'experiments/indices/Dynamic_N_GRAM_ALL(2).pkl'
piecenumfile = 'piece_to_num.pkl'

with open(piecenumfile,'rb')as f:
    piece_to_num=pickle.load(f)

with open(pickle_file,'rb')as f:
    rindex = pickle.load(f)



In [None]:
MRRs = []
Runtimes = []
with open(testlist,'r') as f:
    for curfile in f:
        curfile = curfile.strip().strip('\n')
        pieceScores, runTime =  processSingleQuery(curfile,rindex,"Dynamic_Static")
        grade = 0
        i = 1
        curfile = curfile.split('/')[-1]
        for pieceScore in pieceScores:
            if pieceScore[1] != grade:
                grade = pieceScore[1]
                rank = i
            if piece_to_num[pieceScore[0]]== curfile[0:curfile.index('_')]:
                break
            i += 1
        MRR = 1/rank
        MRRs.append(MRR)
        Runtimes.append(runTime)

print("The MRR of test list is ",getAverage(MRRs))
print("The average running time is ",getAverage(Runtimes))

In [None]:
i = 0
with open(testlist,'r')as f:
    with open('log(5).list','w')as outfile:
        for curfile in f:
            outfile.write(curfile.strip('\n')+' '+str(MRRs[i])+'\n')
            i+=1

In [None]:
pieceScores, runTime =  processSingleQuery('data/queries/p2_q1.jpg',rindex,"Dynamic_Static")
rank = 1
grade = 0
i = 1
for pieceScore in pieceScores:
    if pieceScore[1] != grade:
        rank = i
    if piece_to_num[pieceScore[0]]== 'p2':
        break
    i += 1

MRR = 1/rank
print(MRR)
print(len(pieceScores))
for i in range(10):
    print(pieceScores[i])


In [None]:
list = 'log(5).list'
MRRs = []
with open(list,'r')as f:
    for curfile in f:
        MRRs.append(float(curfile.split()[1]))

print(len(MRRs))
print(getAverage(MRRs))

In [None]:
path = "experiments/Analysis/hyp"
pklfile = "piece_to_num.pkl"
ranks = []
with open (pklfile,'rb')as f:
    piece_to_num = pickle.load(f)

for home, dirs, files in os.walk(path):
    for file in files:
        with open(os.path.join(home,file),"rb")as f:
            hypfile = pickle.load(f)
            queryid = hypfile[0]
            pieceScores = hypfile[1]
            rank = 1
            grade = 0
            i =1
            for pieceScore in pieceScores:
                # if pieceScore[1] != grade:
                #     rank = i
                # if piece_to_num[pieceScore[0]]== queryid.split('_')[0]:
                #     break
                # i+=1
                pieceName = piece_to_num[pieceScore[0]]
                if pieceName == queryid.split('_')[0]:
                    break
                else:
                    rank+=1
            ranks.append(rank)

MRR = 0
for rank in ranks:
    MRR+=1/rank
MRR = MRR/len(ranks)
print(MRR)


In [None]:
print(piece_to_num)