# Model Evaluation using Combination of best Features

In [1]:
#import necessary libraries
from mido import MidiFile, MidiTrack, Message
from mido.midifiles.meta import MetaMessage
from scipy.spatial import distance
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
import mido
import os
import math
import numpy as np
import pandas as pd
import math
from sklearn import metrics
import statistics
import random
import librosa
from music21 import converter, corpus, instrument, midi, note, chord, pitch, stream, tempo
import os
import itertools
import csv

# Data Extraction from model evaluation dataset I

In [2]:
#read midis 
os.getcwd()
files_port = []
os.chdir("/home/girija/Documents/Project_Evaluation/Port/")
for root, dirs, files in os.walk("."):
    for file in files:
        files_port.append(mido.MidiFile(file))

In [3]:
#get duration/ note-type
duration_port = []
note_port = []
for mid in files_port:
    d = []
    n = []
    delta = 0
    for track in mid.tracks:
        for message in track:
            if not isinstance(message, MetaMessage):
                d.append(message.time)
                n.append(message.type)
    duration_port.append(d[6:])
    note_port.append(n[6:])

In [4]:
#get pitch value, duration and note type
pitch_port = []
for mid in files_port:
    p = []
    for track in mid.tracks:
        for message in track:
            message = str(message)
            lines = message.splitlines()
            for line in lines:
                if('note_off' in line or 'note_on' in line):
                    props = line.split(' ')
                    p.append(int(props[3].split('=')[1]))
    pitch_port.append(p)

<h3>Get Binary Values for duration

In [5]:
l1 = []
for duration in duration_port:
    size = np.sum(np.asarray(duration))
    l1.append(size)
    
minimum_bin = min(l1)

In [6]:
port_binary = []
for item in duration_port:
    tmp = np.zeros(minimum_bin)
    i = 0
    for duration in item:
        if(duration==1 and i<minimum_bin):
                tmp[i] = 1
        else:
            tmp[i+1:duration+1] = 0
        i+=duration
    port_binary.append(list(tmp))

<h3>Get pitch values (compared with average)

In [7]:
#Find minimum length 
l = []
idx = []
for mid in pitch_port:
    l.append(len(mid))

#This will be the final minimum value to which the tunes will be cropped irrespective of their genre.
minimum_pitch = min(l)

In [8]:
#crop the notes to minimum length 
input_data_port = []
for mid in pitch_port:
    input_data_port.append(mid[0:minimum_pitch])


In [9]:
#find vector of pitch values considering difference with average
diff_notes2p = []
for item in input_data_port:
    avg = statistics.mean(item)
    diff = []
    for note in item:
        diff.append(note-avg)
    diff_notes2p.append(diff)

In [10]:
#add padding so that size of vector is same as duration vector
p = minimum_bin - minimum_pitch
padding = list(np.zeros(p))

In [11]:
for i in range(len(diff_notes2p)):
    diff_notes2p[i] = diff_notes2p[i]+padding

<h3>Get beats

In [12]:
#Read files
os.getcwd()
files_port = []
os.chdir("/home/girija/Documents/Project_Evaluation/Port_wav/")
for root, dirs, files in os.walk("."):
    for file in files:
        files_port.append(file)
        
beats_port = []
for file in files_port:
    y, sr = librosa.load(file)
    tempo, beat = librosa.beat.beat_track(y, sr)
    beats_port.append(beat)

In [13]:
#find minimum length
l = []
idx = []
for mid in beats_port:
    l.append(len(mid))

minimum_beats = min(l)

In [14]:
#crop to minimum
beats_p = []
for item in beats_port:
    beats_p.append(list(item[:minimum_beats]))

In [15]:
#padding as per duration vector
p = minimum_bin - minimum_beats
padding = list(np.zeros(p))
for i in range(len(beats_p)):
    beats_p[i] = beats_p[i]+padding

In [16]:
#create Datapoints:
datasets_port = []
for i in range(len(port_binary)):
    datasets_port.append(np.asarray([port_binary[i],diff_notes2p[i],beats_p[i]]))

# K-Means Clustering

In [17]:
def KMeansClustering(dataset, k, max_itr):
    #create an empty list of centroids
    centroids = []
    #select first 'k' datapoints as the centroids
    for i in range(k):
        centroids.append(dataset[i])
    
    for itr in range(max_itr):
        clusters = []
        #find distance between a data-point and each centroid
        for point in dataset:
            distances = [np.linalg.norm(point-centroid) for centroid in centroids]
            #select centroid at minimum distance
            clusters.append(distances.index(min(distances)))

        #empty dictionary to store data-points as per cluster numbers    
        data = {}
        for i in range(k):
            data[i] = []
        
        #append data-points to the list based on cluster number    
        for i in range(len(dataset)):
            data[clusters[i]].append(dataset[i])
        
        #update centroids based on mean of data-points in the cluster    
        centroids = []
        for point in data:
            centroids.append(np.mean(data[point],axis = 0))
            
    return(clusters)

# Evaluation using Silhouette Analysis

In [18]:
#Evaluation using silhouette analysis
def silhouette_analysis(datasets,k, clusters):
    a = []
    intra_cluster = []
    for i in range(k):
        tmp = []
        #find tunes belonging to a particular cluster
        for j in range(len(datasets)):
            if(i==clusters[j]):
                tmp.append(j)
        intra_cluster.append(tmp)
    
    #find intra-cluster distances of all data-points
    for cluster in intra_cluster:
        for i in range(len(cluster)):
            other_points = cluster[:i]+cluster[i+1:]
            dist = []
            flag = 0
            for point in other_points:
                if(len(other_points)>1):
                    dist.append(np.linalg.norm(np.asarray(datasets[point])-np.asarray(datasets[cluster[i]])))
                else:
                    dist.append(np.mean(datasets[point]))
            a.append(np.mean(dist))
    
    #find inter-cluster distances of all data-points
    inter_clusters = []
    for i in range(len(intra_cluster)):
        inter_clusters.append(list(itertools.chain.from_iterable(intra_cluster[:i]+intra_cluster[i+1:])))
    
    b = []
    for i in range(len(intra_cluster)):
        for j in range(len(inter_clusters)):
            if(i==j):
                for given_point in intra_cluster[i]:
                    dist = []
                    for other_point in inter_clusters[j]:
                        dist.append(np.linalg.norm(np.asarray(datasets[given_point])-np.asarray(datasets[other_point])))
                    b.append(min(dist))
    
    #find silhouette co-efficient of each point
    result = []
    for i in range(len(a)):
        for j in range(len(b)):
            if(i==j):
                res = (b[i]-a[i])/(max(a[i],b[i]))
                if(not math.isnan(res)):
                    result.append(res)
    return(statistics.mean(result))

# Result on Model Evaluation Dataset I using K-Means Clustering

In [19]:
total = 0
for i in range(1):
    clusters = KMeansClustering(datasets_port, 7, 10)
    total+=silhouette_analysis(datasets_port,7, clusters)

In [20]:
print("Result on Port Data using KMeans:",total)

Result on Port Data using KMeans: 0.22460686246807102


# Weighted K-Means Clustering

In [21]:
def WeightedKMeansClustering(dataset, k, max_itr):
    #create an empty list of centroids
    centroids = []
    #select first 'k' datapoints as the centroids
    for i in range(k):
        centroids.append(dataset[i])
    
    for itr in range(max_itr):
        clusters = []
        
        #find distance between a data-point and each centroid
        for point in dataset:
            distances = []
            for centroid in centroids:
                #assign weights to distances
                dist1 = np.linalg.norm(point[0]-centroid[0])*0.01
                dist2 = np.linalg.norm(point[1]-centroid[1])*0.19
                dist3 = np.linalg.norm(point[2]-centroid[2])*0.80
                distances.append(dist1+dist2+dist3)
            #select centroid at minimum distance
            clusters.append(distances.index(min(distances)))

        #empty dictionary to store data-points as per cluster numbers    
        data = {}
        for i in range(k):
            data[i] = []
        
        #append data-points to the list based on cluster number    
        for i in range(len(dataset)):
            data[clusters[i]].append(dataset[i])
        
        #update centroids based on mean of data-points in the cluster    
        centroids = []
        for point in data:
            centroids.append(np.mean(data[point],axis = 0))
            
    return(clusters)

# Result on Model Evaluation Dataset I using Weighted K-Means

In [22]:
total = 0
for i in range(1):
    clusters = WeightedKMeansClustering(datasets_port, 7, 200)
    total+=silhouette_analysis(datasets_port,7, clusters)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [23]:
print("Result on Port Data using Weighted KMeans:",total)

Result on Port Data using Weighted KMeans: 0.21871773421498075


# Data extraction from Model Evaluation Dataset II

In [24]:
#read midis for session
os.getcwd()
files_session = []
os.chdir("/home/girija/Documents/Project_Evaluation/session/")
for root, dirs, files in os.walk("."):
    for file in files:
        files_session.append(mido.MidiFile(file))

In [25]:
#get duration/ note-type
pitch_session = []
duration_session = []
note_session = []
for mid in files_session:
    p = []
    d = []
    n = []
    delta = 0
    for track in mid.tracks:
        for message in track:
            if not isinstance(message, MetaMessage):
                p.append(message.note)
                d.append(message.time)
                n.append(message.type)
    duration_session.append(d)
    note_session.append(n)
    pitch_session.append(p)

In [26]:
#get minimum length
l1 = []
for duration in duration_session:
    size = np.sum(np.asarray(duration))
    l1.append(size)
    
minimum_bin = min(l1)

In [27]:
#get duration in binary
session_binary = []
for item in duration_session:
    tmp = np.zeros(minimum_bin)
    i = 0
    for duration in item:
        if(duration==1 and i<minimum_bin):
                tmp[i] = 1
        else:
            tmp[i+1:duration+1] = 0
        i+=duration
    session_binary.append(list(tmp))

In [28]:
#Find minimum length
l = []
idx = []
for mid in pitch_session:
    l.append(len(mid))

#This will be the final minimum value to which the tunes will be cropped irrespective of their genre.
minimum_pitch = min(l)

In [29]:
#crop the notes to minimum length 
input_data_session = []
for mid in pitch_session:
    input_data_session.append(mid[0:minimum_pitch])

In [30]:
#pitch-vector considering difference with average value
diff_notes2s = []
for item in input_data_session:
    avg = statistics.mean(item)
    diff = []
    for note in item:
        diff.append(note-avg)
    diff_notes2s.append(diff)

In [31]:
#padding as per duration vector
p = minimum_bin - minimum_pitch
padding = list(np.zeros(p))

In [32]:
for i in range(len(diff_notes2s)):
    diff_notes2s[i] = diff_notes2s[i]+padding

In [33]:
#Read all WAV files
os.getcwd()
files_session = []
os.chdir("/home/girija/Documents/Project_Evaluation/session_wav/")
for root, dirs, files in os.walk("."):
    for file in files:
        files_session.append(file)
        
beats_session = []
for file in files_session:
    y, sr = librosa.load(file)
    tempo, beat = librosa.beat.beat_track(y, sr)
    beats_session.append(beat)

In [34]:
#find minimum lengths
l = []
idx = []
for mid in beats_session:
    l.append(len(mid))

minimum_beats = min(l)

In [35]:
#crop to minimum
beats_s = []
for item in beats_session:
    beats_s.append(list(item[:minimum_beats]))

In [36]:
#padding
p = minimum_bin - minimum_beats
padding = list(np.zeros(p))
for i in range(len(beats_s)):
    beats_s[i] = beats_s[i]+padding

In [37]:
#create Datapoints:
datasets_session = []
for i in range(len(session_binary)):
    datasets_session.append(np.asarray([session_binary[i],diff_notes2s[i],beats_s[i]]))

# Result on Model Evaluation Dataset II using K-Means Clustering

In [38]:
total = 0
for i in range(1):
    clusters = KMeansClustering(datasets_session, 7, 100)
    total+=silhouette_analysis(datasets_session,7, clusters)

In [39]:
print("Result on session data using KMeans:",total)

Result on session data using KMeans: 0.2393260143167816


In [40]:
def WeightedKMeansClustering(dataset, k, max_itr):
    #create an empty list of centroids
    centroids = []
    #select first 'k' datapoints as the centroids
    for i in range(k):
        centroids.append(dataset[i])
    
    for itr in range(max_itr):
        clusters = []
        
        #find distance between a data-point and each centroid
        for point in dataset:
            distances = []
            for centroid in centroids:
                #assign weights to distances
                dist1 = np.linalg.norm(point[0]-centroid[0])*0.33
                dist2 = np.linalg.norm(point[1]-centroid[1])*0.33
                dist3 = np.linalg.norm(point[2]-centroid[2])*0.34
                distances.append(dist1+dist2+dist3)
            #select centroid at minimum distance
            clusters.append(distances.index(min(distances)))

        #empty dictionary to store data-points as per cluster numbers    
        data = {}
        for i in range(k):
            data[i] = []
        
        #append data-points to the list based on cluster number    
        for i in range(len(dataset)):
            data[clusters[i]].append(dataset[i])
        
        #update centroids based on mean of data-points in the cluster    
        centroids = []
        for point in data:
            centroids.append(np.mean(data[point],axis = 0))
            
    return(clusters)

# Result on Model Evaluation Dataset II using Weighted K-Means Clustering

In [41]:
total = 0
for i in range(1):
    clusters = WeightedKMeansClustering(datasets_session, 7, 200)
    total+=silhouette_analysis(datasets_session,7, clusters)

In [42]:
print("Result on Session Data using Weighted KMeans:",total)

Result on Session Data using Weighted KMeans: 0.3164296509359607
