In [1]:
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
path="/home/ftamagnan/dataset/"
name="total_metadata_training.npz"
data=dict(np.load(path+name))

data["random"]= np.random.rand(6729,3)


data['vae_embeddings']=data['vae_embeddings'][:,0:32]
for key in data.keys():
    print(data[key].shape,key)

(6729, 3) random
(6729, 12) genre
(6729, 27) velocity_metadata
(6729, 1) bpm
(6729, 32) vae_embeddings
(6729, 9) drums_pitches_used
(6729, 2, 1) fills
(6729, 2) dataset
(6729, 1) offbeat_notes


In [3]:
def feature_selection(scaler=True,cv=True,list_list_label=[],penalty='l2',stats=True):

    
    
    
    for list_label in list_list_label:
            list_x=[]
            for key in data.keys():
                if key in list_label:
                    list_x.append(data[key])
            print(x.shape for x in list_x)
            X=np.concatenate(list_x,axis=1)
            y=data['fills'][:,1].reshape(-1)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
            if scaler:
                scaler = StandardScaler()
                scaler.fit(X_train)
                X_train=scaler.transform(X_train)
                X_test=scaler.transform(X_test)
            
            if cv:
                clf = LogisticRegressionCV(cv=2, random_state=0,
                                   multi_class='ovr',penalty=penalty,solver='liblinear',max_iter=300,n_jobs=-1).fit(X_train, y_train)
            else:
                clf = LogisticRegression(random_state=0,C=100000000).fit(X_train, y_train)

            y_pred=clf.predict(X_test)
            tn, fp, fn, tp=confusion_matrix(y_test, y_pred).ravel()
            if stats:
                print("__________Features used : "+str(list_label)+"_______")
                print("tn,fp,fn,tp = ",tn,fp,fn,tp)
                print("Accuracy = ",(tp+tn)/(tn+fp+fn+tp))
                print("Recall = ",(tp)/(fn+tp))
                print("Precision = ",(tp)/(fp+tp))

    return clf,scaler



# 1.Feature selection

In [4]:
list_list_label=[['vae_embeddings','offbeat_notes','drums_pitches_used','velocity_metadata'],
               ['offbeat_notes','drums_pitches_used','velocity_metadata'],
               ['vae_embeddings'],
               ['offbeat_notes'],
               ['drums_pitches_used'],
               ['velocity_metadata'],
                 ['drums_pitches_used','velocity_metadata'],
                 ['random']
]

clf=feature_selection(scaler=True,cv=True,list_list_label=list_list_label)

<generator object feature_selection.<locals>.<genexpr> at 0x7fb75c410d00>
__________Features used : ['vae_embeddings', 'offbeat_notes', 'drums_pitches_used', 'velocity_metadata']_______
tn,fp,fn,tp =  1578 37 68 336
Accuracy =  0.9479940564635958
Recall =  0.8316831683168316
Precision =  0.900804289544236
<generator object feature_selection.<locals>.<genexpr> at 0x7fb75c410d00>
__________Features used : ['offbeat_notes', 'drums_pitches_used', 'velocity_metadata']_______
tn,fp,fn,tp =  1558 57 72 332
Accuracy =  0.9361069836552749
Recall =  0.8217821782178217
Precision =  0.8534704370179949
<generator object feature_selection.<locals>.<genexpr> at 0x7fb75c410d00>
__________Features used : ['vae_embeddings']_______
tn,fp,fn,tp =  1598 17 381 23
Accuracy =  0.8028727092620109
Recall =  0.05693069306930693
Precision =  0.575
<generator object feature_selection.<locals>.<genexpr> at 0x7fb75c410d00>
__________Features used : ['offbeat_notes']_______
tn,fp,fn,tp =  1614 1 404 0
Accuracy =  0.



In [5]:
list_list_label=[['vae_embeddings','offbeat_notes','velocity_metadata','drums_pitches_used']]
             

clf_l1,scaler=feature_selection(scaler=True,cv=True,list_list_label=list_list_label,penalty='l1',stats=False)
clf_l2,sclaler=feature_selection(scaler=True,cv=True,list_list_label=list_list_label,penalty='l2',stats=False)

<generator object feature_selection.<locals>.<genexpr> at 0x7fb702f1ae08>
<generator object feature_selection.<locals>.<genexpr> at 0x7fb702f1adb0>


In [6]:
name_pitches = ['bass drum','snare drum','closed hi-hat','open hi-hat','low tom','mid tom','high tom','crash cymbal','ride cymbal']
name_features=['max_velocity','std_velocity','mean_velocity']

In [7]:
def stats_weights(clf):
    coef=clf.coef_
    coef=coef.reshape(-1)
    print("------VAE EMBEDDINGS-------")
    print(coef[0:32])

    print("------OFFBEATS NOTES-------")
    print(coef[32])

    print("------VELOCITY METRICS-------")
    
    for j,metric in enumerate(name_features):
        for i,pitch in enumerate(name_pitches):
            print(metric+' of '+pitch,coef[33+i+j])
        
    print("------PITCHES USED-------")
    for i,pitch in enumerate(name_pitches):
        print('use of '+pitch,coef[33+27+i])

# 2. Magnitude of weights with L1 reg

In [8]:
stats_weights(clf_l1)

------VAE EMBEDDINGS-------
[ 2.76942522  1.13851913  0.89440939  1.0427611  -1.89837645  5.10677234
  5.37745336  0.49847288  1.96436244 -1.46824549  0.21731868 -2.01756767
 -0.59754256  0.40211944 -0.89896712 -0.69948948  0.33382477  1.42142568
 -0.49768597  2.3556491   0.21348103 -0.21393681  1.15955125  0.08116335
 -0.0436974  -0.18668932 -1.54592114  0.58301164  0.90139587  0.60025448
  1.05313109 -0.88254269]
------OFFBEATS NOTES-------
-2.253422282489577
------VELOCITY METRICS-------
max_velocity of bass drum -0.15964865707473438
max_velocity of snare drum -1.1766276473693706
max_velocity of closed hi-hat 0.21142807837965952
max_velocity of open hi-hat 0.8215586699084011
max_velocity of low tom 0.005158656662927975
max_velocity of mid tom 1.5139960862663175
max_velocity of high tom -0.09952471331962069
max_velocity of crash cymbal -0.5906503345124842
max_velocity of ride cymbal -0.27326641614583397
std_velocity of bass drum -1.1766276473693706
std_velocity of snare drum 0.211428

# 3.Magnitude of weights with L2 reg

In [9]:
stats_weights(clf_l2)

------VAE EMBEDDINGS-------
[ 2.76618267  1.13964093  0.89643796  1.04281582 -1.89736036  5.09162585
  5.40346353  0.49869041  1.95736242 -1.46735169  0.21726742 -2.01971186
 -0.59548679  0.40230945 -0.89754724 -0.71839864  0.33757268  1.41789822
 -0.49783129  2.35600136  0.21454839 -0.21476633  1.15956504  0.08166183
 -0.03574484 -0.18820595 -1.54330845  0.58282973  0.89939565  0.59948761
  1.05152925 -0.88213021]
------OFFBEATS NOTES-------
-2.2511989864090975
------VELOCITY METRICS-------
max_velocity of bass drum -0.15988603079816505
max_velocity of snare drum -1.1757934957411653
max_velocity of closed hi-hat 0.21102873778572406
max_velocity of open hi-hat 0.8230040893451946
max_velocity of low tom 0.004801844043177905
max_velocity of mid tom 1.5127016317985764
max_velocity of high tom -0.09915170775166497
max_velocity of crash cymbal -0.5907916751734752
max_velocity of ride cymbal -0.2738598279537835
std_velocity of bass drum -1.1757934957411653
std_velocity of snare drum 0.211028

In [10]:
from sklearn.externals import joblib
s = joblib.dump(clf_l2,'clf_fills.pkl')
