In [126]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import plot_confusion_matrix

from collections import defaultdict

# Data Preparation

In [87]:
df = pd.read_csv('/Users/samuele/University/2.DM2/fma_metadata/echo_no_outlier.csv')
df.head()

Unnamed: 0,track_id,acousticness,danceability,energy,instrumentalness,liveness,speechiness,valence,tempo,duration,bit_rate
0,144,0.909011,0.443643,0.641997,0.924092,0.267669,0.089659,0.788251,128.537,82,256000
1,145,0.235506,0.438672,0.487752,0.716122,0.070359,0.047298,0.650452,120.79,326,256000
2,146,0.532019,0.417681,0.476422,0.4025,0.172105,0.035361,0.682397,135.468,354,256000
3,147,0.77841,0.706681,0.866116,0.806703,0.10465,0.065083,0.917613,120.218,232,256000
4,153,0.988306,0.255661,0.979774,0.973006,0.121342,0.05174,0.034018,90.241,405,256000


In [88]:
genre = pd.read_csv('/Users/samuele/University/2.DM2/fma_metadata/labels_genre_top.csv')
df = df.merge(genre, on='track_id')
df.set_index('track_id', inplace=True)
df.head()

Unnamed: 0_level_0,acousticness,danceability,energy,instrumentalness,liveness,speechiness,valence,tempo,duration,bit_rate,genre_top
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
144,0.909011,0.443643,0.641997,0.924092,0.267669,0.089659,0.788251,128.537,82,256000,Jazz
145,0.235506,0.438672,0.487752,0.716122,0.070359,0.047298,0.650452,120.79,326,256000,Jazz
146,0.532019,0.417681,0.476422,0.4025,0.172105,0.035361,0.682397,135.468,354,256000,Jazz
147,0.77841,0.706681,0.866116,0.806703,0.10465,0.065083,0.917613,120.218,232,256000,Jazz
153,0.988306,0.255661,0.979774,0.973006,0.121342,0.05174,0.034018,90.241,405,256000,Rock


In [77]:
# encoding labels
le = preprocessing.LabelEncoder()
df['genre_top'] = le.fit_transform(df['genre_top'])
print('legend:\n0: Jazz\n1: Rock')
df.genre_top.value_counts()

legend:
0: Jazz
1: Rock


1    3891
0     241
Name: genre_top, dtype: int64

In [89]:
df.head()

Unnamed: 0_level_0,acousticness,danceability,energy,instrumentalness,liveness,speechiness,valence,tempo,duration,bit_rate,genre_top
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
144,0.909011,0.443643,0.641997,0.924092,0.267669,0.089659,0.788251,128.537,82,256000,Jazz
145,0.235506,0.438672,0.487752,0.716122,0.070359,0.047298,0.650452,120.79,326,256000,Jazz
146,0.532019,0.417681,0.476422,0.4025,0.172105,0.035361,0.682397,135.468,354,256000,Jazz
147,0.77841,0.706681,0.866116,0.806703,0.10465,0.065083,0.917613,120.218,232,256000,Jazz
153,0.988306,0.255661,0.979774,0.973006,0.121342,0.05174,0.034018,90.241,405,256000,Rock


# Data Partitioning

In [90]:
attributes = [col for col in df.columns if col != 'genre_top']
X = df[attributes].values
y = df['genre_top']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)

In [91]:
df['genre_top'].value_counts()

Rock    3891
Jazz     241
Name: genre_top, dtype: int64

# RIPPER

https://github.com/imoscovitz/wittgenstein

In [92]:
import wittgenstein as lw

In [93]:
ripper_clf = lw.RIPPER()

In [103]:
ripper_clf.fit(X_train, y_train, pos_class='Jazz')

In [131]:
help(ripper_clf)

Help on RIPPER in module wittgenstein.ripper object:

class RIPPER(wittgenstein.abstract_ruleset_classifier.AbstractRulesetClassifier)
 |  RIPPER(k=2, dl_allowance=64, prune_size=0.33, n_discretize_bins=10, max_rules=None, max_rule_conds=None, max_total_conds=None, random_state=None, verbosity=0)
 |  
 |  Class for generating ruleset classification models.
 |  See Cohen (1995): https://www.let.rug.nl/nerbonne/teach/learning/cohen95fast.pdf
 |  
 |  Method resolution order:
 |      RIPPER
 |      wittgenstein.abstract_ruleset_classifier.AbstractRulesetClassifier
 |      abc.ABC
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, k=2, dl_allowance=64, prune_size=0.33, n_discretize_bins=10, max_rules=None, max_rule_conds=None, max_total_conds=None, random_state=None, verbosity=0)
 |      Create a RIPPER classifier.
 |      
 |      Parameters
 |      ----------
 |      k : int, default=2
 |          Number of RIPPERk optimization iterations.
 |      prune_size :

In [144]:
prune = [.33, .5, .75]
ks = [1, 2, 3]

for ps in prune: 
    for num_k in ks: 
        ripper_clf = lw.RIPPER(k=num_k, prune_size=ps)
        ripper_clf.fit(X_train, y_train, pos_class='Jazz')
        print("RIPPER with ", num_k, " ks and ", ps, "prune size \n score: ", ripper_clf.score(X_test, y_test))
        

RIPPER with  1  ks and  0.33 prune size 
 score:  0.9370967741935484
RIPPER with  2  ks and  0.33 prune size 
 score:  0.932258064516129
RIPPER with  3  ks and  0.33 prune size 
 score:  0.9370967741935484
RIPPER with  1  ks and  0.5 prune size 
 score:  0.9370967741935484
RIPPER with  2  ks and  0.5 prune size 
 score:  0.9411290322580645
RIPPER with  3  ks and  0.5 prune size 
 score:  0.9298387096774193
RIPPER with  1  ks and  0.75 prune size 
 score:  0.9362903225806452
RIPPER with  2  ks and  0.75 prune size 
 score:  0.932258064516129
RIPPER with  3  ks and  0.75 prune size 
 score:  0.9362903225806452


In [104]:
ripper_clf

<RIPPER(verbosity=0, k=2, random_state=None, n_discretize_bins=10, max_total_conds=None, max_rule_conds=None, dl_allowance=64, max_rules=None, prune_size=0.33)>

In [106]:
ripper_clf.score(X_test, y_test)

0.9403225806451613

In [145]:
ripper_clf = lw.RIPPER(k=2, prune_size=.5)
ripper_clf.fit(X_train, y_train, pos_class='Jazz', feature_names=attributes)
ripper_clf.out_model()

[[energy=0.0-0.22 ^ duration=314.0-415.0 ^ acousticness=0.93-0.97] V
[energy=0.0-0.22 ^ acousticness=0.97-1.0 ^ bit_rate=256000.0-320000.0 ^ danceability=0.3-0.34] V
[energy=0.0-0.22 ^ acousticness=0.97-1.0 ^ liveness=0.1-0.11 ^ tempo=33.61-85.31] V
[energy=0.0-0.22 ^ valence=0.01-0.07 ^ danceability=0.3-0.34] V
[duration=415.0-2940.0 ^ speechiness=0.11-0.96 ^ instrumentalness=0.28-0.62] V
[energy=0.0-0.22 ^ bit_rate=256000.0-320000.0 ^ acousticness=0.97-1.0 ^ instrumentalness=0.95-0.99] V
[energy=0.0-0.22 ^ instrumentalness=0.89-0.91 ^ tempo=119.13-127.24] V
[duration=415.0-2940.0 ^ energy=0.22-0.39 ^ acousticness=0.97-1.0] V
[energy=0.0-0.22 ^ instrumentalness=0.89-0.91 ^ valence=0.01-0.07] V
[duration=415.0-2940.0 ^ energy=0.39-0.49 ^ valence=0.23-0.31] V
[energy=0.0-0.22 ^ duration=314.0-415.0 ^ tempo=96.37-108.06] V
[duration=415.0-2940.0 ^ energy=0.39-0.49 ^ valence=0.31-0.39]]
