<a href="https://colab.research.google.com/github/christoferjulio3/Machine-learning-classification-of-early-twentieth-century-music/blob/main/Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Machine Learning Classification of Early Twentieth-Century Composers' Music Styles
---

---




The code cells below implement our machine-learning classification pipeline, using both a Random Forest ensemble and a Decision Tree classifier.

The features obtained in the previous steps are fed into both classifiers, each evaluated with stratified 5-fold cross-validation.

As evaluation metrics, both the confusion matrix and the classification report (from scikit-learn) are used to understand the misclassifications between classes.

To visualize the results, we use a SHAP summary plot for the Random Forest ensemble and the graphviz package for the Decision Tree.







In [None]:
!pip install shap

In [2]:
from collections import OrderedDict
from sklearn import tree
from sklearn.inspection import permutation_importance
from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics._plot.confusion_matrix import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import pydotplus
import graphviz
import shap
import numpy as np
import io
import pandas as pd



In [3]:
# Upload the feature CSV through Colab.
# The returned object is indexed by file name in the loading cell that follows.

from google.colab import files

uploaded = files.upload()

Saving augmented_features_minmax.csv to augmented_features_minmax.csv


In [4]:
# Load data: decode the uploaded CSV bytes and parse them into a DataFrame.

df = pd.read_csv(io.StringIO(uploaded['augmented_features_minmax.csv'].decode('utf-8')))

# Remove unused classes

df = df[~df['styles'].isin(['Late Romantic', 'Modern'])]

# Randomly sample Nationalism (keep 35% of all Nationalism samples).
# A fixed random_state makes the subsample, and therefore every result
# computed from it, repeatable across runs.

NATIONALISM_SAMPLE_SEED = 42
Nat = df[df['styles'] == 'Nationalism']
df = df[df['styles'] != 'Nationalism']
Nat_random = Nat.sample(frac=0.35, random_state=NATIONALISM_SAMPLE_SEED)

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
df = pd.concat([df, Nat_random])

# reset_index() without drop=True also inserts the old index as a leading
# 'index' column. NOTE(review): the positional slices below were presumably
# chosen with that extra column in place, so this call is left unchanged.
df = df.reset_index()

# Change NaN values with zero (if any)

df = df.fillna(0)

# Dividing the data into features and target:
# drop the first three and the last three columns, keep everything in between.

data = df.iloc[:, 3:len(df.columns) - 3]

# pitch features (the last 5 columns of the 93-column pitch-histogram slice)

pitch_histogram = data.iloc[:, 0:93]
data_pitch = pitch_histogram.iloc[:, 88:]

# horizontal intervals and their variations

horizontal_interval = data.iloc[:, 180:219]
horizontal_interval_measure = data.iloc[:, 232:271]
horizontal_interval_no_skip = data.iloc[:, 180:193]
horizontal_interval_1_skip = data.iloc[:, 193:206]
horizontal_interval_2_skip = data.iloc[:, 206:219]
horizontal_interval_measure_no_skip = data.iloc[:, 232:245]
horizontal_interval_measure_1_skip = data.iloc[:, 245:258]
horizontal_interval_measure_2_skip = data.iloc[:, 258:271]

# vertical intervals

vertical_interval = data.iloc[:, 219:232]

# note density

density = data.iloc[:, 271:291]

# mean and max pooling of first 8 measures

mean_pool = data.iloc[:, -16:-8]
max_pool = data.iloc[:, -8:]

# classes

target = df['styles']

# Feature sets: all features / pitch + horizontal intervals / pitch + vertical intervals

dataset_1 = data
dataset_2 = pd.concat([data_pitch, horizontal_interval, horizontal_interval_measure], axis=1)
dataset_3 = pd.concat([data_pitch, vertical_interval], axis=1)

# Ordinal encoding for classes: `target` becomes an integer code array and
# `class_name` holds the label that belongs to each code.

target, class_name = pd.factorize(target)
target_series = pd.Series(target)

In [None]:
# Decision Tree Classifier with 5-Fold Cross Validation

X = dataset_1
y = target
graphs = []

# Fixed seed so that ties between equally good splits are broken the same way on every run.
model = DecisionTreeClassifier(max_depth=3, random_state=0)

accuracy_trainings = []
accuracy_tests = []

# Per-fold predictions / ground truth, pooled after the loop.
fold_predictions = []
fold_targets = []

# Training data with Stratified 5-fold Cross Validation (StratifiedKFold defaults to 5 splits)

for train_index, test_index in StratifiedKFold().split(X, y):

    # separating training and test set

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # training and predict model

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy_trainings.append(accuracy_score(y_train, model.predict(X_train)))
    accuracy_tests.append(accuracy_score(y_test, predictions))

    fold_predictions.append(predictions)
    fold_targets.append(y_test)

    # Visualize Decision Tree Model of each 5-fold

    dot_data = tree.export_graphviz(model, feature_names=list(X.columns),
                                    class_names=list(class_name), filled=True,
                                    impurity=True)
    graphs.append(graphviz.Source(dot_data))

# Concatenating once keeps the integer label dtype (appending to an empty
# np.array([]) silently turned the labels into floats).
predicted_targets = np.concatenate(fold_predictions)
actual_targets = np.concatenate(fold_targets)

# Plot the tree of the fold with the best test accuracy.
# The first argument of render() is the base file name and the format is
# appended to it, so this writes 'graph.png'. Passing 'graph.png' as the file
# name stored the DOT source under that name and rendered to 'graph.png.pdf'.

best_fold = int(np.argmax(accuracy_tests))
graphs[best_fold].render('graph', format='png', cleanup=True, view=True)


# Confusion Matrix and Classification Report (Precision, Recall, f1-score)
# computed over the pooled out-of-fold predictions.

cm = confusion_matrix(actual_targets, predicted_targets)
report = classification_report(actual_targets, predicted_targets, target_names=class_name)

# Visualizing all results

print(accuracy_trainings)
print(accuracy_tests)
print('training accuracy = {}'.format(np.average(accuracy_trainings)))
print('test accuracy = {}'.format(np.average(accuracy_tests)))
print(report)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_name)
fig, ax = plt.subplots(figsize=(7, 6))
disp.plot(ax=ax)




In [None]:
# Random Forest Ensemble with 5-fold Cross Validation

X = dataset_1
y = target

# max_features='auto' was removed in scikit-learn 1.3; for classifiers it meant 'sqrt'.
# random_state fixes the bootstrap samples and feature subsets so runs are repeatable.
model = RandomForestClassifier(n_estimators=72, min_samples_split=2, min_samples_leaf=3,
                               max_features='sqrt', max_depth=3, bootstrap=True,
                               random_state=0)

accuracy_tests = []
accuracy_trainings = []

# Per-fold predictions / ground truth, pooled after the loop.
fold_predictions = []
fold_targets = []

list_shap_values = []
list_test_sets = []

# Training data with Stratified 5-fold Cross Validation

for train_index, test_index in StratifiedKFold(n_splits=5).split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy_trainings.append(accuracy_score(y_train, model.predict(X_train)))
    accuracy_tests.append(accuracy_score(y_test, predictions))

    fold_predictions.append(predictions)
    fold_targets.append(y_test)

    # SHAP Values
    # Normalise every fold to shape (n_classes, n_samples, n_features).
    # Older shap releases return a list with one (n_samples, n_features) array
    # per class; newer releases are documented to return a single array shaped
    # (n_samples, n_features, n_classes) -- NOTE(review): confirm against the
    # installed shap version.

    explainer = shap.TreeExplainer(model)
    fold_shap = explainer.shap_values(X_test)
    if isinstance(fold_shap, list):
        fold_shap = np.stack(fold_shap, axis=0)
    elif fold_shap.ndim == 3:
        fold_shap = np.moveaxis(fold_shap, -1, 0)
    list_shap_values.append(fold_shap)
    list_test_sets.append(test_index)

predicted_targets = np.concatenate(fold_predictions)
actual_targets = np.concatenate(fold_targets)

# combined confusion matrix and classification report

cm = confusion_matrix(actual_targets, predicted_targets)
report = classification_report(actual_targets, predicted_targets, target_names=class_name)

# combining results for the SHAP graph: join the folds along the sample axis
# and rebuild the matching feature rows in the same order. X_test is
# intentionally rebound to the pooled test rows because the SHAP plotting cell
# reads `shap_values` and `X_test`.

test_set = np.concatenate(list_test_sets, axis=0)
shap_values = np.concatenate(list_shap_values, axis=1)
X_test = X.iloc[test_set]

# visualizing confusion matrix, accuracy, and classification report

print(accuracy_trainings)
print(accuracy_tests)
print('training accuracy = {}'.format(np.average(accuracy_trainings)))
print('test accuracy = {}'.format(np.average(accuracy_tests)))
print(report)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_name)
fig, ax = plt.subplots(figsize=(7, 6))
disp.plot(ax=ax)


In [None]:
# SHAP summary plot over the pooled test rows of all folds.

# Split the combined SHAP array along its first axis into one array per entry.
per_class_shap = [class_values for class_values in shap_values]
shap.summary_plot(per_class_shap, X_test, class_names=class_name)