## Todo
- eda midi file typ[e]
- first 30 seconds?
- add anomaly detection
- try some cross validation or other model effects (SVM)
- more features 
    - get time signature from meta messages
    - stdev of velocity (instead of just average)
    - create some manual cross variables with timing and key and time sig

## Initial Imports and Paths

In [99]:
from composer_class_funcs import *

In [100]:
# viz
import matplotlib.pyplot as plt
import seaborn as sns
# ml packages
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve, roc_curve, auc
from sklearn.preprocessing import LabelEncoder

In [101]:
# Directories holding the labeled (PS1) and unlabeled (PS2) MIDI files.
train_midi_path = "./Challenge_DataSet/PS1/"
test_midi_path = "./Challenge_DataSet/PS2/"

# List the contents of each directory as a quick sanity check.
for dataset_dir in (train_midi_path, test_midi_path):
    print(os.listdir(dataset_dir))

['.DS_Store', 'Beethoven', 'Schubert', 'Bach', 'Brahms']
['0.8014751784512073_adj.mid', '0.981087291054314_adj.mid', '0.5807329043589801_adj.mid', '0.23120017256495873_adj.mid', '0.033313986422223163_adj.mid', '0.3559970176888735_adj.mid', '0.3264565808984162_adj.mid', '0.905499891236923_adj.mid', '0.10250888365879718_adj.mid', '0.48367685897240176_adj.mid', '0.549470161204349_adj.mid', '0.33695573887576447_adj.mid', '0.7491289879531658_adj.mid', '0.337517805339117_adj.mid', '0.07186746659481313_adj.mid', '0.09167358800381353_adj.mid', '0.539512676743813_adj.mid', '0.26551079719260606_adj.mid', '0.647959423719129_adj.mid', '0.047574444458241216_adj.mid', '0.10222964826466285_adj.mid', '0.22047111832936942_adj.mid', '0.1755252422917658_adj.mid', '0.36321860283443286_adj.mid', '0.21198476749665085_adj.mid', '0.06402123326764841_adj.mid', '0.3620067189216978_adj.mid', '0.9511403301279795_adj.mid', '0.1960551158929671_adj.mid', '0.21901852969811753_adj.mid', '0.10630249969742178_adj.mid', 

## Data Collection and Processing

In [None]:
# Extract features from the PS2 directory with labeled=False; the result is
# bound to a single name, so presumably only features are returned — confirm
# against load_dataset.
unlabeled_features = load_dataset(test_midi_path, labeled=False)

In [None]:
# Extract features and their labels from the PS1 directory (labeled=True).
features, labels = load_dataset(train_midi_path, labeled=True)

## EDA

In [None]:
# Combine the extracted features and labels into one DataFrame for EDA.
df_labeled = create_dataframe(features, labels)
# Keep a CSV snapshot of the EDA table in the current working directory.
df_labeled.to_csv('eda_df.csv')
# Preview the first rows.
df_labeled.head()

In [None]:
# Build the matching DataFrame for the unlabeled files (no labels passed).
df_unlabeled=create_dataframe(unlabeled_features)
# Preview the first rows.
df_unlabeled.head()

In [None]:
# Print three overview sections for the labeled DataFrame, each preceded by
# its heading: structure, summary statistics, and per-column null counts.
overview_sections = (
    ("\nBasic Information about the DataFrame:", df_labeled.info),
    ("\nSummary Statistics of the DataFrame:", df_labeled.describe),
    ("\nMissing Values in the DataFrame:", lambda: df_labeled.isnull().sum()),
)
for heading, build_report in overview_sections:
    print(heading)
    # info() writes its own report and returns None, so "None" follows it.
    print(build_report())

In [None]:
# Count how many rows carry each key value (pandas omits missing values from
# value_counts by default).
df_labeled.key.value_counts()

In [None]:
# Histogram with a KDE overlay for every float64/int64 column of the labeled data.
numeric_columns = df_labeled.select_dtypes(include=['float64', 'int64']).columns

plt.figure(figsize=(20, 15))
# Subplot positions are 1-based; the fixed 10 x 14 grid offers 140 slots.
for position, column_name in enumerate(numeric_columns, start=1):
    plt.subplot(10, 14, position)
    sns.histplot(df_labeled[column_name], kde=True)
    plt.title(f'Distribution of {column_name}')
plt.tight_layout()
plt.show()

In [None]:
# Same per-feature histograms, this time for the unlabeled data.
# Note that numeric_columns is rebound to the unlabeled frame's columns here.
numeric_columns = df_unlabeled.select_dtypes(include=['float64', 'int64']).columns

plt.figure(figsize=(20, 15))
# Subplot positions are 1-based; the fixed 10 x 14 grid offers 140 slots.
for position, column_name in enumerate(numeric_columns, start=1):
    plt.subplot(10, 14, position)
    sns.histplot(df_unlabeled[column_name], kde=True)
    plt.title(f'Distribution of {column_name}')
plt.tight_layout()
plt.show()

based on this, I'd remove notes 0-22, 105-127, unless we want to add back in for novelty detection

In [None]:
# Remove the note-count columns for pitches 0-22 and 105-127.
dropped_pitches = [pitch for pitch in range(128) if pitch < 23 or pitch > 104]
cols_to_drop = [f'Note_{pitch}' for pitch in dropped_pitches]
df_labeled.drop(columns=cols_to_drop, inplace=True)

# Refresh the numeric column list so later cells skip the dropped columns.
numeric_columns = df_labeled.select_dtypes(include=['float64', 'int64']).columns

In [None]:
# Re-draw the per-feature histograms for the columns currently in numeric_columns.
plt.figure(figsize=(20, 15))
# Subplot positions are 1-based; the grid stays at 10 x 14.
for position, column_name in enumerate(numeric_columns, start=1):
    plt.subplot(10, 14, position)
    sns.histplot(df_labeled[column_name], kde=True)
    plt.title(f'Distribution of {column_name}')
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric features, drawn as an annotated heatmap.
correlation_matrix = df_labeled[numeric_columns].corr()

plt.figure(figsize=(12, 10))
heatmap_options = {'annot': True, 'cmap': 'coolwarm', 'fmt': '.2f'}
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Box plots of the first two numeric features, split by composer.
# Skipped entirely when the target column is absent.
if 'composer' in df_labeled.columns:
    for feature_position in (0, 1):
        plt.figure(figsize=(10, 6))
        feature_name = numeric_columns[feature_position]
        sns.boxplot(x='composer', y=feature_name, data=df_labeled, hue='composer')
        plt.title(f'{feature_name} by Composer')
        plt.show()

In [None]:
# Count rows per (key, composer) pair, then pivot composers into columns,
# filling combinations that never occur with 0.
rows_per_pair = df_labeled.groupby(['key', 'composer']).size()
key_composer_counts = rows_per_pair.unstack(fill_value=0)

# One stacked bar per key, segmented by composer.
key_composer_counts.plot.bar(stacked=True, figsize=(14, 7), colormap='viridis')
plt.title('Number of Songs per Key, Colored by Composer')
plt.xlabel('Key')
plt.ylabel('Number of Songs')
plt.xticks(rotation=90)
plt.legend(title='Composer')
plt.show()


## Clean + Feature engineer

In [None]:
# Replace null keys with the placeholder value 'unk' so that every row has a
# key value.
df_labeled['key']=df_labeled['key'].fillna('unk')

In [None]:
# Encode the 'key' variable as integer codes in a new 'key_encoded' column;
# the original 'key' column is kept.
# NOTE(review): the codes impose an arbitrary ordering on the keys — confirm
# that is acceptable for the models below, or one-hot encode instead.
label_encoder_key = LabelEncoder()
df_labeled['key_encoded'] = label_encoder_key.fit_transform(df_labeled['key'])

In [None]:
# Encode the 'composer' column in place: the composer names are overwritten
# with integer class ids, and label_encoder_composer.classes_ keeps the names
# for mapping predictions back.
label_encoder_composer = LabelEncoder()
df_labeled['composer'] = label_encoder_composer.fit_transform(df_labeled['composer'])

In [None]:
# Define the features (X) and target (y).
# The target and the raw 'key' strings are dropped from X; the integer
# 'key_encoded' column stays in as a feature.
X = df_labeled.drop(columns=['composer', 'key'])
y = df_labeled['composer']

In [None]:
# Split the data into training and testing sets: 40% is held out and the seed
# is fixed so the split is reproducible.
# NOTE(review): no stratify argument is passed, so composer proportions may
# differ between the two splits — compare the class counts printed below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [None]:
# Show the composer names behind the encoded ids, then the class balance of
# the training and test targets.
print(label_encoder_composer.classes_,'\n')
for heading, targets in (('train targets\n', y_train), ('\ntest targets\n', y_test)):
    print(heading, targets.value_counts())

## Train Classifier

### Logistic Regression

In [None]:
# Train and evaluate the Logistic Regression model
# (iteration cap raised to 10000; seed fixed for reproducibility).
log_reg = LogisticRegression(max_iter=10000, random_state=42)
log_reg.fit(X_train, y_train)
# Hard class predictions for the held-out split
y_pred_lr = log_reg.predict(X_test)
# Get the classification probabilities for each class
y_proba_lr = log_reg.predict_proba(X_test)

In [None]:
# Headline metrics for the logistic regression model on the held-out split.
print("Logistic Regression:")
print("Accuracy Score:", accuracy_score(y_test, y_pred_lr))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr,))

# Per-sample breakdown: the probability assigned to each composer, followed by
# the name of the predicted composer.
# NOTE(review): pairing probability columns with the encoder's class names by
# position assumes every composer appears in the training split — confirm.
print("\nLogistic Regression Classification Probabilities:")
composer_names = label_encoder_composer.classes_
for sample_idx, (sample_probs, predicted_id) in enumerate(zip(y_proba_lr, y_pred_lr)):
    print(f"Sample {sample_idx}:")
    for composer_name, class_prob in zip(composer_names, sample_probs):
        print(f"  Class {composer_name}: {class_prob:.4f}")
    predicted_name = label_encoder_composer.inverse_transform([predicted_id])[0]
    print(f"  Predicted Class: {predicted_name}\n")

In [None]:
# One histogram per composer showing how the logistic regression's predicted
# probability for that class is distributed over the test samples.
class_labels = label_encoder_composer.classes_
num_classes = len(class_labels)

plt.figure(figsize=(14, 10))
# Subplot rows are 1-based, probability columns are 0-based.
for row, class_name in enumerate(class_labels, start=1):
    plt.subplot(num_classes, 1, row)
    sns.histplot(y_proba_lr[:, row - 1], kde=True, bins=20)
    plt.title(f'Class {class_name}: Probability Distribution')
    plt.xlabel('Predicted Probability')
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

In [None]:
# One-vs-rest ROC curve per composer for the logistic regression model.
plt.figure(figsize=(14, 10))
for class_id in range(num_classes):
    # Binary target: does the sample belong to this class?
    is_this_class = (y_test == class_id)
    fpr, tpr, thresholds = roc_curve(is_this_class, y_proba_lr[:, class_id])
    roc_auc = auc(fpr, tpr)
    curve_label = f'Class {class_labels[class_id]} (AUC = {roc_auc:.2f})'
    plt.plot(fpr, tpr, label=curve_label)

# Diagonal reference line: the performance of random guessing.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves')
plt.legend(loc='best')
plt.show()

### Random Forest

In [None]:
# Build and train the Random Forest classifier (100 trees, fixed seed for
# reproducibility) on the same training split as the logistic regression.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

In [None]:
# Predict the target on the test set (hard class ids)
y_pred_rf = rf_classifier.predict(X_test)

# Get the classification probabilities for each class
# (one row per test sample, one column per class)
y_proba_rf = rf_classifier.predict_proba(X_test)

In [None]:
# Headline metrics for the random forest on the held-out split.
print("Accuracy Score:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

# Per-sample breakdown: the probability assigned to each composer, followed by
# the name of the predicted composer.
# NOTE(review): pairing probability columns with the encoder's class names by
# position assumes every composer appears in the training split — confirm.
print("\nRF Classification Probabilities:")
composer_names = label_encoder_composer.classes_
for sample_idx, (sample_probs, predicted_id) in enumerate(zip(y_proba_rf, y_pred_rf)):
    print(f"Sample {sample_idx}:")
    for composer_name, class_prob in zip(composer_names, sample_probs):
        print(f"  Class {composer_name}: {class_prob:.4f}")
    predicted_name = label_encoder_composer.inverse_transform([predicted_id])[0]
    print(f"  Predicted Class: {predicted_name}\n")

In [None]:
# One histogram per composer showing how the random forest's predicted
# probability for that class is distributed over the test samples.
class_labels = label_encoder_composer.classes_
num_classes = len(class_labels)

plt.figure(figsize=(14, 10))
# Subplot rows are 1-based, probability columns are 0-based.
for row, class_name in enumerate(class_labels, start=1):
    plt.subplot(num_classes, 1, row)
    sns.histplot(y_proba_rf[:, row - 1], kde=True, bins=20)
    plt.title(f'Class {class_name}: Probability Distribution')
    plt.xlabel('Predicted Probability')
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

In [None]:
# One-vs-rest ROC curve per composer for the random forest model.
plt.figure(figsize=(14, 10))
for class_id in range(num_classes):
    # Binary target: does the sample belong to this class?
    is_this_class = (y_test == class_id)
    fpr, tpr, thresholds = roc_curve(is_this_class, y_proba_rf[:, class_id])
    roc_auc = auc(fpr, tpr)
    curve_label = f'Class {class_labels[class_id]} (AUC = {roc_auc:.2f})'
    plt.plot(fpr, tpr, label=curve_label)

# Diagonal reference line: the performance of random guessing.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves')
plt.legend(loc='best')
plt.show()

## Inference

In [None]:
# def predict_composer(file_path, classifier):
#     features = extract_features_from_midi(file_path)
#     prediction = classifier.predict([features])
#     return prediction[0]

# # Example usage
# new_midi_file = 'new_piece.mid'
# composer = predict_composer(new_midi_file, clf)
# print(f"The predicted composer is: {composer}")


# scratch

In [None]:
# Example files for the scratch experiments: one unlabeled (PS2) file and one
# labeled (PS1) Bach piece.
file_path_ps2 = f"{test_midi_path}0.981087291054314_adj.mid"
file_path_ps1 = f"{train_midi_path}Bach/Cello Suite 3_BWV1009_2217_cs3-1pre.mid"

In [102]:
# Count note-on events per pitch across every track of the PS1 example file.
midi = mido.MidiFile(file_path_ps1)

# Accumulators
note_counts = [0] * 128  # one slot per MIDI pitch, 0 to 127
total_velocity = 0
note_on_count = 0
key = ''  # each file should have only 1 key. Investigate if this assumption is correct.
tpb = midi.ticks_per_beat

# Walk the messages of all tracks, one track after another.
all_messages = (message for track in midi.tracks for message in track)
for msg in all_messages:
    # Only note_on messages with a positive velocity are counted.
    is_sounding_note = msg.type == 'note_on' and msg.velocity > 0
    if is_sounding_note:
        pitch = msg.note
        note_counts[pitch] += 1
        total_velocity += msg.velocity
        note_on_count += 1
    # The last key_signature meta message seen wins.
    if msg.is_meta and msg.type == 'key_signature':
        key = msg.key
print(note_counts)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 8, 1, 13, 9, 3, 77, 1, 30, 6, 42, 56, 6, 66, 10, 102, 96, 26, 88, 13, 98, 10, 89, 87, 0, 37, 1, 13, 8, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [105]:
# Count note-on events per pitch, restricted to the first 30 seconds of the
# PS1 example file, and report the file's total elapsed time.
midi = mido.MidiFile(file_path_ps1)

# Accumulators
note_counts = [0] * 128  # one slot per MIDI pitch, 0 to 127
total_velocity = 0
note_on_count = 0
elapsed_time = 0
key = ''  # each file should have only 1 key. Investigate if this assumption is correct.
tpb = midi.ticks_per_beat

# Iterate over the file object itself rather than its individual tracks.
for msg in midi:
    # Remember the key signature whenever one is announced.
    if msg.type == 'key_signature' and msg.is_meta:
        key = msg.key

    # Keep accumulating time for every message so the total is printed at the end.
    elapsed_time += msg.time
    if elapsed_time > 30:
        continue

    # Inside the window: count only note_on messages with a positive velocity.
    if msg.type != 'note_on' or msg.velocity <= 0:
        continue
    note_counts[msg.note] += 1
    total_velocity += msg.velocity
    note_on_count += 1

print(note_counts)
print(elapsed_time)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 1, 0, 4, 0, 4, 0, 5, 10, 0, 13, 0, 16, 5, 10, 11, 0, 13, 0, 13, 13, 0, 7, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
225.42860899999616


In [None]:
# Print the header-level properties of the PS1 example file.
midi = mido.MidiFile(file_path_ps1)
header_fields = (
    ('Midi file type', 'type'),
    ('Length', 'length'),
    ('Ticks per beat', 'ticks_per_beat'),
)
# Attributes are read one at a time, immediately before they are printed.
for label, attribute in header_fields:
    print(label, getattr(midi, attribute))

# Kept at module level for the tick-to-second experiments below.
ticks_per_beat = midi.ticks_per_beat

In [None]:
# Load the MIDI file
midi_file = mido.MidiFile(file_path_ps1)

# Define the duration in seconds for which we want to collect messages
target_duration = 30

# List to store the messages within the first 30 seconds
messages_within_duration = []

# Iterate over the MidiFile object itself rather than over its tracks: this
# yields the messages of all tracks merged in playback order, with
# message.time already converted to seconds since the previous message.
# Messages read from a track carry delta *ticks*, which cannot be compared
# against a duration expressed in seconds.
elapsed_time = 0
for message in midi_file:
    elapsed_time += message.time
    if elapsed_time > target_duration:
        # Playback time only grows, so no later message can fall in the window.
        break
    messages_within_duration.append(message)
    print(elapsed_time)
    print(message)


# Display the collected messages
# for msg in messages_within_duration:
#     print(msg)

In [None]:
# Compare raw delta ticks with their converted duration in seconds, track by track.
# Uses `midi` and `ticks_per_beat` from the header-info cell above.

# Default MIDI tempo in microseconds per beat (120 BPM); it applies until a
# set_tempo meta message overrides it.
tempo=500000

for track in midi.tracks:
    # print(track)
    # Each track has its own timeline starting at zero, so the running total
    # restarts per track instead of accumulating across tracks.
    # NOTE(review): `tempo` carries over from the previous track; in multi-track
    # files the tempo changes usually live in the first track, so times for the
    # other tracks are only approximate here. Iterating `midi` directly gives
    # exact merged timing.
    elapsed_time=0
    for msg in track:
        # A message's delta time elapsed under the tempo in force *before* the
        # message, so convert first and only then apply a tempo change.
        calc_time = mido.tick2second(msg.time, ticks_per_beat, tempo)
        elapsed_time += calc_time

        if msg.type == 'set_tempo':
            tempo = msg.tempo
            print('Tempo:\t',tempo)

        print('msg.time\t',msg.time)
        print('calc time:\t', calc_time)
        print('Elapsed Time:\t',elapsed_time)
        # if elapsed_time <= 30:
        #     print(msg.time)

In [None]:
def exp_mido(file_path, max_seconds=30):
    """Print a MIDI file's header info and the messages in its opening seconds.

    Args:
        file_path: Path of the MIDI file to inspect.
        max_seconds: Playback-time cutoff in seconds. Messages whose cumulative
            time is at or below this value are printed. Defaults to 30.
    """
    midi = mido.MidiFile(file_path)
    print('Midi file type', midi.type)
    print('Length',midi.length)
    print('Ticks per beat',midi.ticks_per_beat)

    elapsed_time = 0
    # Iterating a MidiFile yields the messages in playback order with msg.time
    # already expressed in seconds (tempo changes applied), so it must not be
    # run through tick2second a second time.
    for msg in midi:
        elapsed_time += msg.time
        if elapsed_time > max_seconds:
            # Playback time only grows, so nothing later can qualify.
            break
        print(msg)

In [None]:
# Run the exploration helper on the labeled (PS1) example file.
exp_mido(file_path_ps1)

In [None]:
# Run the exploration helper on the unlabeled (PS2) example file.
exp_mido(file_path_ps2)

In [None]:
# # Generate the heatmap
# plt.figure(figsize=(12, 8))
# sns.heatmap(composer_avg, cmap='viridis', cbar=True)

# # Display the plot
# plt.title('Composer Note Values Heatmap')
# plt.xlabel('Notes')
# plt.ylabel('Composers')
# plt.show()

In [None]:
# # Transform the DataFrame into a long format
# df_long = pd.melt(composer_avg.reset_index(), id_vars=['Composer'], var_name='Note', value_name='Value')

# # Plot the bar chart
# plt.figure(figsize=(15, 8))
# sns.barplot(x='Note', y='Value', hue='Composer', data=df_long)

# # Customize the plot
# plt.title('Composer Note Values Bar Chart')
# plt.xlabel('Notes')
# plt.ylabel('Values')
# plt.legend(title='Composer')
# plt.xticks(rotation=90)  # Rotate x-axis labels if needed for better readability

# # Display the plot
# plt.show()