In [11]:
import os
import glob
import sys
import random
import tables
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# Root directory that is walked recursively for the dataset's .h5 song files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
msd_subset_data_path = '/Users/finn/Desktop/MMAI_Final/MillionSongSubset/data'
# Directory containing the hdf5_getters module; it is appended to sys.path so
# that the import below can be resolved.
msd_code_path='/Users/finn/Desktop/MMAI_Final/PythonSrc'
sys.path.append(msd_code_path)

# Must come after the sys.path.append above, which makes this module importable.
import hdf5_getters


# Sizes of the confidence-weighted one-hot groups emitted for every song.
_NUM_KEYS = 12
_NUM_MODES = 2
_NUM_TIME_SIGNATURES = 16

# Continuous columns that are z-score standardized (the target included).
_NUMERIC_FIELDS = ['artist_familiarity', 'artist_hotttnesss', 'song_hotttnesss',
                   'loudness', 'year', 'tempo']


def _is_present(value):
    """Return True when *value* is neither zero nor NaN (both are rejected by the filter)."""
    return bool(value != 0 and not np.isnan(value))


def _confidence_one_hot(size, index, confidence):
    """Return a zero vector of length *size* holding *confidence* at position *index*."""
    weights = np.zeros(size)
    weights[index] = confidence
    return weights


def _read_song_features(h5):
    """Read one song's features from an open HDF5 handle; return None if the song is unusable."""
    artist_familiarity = hdf5_getters.get_artist_familiarity(h5)
    artist_hotttnesss = hdf5_getters.get_artist_hotttnesss(h5)
    song_hotttnesss = hdf5_getters.get_song_hotttnesss(h5)
    loudness = hdf5_getters.get_loudness(h5)
    year = hdf5_getters.get_year(h5)
    key = hdf5_getters.get_key(h5)
    key_confidence = hdf5_getters.get_key_confidence(h5)
    mode = hdf5_getters.get_mode(h5)
    mode_confidence = hdf5_getters.get_mode_confidence(h5)
    time_signature = hdf5_getters.get_time_signature(h5)
    time_signature_confidence = hdf5_getters.get_time_signature_confidence(h5)
    tempo = hdf5_getters.get_tempo(h5)

    # Every continuous value and every confidence must be non-zero and non-NaN.
    required = (artist_familiarity, artist_hotttnesss, song_hotttnesss, loudness,
                year, key_confidence, mode_confidence, time_signature_confidence,
                tempo)
    if not all(_is_present(value) for value in required):
        return None

    # Validate the categorical values BEFORE they are used as indices, so an
    # out-of-range value is filtered out instead of raising (or, for negative
    # values, silently wrapping around). NaN fails every comparison below.
    if not (0 <= key < _NUM_KEYS
            and 0 <= mode < _NUM_MODES
            and 1 <= time_signature <= _NUM_TIME_SIGNATURES):
        return None

    # A key of 0 is accepted, so the key indexes its slot directly: column
    # key_i holds the confidence of key i.
    # NOTE(review): assumes keys are 0-11 pitch classes — confirm against the
    # dataset's field documentation.
    confidence_weighted_key = _confidence_one_hot(_NUM_KEYS, int(key), key_confidence)
    confidence_weighted_mode = _confidence_one_hot(_NUM_MODES, int(mode), mode_confidence)
    # Time signature 0 is rejected above, so values 1..16 map onto slots 0..15:
    # column time_sig_i holds the confidence of time signature i + 1.
    confidence_weighted_time_signature = _confidence_one_hot(
        _NUM_TIME_SIGNATURES, int(time_signature) - 1, time_signature_confidence)

    # Key order is significant: it becomes the DataFrame column order.
    return {
        'artist_familiarity': artist_familiarity,
        'artist_hotttnesss': artist_hotttnesss,
        'song_hotttnesss': song_hotttnesss,
        'loudness': loudness,
        'year': year,
        **{f'key_{i}': confidence_weighted_key[i] for i in range(_NUM_KEYS)},
        **{f'mode_{i}': confidence_weighted_mode[i] for i in range(_NUM_MODES)},
        **{f'time_sig_{i}': confidence_weighted_time_signature[i]
           for i in range(_NUM_TIME_SIGNATURES)},
        'tempo': tempo,
    }


def _standardize_records(records, means, std_devs):
    """Return copies of *records* with every scalable field converted to a z-score."""
    # `std > 0` also rejects NaN (the sample standard deviation of a single
    # row), so such fields are copied unchanged instead of becoming NaN.
    scalable = {name for name, std in std_devs.items() if std > 0}
    standardized = []
    for record in records:
        standardized.append({
            name: (value - means[name]) / std_devs[name] if name in scalable else value
            for name, value in record.items()
        })
    return standardized


def store_all_songs(data_path):
    """Collect features from every usable ``.h5`` song file under *data_path*.

    The directory tree is walked recursively in sorted order, so the row order
    (and therefore any seeded train/test split made from it) is reproducible.
    A song is kept only when all of its continuous fields and confidences are
    non-zero and non-NaN and its key, mode and time signature are in range.
    Files that raise while being opened or read are reported on stdout and
    skipped; an opened file handle is always closed.

    The means and standard deviations of the continuous fields are printed,
    then used to z-score those fields (``song_hotttnesss`` and ``year``
    included). The one-hot confidence columns are copied unchanged.

    Args:
        data_path: Root directory to search for ``.h5`` files.

    Returns:
        A tuple ``(all_file_paths, unstandardized_song_data,
        standardized_song_data)``: the paths of the kept files and two
        parallel lists of per-song dicts. All three are empty when no usable
        song is found.
    """
    all_file_paths = []
    unstandardized_song_data = []
    for root, dirs, files in os.walk(data_path):
        dirs.sort()  # in-place sort makes os.walk descend in a fixed order
        for file_name in sorted(files):
            if not file_name.endswith('.h5'):
                continue
            file_path = os.path.join(root, file_name)
            # Deliberately broad: one unreadable file must not abort the whole
            # scan, so the failure is reported and the file skipped.
            try:
                h5 = hdf5_getters.open_h5_file_read(file_path)
                try:
                    features = _read_song_features(h5)
                finally:
                    h5.close()  # close even when a getter raises
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
                continue
            if features is not None:
                all_file_paths.append(file_path)
                unstandardized_song_data.append(features)

    # Nothing usable: there are no columns to take statistics of.
    if not unstandardized_song_data:
        return all_file_paths, unstandardized_song_data, []

    df = pd.DataFrame(unstandardized_song_data)
    means = df[_NUMERIC_FIELDS].mean().to_dict()
    std_devs = df[_NUMERIC_FIELDS].std().to_dict()
    print(means)
    print('\n')
    print(std_devs)

    standardized_song_data = _standardize_records(unstandardized_song_data, means, std_devs)

    return all_file_paths, unstandardized_song_data, standardized_song_data

def print_random_song_data(song_data, num_samples=20):
    """Print the fields of randomly chosen entries of *song_data*.

    Args:
        song_data: Sequence of per-song dicts.
        num_samples: How many distinct entries to print. When it exceeds the
            number of entries, a notice is printed and every entry is shown.
    """
    total = len(song_data)
    if total < num_samples:
        print("The total number of entries is less than the number of samples requested.")
        num_samples = total  # cannot sample more entries than exist

    for chosen in random.sample(range(total), num_samples):
        print(f"Data for index {chosen}:")
        record = song_data[chosen]
        for field, content in record.items():
            # List-valued "confidence_weighted" fields are shown as NumPy arrays.
            is_weight_list = isinstance(content, list) and "confidence_weighted" in field
            shown = np.array(content) if is_weight_list else content
            print(f"{field}: {shown}")
        print("\n")  # blank lines separate consecutive entries

# Load every usable song under the dataset root: the kept file paths plus the
# raw and the standardized per-song feature dicts.
file_paths, unstandardized_song_data, standardized_song_data = store_all_songs(msd_subset_data_path)

#print_random_song_data(standardized_song_data, num_samples=20)

# Report how many songs passed the filter. store_all_songs appends to both
# lists together, so the two counts printed here are always equal.
print(len(file_paths))
print("\n")
print('\nUNSTANDARDIZED DATA: ')
print(len(unstandardized_song_data))
print("\n\n")


# Show a random sample (default: 20 entries) of the standardized records.
print_random_song_data(standardized_song_data)
df = pd.DataFrame(standardized_song_data)
# Split the data: song_hotttnesss is the target, every other column a feature;
# 20% of the rows are held out for testing.
# NOTE(review): the standardization statistics were computed over ALL rows
# before this split, and the target itself was standardized, so the metrics
# below are in standardized (z-score) units.
X = df.drop('song_hotttnesss', axis=1)
y = df['song_hotttnesss']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest model (100 trees, fixed seed)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate and print Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Optionally print R-squared to assess the fit
r2 = model.score(X_test, y_test)
print("R-squared:", r2)


{'artist_familiarity': 0.6564103569314292, 'artist_hotttnesss': 0.45928956055725156, 'song_hotttnesss': 0.5068381841687911, 'loudness': -9.071640886699507, 'year': 1999.284236453202, 'tempo': 124.94135073891624}


{'artist_familiarity': 0.12578967354058493, 'artist_hotttnesss': 0.11116850220829746, 'song_hotttnesss': 0.16439692384099608, 'loudness': 4.8428426003792975, 'year': 9.55781854302445, 'tempo': 33.61532371658586}
2030



UNSTANDARDIZED DATA: 
2030



Data for index 1090:
artist_familiarity: -0.540900549282655
artist_hotttnesss: -0.41425112075738285
song_hotttnesss: 0.07251786438436607
loudness: 0.606594335002634
year: 0.5980196758359084
key_0: 0.0
key_1: 0.0
key_2: 0.0
key_3: 0.0
key_4: 0.0
key_5: 0.0
key_6: 0.0
key_7: 0.0
key_8: 0.0
key_9: 0.0
key_10: 0.429
key_11: 0.0
mode_0: 0.413
mode_1: 0.0
time_sig_0: 0.0
time_sig_1: 0.0
time_sig_2: 0.0
time_sig_3: 0.824
time_sig_4: 0.0
time_sig_5: 0.0
time_sig_6: 0.0
time_sig_7: 0.0
time_sig_8: 0.0
time_sig_9: 0.0
time_sig_10: 0.0
time_