In [4]:
import pandas as pd
import numpy as np

def clean_data(file_path, random_state=None):
    """
    Clean and preprocess the data.

    Loads the CSV, shuffles its rows, z-score normalizes every feature
    column and ordinal-encodes the 'label' column.

    Parameters:
    - file_path (str): Path to the CSV file containing the data. The file
      must have a 'label' column; every other column is used as a feature.
    - random_state (int, optional): Seed forwarded to ``DataFrame.sample``
      so the shuffle is reproducible. The default (None) keeps the original
      non-deterministic shuffle.

    Returns:
    - x (pd.DataFrame): Normalized feature data.
    - y (pd.Series): Encoded labels.

    Raises:
    - KeyError: If the file has no 'label' column.
    """
    # Load the data
    df = pd.read_csv(file_path)

    # Shuffle the rows and renumber them 0..n-1
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Separate the features and the labels
    y = df['label']
    x = df.drop(['label'], axis=1)

    # Normalize the feature data (z-score, using this file's own statistics).
    # A constant column has std 0 (and a single-row file has std NaN), which
    # would turn the whole column into NaN; use a divisor of 1 there so the
    # column becomes all zeros instead.
    feature_std = x.std()
    feature_std = feature_std.where(feature_std > 0, 1)
    x = (x - x.mean()) / feature_std
    print(x.iloc[0])

    # Ordinal encoding for the label.
    # Series.unique() returns values in order of first appearance, which
    # depends on the random shuffle above. Sorting makes the label -> code
    # mapping deterministic, so two files with the same label set (e.g. a
    # train and a test split) are encoded identically.
    label_dict = {label: code for code, label in enumerate(sorted(y.unique()))}
    y = y.map(label_dict)

    return x, y

# Example usage: clean both splits, preview them, then persist as .npy files.
train_path = 'train_data_final.csv'
test_path = 'test_data_final.csv'

x_train, y_train = clean_data(train_path)
x_test, y_test = clean_data(test_path)

# Preview the first rows of every frame/series, in the same order as saved.
for preview in (x_train, y_train, x_test, y_test):
    print(preview.head())

# Convert to C-ordered numpy arrays: float32 features, unsigned-int labels.
x_train = x_train.to_numpy().astype(np.float32, order='C')
y_train = y_train.to_numpy().astype(np.uint, order='C')
x_test = x_test.to_numpy().astype(np.float32, order='C')
y_test = y_test.to_numpy().astype(np.uint, order='C')

# Save as npy files
for npy_name, array in (
    ('x_train.npy', x_train),
    ('y_train.npy', y_train),
    ('x_test.npy', x_test),
    ('y_test.npy', y_test),
):
    np.save(npy_name, array)

rmse_mean                  1.280204
rmse_std                   0.193029
spectral_centroid_mean     0.148130
spectral_centroid_std     -0.918708
spectral_bandwidth_mean    0.399287
                             ...   
tonnetz6_std              -0.690576
chroma_cqt_mean           -1.533530
chroma_cqt_std             0.192809
spectral_contrast_mean    -0.711459
spectral_contrast_std     -0.227020
Name: 0, Length: 130, dtype: float64
rmse_mean                 -0.419098
rmse_std                  -0.958877
spectral_centroid_mean     0.788694
spectral_centroid_std     -0.227098
spectral_bandwidth_mean    0.920139
                             ...   
tonnetz6_std              -0.563680
chroma_cqt_mean            1.908725
chroma_cqt_std            -1.546972
spectral_contrast_mean    -1.268329
spectral_contrast_std      0.198919
Name: 0, Length: 130, dtype: float64
   rmse_mean  rmse_std  spectral_centroid_mean  spectral_centroid_std  \
0   1.280204  0.193029                0.148130              -

In [5]:
# Read the saved training arrays back and report, for each one,
# its shape, its first entry and its dtype.
x_train = np.load('x_train.npy')
for detail in (x_train.shape, x_train[0], x_train.dtype):
    print(detail)

y_train = np.load('y_train.npy')
for detail in (y_train.shape, y_train[0], y_train.dtype):
    print(detail)

(32000, 130)
[ 1.2802037   0.1930291   0.14813042 -0.9187077   0.39928705 -1.1263012
  0.3371404  -0.8981565  -0.45451304  0.0621605   1.0560607  -1.3348378
 -0.0733905  -1.044068   -0.19040577 -1.309372   -0.5134216  -0.8083802
 -0.9226823  -0.82346004 -0.630479   -0.8292406  -0.30959445 -0.6063041
  0.47326967 -1.1096132   0.93647605 -1.3001195   1.1064268  -0.97503984
  0.99309564 -0.4860793   0.75378454 -0.3216847   0.3706675  -0.6359648
  0.44073683 -0.95372987  0.02045424 -1.0794158   0.15339833 -1.1014866
  0.10339521 -1.010531    0.25264156 -0.9992745   0.01791598 -0.4988723
  0.47676328 -0.42679846  0.46276245 -0.16447748  0.47393793 -0.5530521
  0.57042205 -0.43238965  0.198013   -0.4687996  -0.10174983 -0.5659673
  0.04667875 -0.5686789   0.12115318 -0.81296015  0.39833286 -0.2668084
  0.5142624  -0.7741534   0.30212298 -0.25643107  0.5739981  -0.5504623
  0.9862857  -0.1276196   0.9586745  -0.5532855   0.33066446 -0.7535795
 -0.4247169  -1.0487133  -0.3856089  -1.069695   -