In [1]:
import pandas as pd
import numpy as np

def clean_data(file_path, random_state=None):
    """
    Clean and preprocess the data.

    Loads the CSV, shuffles its rows, z-score normalizes every feature
    column and ordinal-encodes the 'label' column.

    Parameters:
    - file_path (str): Path to the CSV file containing the data. The file
      must contain a 'label' column; all other columns are treated as
      features.
    - random_state (int | None): Optional seed forwarded to
      `DataFrame.sample` so the shuffle is reproducible. The default
      (None) keeps the original unseeded behaviour.

    Returns:
    - x (pd.DataFrame): Normalized feature data.
    - y (pd.Series): Encoded labels.
    """
    # Load the data
    df = pd.read_csv(file_path)

    # Shuffle the rows (reproducibly when a seed is given)
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Separate the features and the labels
    y = df['label']
    x = df.drop(['label'], axis=1)

    # Normalize the feature data. A constant column has zero standard
    # deviation; dividing by it would fill the column with NaN, so use 1
    # instead, which leaves the (already zero-centred) column at 0.
    feature_std = x.std().replace(0, 1)
    x = (x - x.mean()) / feature_std
    print(x.iloc[0])

    # Ordinal encoding for the label. The codes are assigned in sorted
    # label order so they do not depend on the shuffle: two files with the
    # same label set (e.g. train and test) get the same mapping.
    try:
        unique_labels = sorted(y.unique())
    except TypeError:
        # Labels that cannot be compared with each other (mixed types):
        # fall back to first-appearance order.
        unique_labels = list(y.unique())
    label_dict = {label: code for code, label in enumerate(unique_labels)}
    y = y.map(label_dict)

    return x, y

# Example usage: preprocess both splits, preview them, then persist as .npy.
train_path = 'train_data_final.csv'
test_path = 'test_data_final.csv'

x_train, y_train = clean_data(train_path)
x_test, y_test = clean_data(test_path)

# Preview the first rows of each result, one after another.
print(x_train.head(), y_train.head(), x_test.head(), y_test.head(), sep='\n')

# Convert to C-ordered NumPy arrays: float32 features, unsigned-int labels.
x_train, x_test = (
    features.to_numpy().astype(np.float32, order='C')
    for features in (x_train, x_test)
)
y_train, y_test = (
    labels.to_numpy().astype(np.uint, order='C')
    for labels in (y_train, y_test)
)

# Save as npy files
np.save('x_train.npy', x_train)
np.save('y_train.npy', y_train)
np.save('x_test.npy', x_test)
np.save('y_test.npy', y_test)

rmse_mean                  0.781981
rmse_std                   0.478631
spectral_centroid_mean     0.765407
spectral_centroid_std      0.052892
spectral_bandwidth_mean    0.815995
                             ...   
tonnetz6_std               1.127270
chroma_cqt_mean           -1.451831
chroma_cqt_std             0.319245
spectral_contrast_mean    -0.253118
spectral_contrast_std     -0.156484
Name: 0, Length: 130, dtype: float64
rmse_mean                 -0.295700
rmse_std                   1.596307
spectral_centroid_mean     0.252214
spectral_centroid_std      0.903036
spectral_bandwidth_mean   -0.127028
                             ...   
tonnetz6_std               0.237541
chroma_cqt_mean           -1.500175
chroma_cqt_std             0.605482
spectral_contrast_mean     0.417791
spectral_contrast_std      0.938213
Name: 0, Length: 130, dtype: float64
   rmse_mean  rmse_std  spectral_centroid_mean  spectral_centroid_std  \
0   0.781981  0.478631                0.765407               

In [2]:
# Reload the saved training arrays and report shape, first entry and dtype.
x_train = np.load('x_train.npy')
print(x_train.shape, x_train[0], x_train.dtype, sep='\n')

y_train = np.load('y_train.npy')
print(y_train.shape, y_train[0], y_train.dtype, sep='\n')

(32000, 130)
[ 0.78198105  0.4786313   0.7654065   0.05289226  0.815995   -0.4903428
  0.85992265 -0.38888046  0.33683458  0.6005894   0.7482788   0.03575227
 -0.79637736 -0.0155991  -0.22231244  0.4897355  -1.0421329   1.6309216
 -0.43924093  1.6189176  -0.40740597  0.714283   -0.06757775  0.37893268
 -0.4006397   0.89765775  0.4860978   1.3472273   0.6431448   2.7630956
  0.99903077  2.07335     0.66787297  2.2133605   0.8526972   1.7424986
  1.2077185   1.5831676   0.40530562  0.49392974  0.45755413  0.13919334
  0.8956567   0.4772323   1.7627006   0.0092548   2.1227484   0.88276154
  1.4346887   0.763561    0.9351623   1.0000918   0.952556    1.3956966
  1.0203601   0.51999074  1.0056168  -0.08257153  0.601611    0.31520164
  0.03134469  0.14307106  0.27922565  0.2440321   0.20253976  0.29074228
  0.4425657   0.09714236  1.023957   -0.05920232  0.55985785 -0.42592892
  0.80707586 -0.59991336  0.81369597 -0.13344465  0.8622315   0.03539791
  1.2062345   0.7327044   1.5402439   0.927

: 