In [18]:
import pandas as pd
import numpy as np

def clean_data(file_path, random_state=None):
    """
    Clean and preprocess the data.

    Loads the CSV, shuffles its rows, splits off the 'label' column,
    z-score normalizes every remaining column and ordinal-encodes the labels.

    Parameters:
    - file_path (str): Path to the CSV file containing the data. The file
      must contain a 'label' column; all other columns are used as features.
    - random_state (int, optional): Seed forwarded to the row shuffle. The
      default (None) keeps the shuffle unseeded.

    Returns:
    - x (pd.DataFrame): Normalized feature data.
    - y (pd.Series): Encoded labels.

    Notes:
    - Label codes are assigned in sorted label order, so the mapping does not
      depend on the shuffled row order; separate calls on files that contain
      the same set of labels produce the same codes.
    - Normalization uses the mean and standard deviation of the file that is
      being loaded, so each file is scaled by its own statistics.
    """
    # Load the data
    df = pd.read_csv(file_path)

    # Shuffle the rows (reproducible when random_state is given)
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Separate the features and the labels
    y = df['label']
    x = df.drop(['label'], axis=1)

    # Normalize the feature data. A constant column has std == 0 (and a
    # single-row file has std == NaN); dividing by that would fill the column
    # with NaN, so such columns are divided by 1 and end up as zeros.
    mean = x.mean()
    std = x.std()
    std = std.where(std > 0, 1.0)
    x = (x - mean) / std

    # Ordinal encoding for the label. Sorting makes the label -> code mapping
    # independent of the order in which labels appear after shuffling.
    unique_labels = y.unique()
    try:
        ordered_labels = sorted(unique_labels)
    except TypeError:
        # Mixed, mutually unorderable label types: fall back to a
        # deterministic order based on the string form.
        ordered_labels = sorted(unique_labels, key=str)
    label_dict = {label: code for code, label in enumerate(ordered_labels)}
    y = y.map(label_dict)

    return x, y

# Example usage: preprocess the train and test CSV files
train_path = 'train_data_final.csv'
test_path = 'test_data_final.csv'

x_train, y_train = clean_data(train_path)
x_test, y_test = clean_data(test_path)

# Preview the first rows of each result
for preview in (x_train, y_train, x_test, y_test):
    print(preview.head())

# Convert to NumPy arrays: float32 features, unsigned-integer labels
x_train, x_test = (
    features.to_numpy().astype(np.float32) for features in (x_train, x_test)
)
y_train, y_test = (
    labels.to_numpy().astype(np.uint) for labels in (y_train, y_test)
)

# Save as npy files
arrays_by_file = {
    'x_train.npy': x_train,
    'y_train.npy': y_train,
    'x_test.npy': x_test,
    'y_test.npy': y_test,
}
for npy_name, array in arrays_by_file.items():
    np.save(npy_name, array)

   rmse_mean  rmse_std  spectral_centroid_mean  spectral_centroid_std  \
0  -0.421358 -1.727188               -1.383174              -1.958137   
1   0.656922  0.206084               -0.554160               0.851850   
2  -0.884762  0.291790                0.463550               1.130857   
3   0.850675 -0.000297               -1.346648               0.849105   
4  -0.964743 -0.397042               -1.943950              -0.361052   

   spectral_bandwidth_mean  spectral_bandwidth_std  rolloff_mean  rolloff_std  \
0                -1.398352               -1.459741     -1.567978    -1.617502   
1                 0.088505                1.582472     -0.328791     1.660876   
2                 0.345553                0.288469      0.398264     0.635532   
3                -1.016303                1.865452     -1.326038     1.611944   
4                -2.801442                0.649098     -2.147615    -0.097324   

   zero_crossing_rate_mean  zero_crossing_rate_std  ...  tonnetz4_mean  \


In [16]:
# Reload the saved training arrays and print shape, first entry and dtype
x_train = np.load('x_train.npy')
for x_summary in (x_train.shape, x_train[0], x_train.dtype):
    print(x_summary)

y_train = np.load('y_train.npy')
for y_summary in (y_train.shape, y_train[0], y_train.dtype):
    print(y_summary)

(32000, 130)
[-0.67169785 -1.4710041  -1.6788975  -1.8285253  -2.0823963  -1.2661979
 -1.8117235  -1.6022962  -0.7763812  -1.345163   -0.3725419  -1.7207499
  2.2961347  -1.3953819  -1.8297907  -1.5376248   0.5815921  -1.4731673
 -0.05173915 -1.4999644   0.6055949  -1.5274329  -0.8847783  -1.2293793
 -0.7630958  -1.4053953  -0.88005    -1.0974672  -0.73397076 -1.2898788
 -0.9087216  -1.3460696  -0.49065322 -1.3329337  -0.69800794 -1.3977901
 -0.70201427 -1.3737736  -1.1017649  -1.0943662  -0.77564496 -1.2481341
 -0.950419   -1.4649851  -0.858817   -1.3345835  -1.0477504  -1.6132873
 -1.3276291  -1.3775139  -1.532309   -1.4446559  -1.3301272  -0.95007473
 -1.3632712  -1.4945269  -0.68381345 -0.76761985  0.22931328 -1.3598671
  0.11492222 -1.0210652  -1.3722152  -1.2689304  -1.7484659  -1.3355434
 -1.2843469  -0.88452727 -0.8303076  -1.3207625   0.07581349 -0.9651243
  1.356685   -1.0048246   1.0659204  -0.97540593 -1.5958729  -0.8826844
 -2.6410246  -1.1602156  -0.8873808  -1.0441631  -