## Data Preprocessing Method 1: Standardize Y and Z separately
For this method, we use two CSV files, YantData_padded.csv and ZantData_padded.csv, both zero-padded. Each CSV file has shape 9001x1024. We first combine them, then split the combined data into training and testing sets, and finally standardize the YantData columns and the ZantData columns separately (column-wise).

In [1]:
import scipy.io as sio
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# np.load on an .npz archive returns an NpzFile that keeps the underlying
# file open; the context manager closes it as soon as the array is read.
with np.load('data/large/large_YZant.npz') as npz_file:
    data = npz_file['arr_0']
data

array([[-3.10114443e+01, -3.10081719e+01, -3.10048994e+01, ...,
         2.89946535e+01,  3.09946529e+01, -7.88138739e-02],
       [-3.10441054e+01, -3.09350764e+01, -3.08260473e+01, ...,
         0.00000000e+00,  0.00000000e+00, -9.74076214e-01],
       [-3.10759402e+01, -3.10541240e+01, -3.10323078e+01, ...,
         0.00000000e+00,  0.00000000e+00, -9.13389276e-01],
       ...,
       [-3.06448456e+01, -2.97020521e+01, -2.87592586e+01, ...,
         0.00000000e+00,  0.00000000e+00, -3.17964143e+04],
       [-2.96620436e+01, -2.76716742e+01, -2.56813047e+01, ...,
         0.00000000e+00,  0.00000000e+00, -2.89042961e+04],
       [-3.10000000e+01, -3.10000000e+01, -3.10000000e+01, ...,
         0.00000000e+00,  0.00000000e+00, -3.08419570e+04]])

## Standardize Y and Z

## NPZ

In [4]:
# X is every column except the last; Y is the last column (presumably the
# cost/target value — confirm against how the .npz file was produced).
X, Y = data[:, :-1], data[:, -1]

# NOTE(review): test_size=0.7 holds out 70% of the rows for testing, whereas
# the CSV-based splits further down use 0.2 — confirm this is intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.7,
    random_state=50,
)

In [5]:
# Re-attach the target as a trailing column so each split is a single 2-D array.
training = np.hstack((X_train, Y_train[:, np.newaxis]))
testing = np.hstack((X_test, Y_test[:, np.newaxis]))

In [6]:
# Persist each split; the array is stored under the key 'data' in its archive.
np.savez('YZ_Large_training.npz', data=training)
np.savez('YZ_Large_testing.npz', data=testing)

In [8]:
# Read the three header-less CSV files and join them column-wise, so the
# cost values end up as the last column of the combined frame.
YantData = pd.read_csv('data/csv/YantData_padded.csv', header=None)
ZantData = pd.read_csv('data/csv/ZantData_padded.csv', header=None)
costData = pd.read_csv('data/csv/cost.csv', header=None)
YZ_Labeled_antData = pd.concat(
    (YantData, ZantData, costData),
    axis='columns',
)

In [9]:
# Features are all columns but the last; the last column is the cost target.
X, Y = YZ_Labeled_antData.iloc[:, :-1], YZ_Labeled_antData.iloc[:, -1]

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=50,
)

In [14]:
# The first 1024 feature columns are the Y-antenna block and the remaining
# columns are the Z-antenna block; each block gets its own scaler.
Yant_train = X_train.iloc[:, :1024]
Zant_train = X_train.iloc[:, 1024:]

# Fit on the training rows only. The fitted scalers are kept so the same
# transform can later be applied to the test data set.
Yant_scaler = StandardScaler()
scaled_Yant_train = pd.DataFrame(Yant_scaler.fit_transform(Yant_train))

Zant_scaler = StandardScaler()
scaled_Zant_train = pd.DataFrame(Zant_scaler.fit_transform(Zant_train))

# The scaler expects 2-D input, so the cost Series is reshaped to one column.
Y_train = Y_train.values.reshape(-1, 1)
cost_scaler = StandardScaler()
scaled_Cost_train = pd.DataFrame(cost_scaler.fit_transform(Y_train))

In [15]:
# Apply the scalers that were fitted on the training split to the test split;
# nothing is re-fitted here.
Y_test = Y_test.values.reshape(-1, 1)
scaled_Cost_test = pd.DataFrame(cost_scaler.transform(Y_test))

# Same column layout as for training: first 1024 columns are Y, the rest Z.
Yant_test = X_test.iloc[:, :1024]
Zant_test = X_test.iloc[:, 1024:]
scaled_Yant_test = pd.DataFrame(Yant_scaler.transform(Yant_test))
scaled_Zant_test = pd.DataFrame(Zant_scaler.transform(Zant_test))

In [17]:
# Join scaled Y, scaled Z and scaled cost column-wise for the training split.
scaled_train_combine = pd.concat(
    (scaled_Yant_train, scaled_Zant_train, scaled_Cost_train),
    axis='columns',
)

# Same layout for the testing split.
scaled_test_combine = pd.concat(
    (scaled_Yant_test, scaled_Zant_test, scaled_Cost_test),
    axis='columns',
)

In [18]:
# Write both splits to CSV (no header row, no index column) for later use.
# The suffix S denotes the method of separately standardizing Y and Z.
# NOTE(review): the test filename has a double underscore ("test__") while
# the train filename does not — confirm downstream readers expect this.
scaled_train_combine.to_csv(
    'YZantData_train_standardized_S.csv', header=False, index=False
)
scaled_test_combine.to_csv(
    'YZantData_test__standardized_S.csv', header=False, index=False
)

## Method 2: Standardize Y and Z together

In [20]:
# Reload the header-less Y, Z and cost CSV files and place them side by side;
# cost is the last column of the resulting frame.
YantData = pd.read_csv('data/csv/YantData_padded.csv', header=None)
ZantData = pd.read_csv('data/csv/ZantData_padded.csv', header=None)
costData = pd.read_csv('data/csv/cost.csv', header=None)
YZ_Labeled_antData = pd.concat(
    (YantData, ZantData, costData),
    axis='columns',
)

In [21]:
# Everything except the final column is a feature; the final column is the target.
X, Y = YZ_Labeled_antData.iloc[:, :-1], YZ_Labeled_antData.iloc[:, -1]

# Hold out 20% of the rows for testing, seeded for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=50,
)

In [22]:
# One scaler is fitted over all feature columns, so the Y and Z antenna
# data stay together.
YZant_scaler = StandardScaler()
scaled_YZant_train = pd.DataFrame(YZant_scaler.fit_transform(X_train))

# The scaler expects 2-D input, so the cost Series is reshaped to one column.
Y_train = Y_train.values.reshape(-1, 1)
cost_scaler = StandardScaler()
scaled_Cost_train = pd.DataFrame(cost_scaler.fit_transform(Y_train))

In [23]:
# Transform the test split with the scalers fitted on the training split.
Y_test = Y_test.values.reshape(-1, 1)
scaled_Cost_test = pd.DataFrame(cost_scaler.transform(Y_test))
scaled_YZant_test = pd.DataFrame(YZant_scaler.transform(X_test))

In [24]:
# Append the scaled cost as the last column of each scaled feature frame.
scaled_train_combine = pd.concat(
    (scaled_YZant_train, scaled_Cost_train), axis='columns'
)
scaled_test_combine = pd.concat(
    (scaled_YZant_test, scaled_Cost_test), axis='columns'
)

In [25]:
# Write both splits to CSV without header row or index column.
# The suffix T presumably marks the "standardize Y and Z together" method.
scaled_train_combine.to_csv(
    'YZantData_train_standardized_T.csv', header=False, index=False
)
scaled_test_combine.to_csv(
    'YZantData_test__standardized_T.csv', header=False, index=False
)

## Method 3: No standardizing

In [28]:
# Load the header-less Y, Z and cost CSV files and combine them column-wise;
# cost is the last column.
YantData = pd.read_csv('data/csv/YantData_padded.csv', header=None)
ZantData = pd.read_csv('data/csv/ZantData_padded.csv', header=None)
costData = pd.read_csv('data/csv/cost.csv', header=None)
YZ_Labeled_antData = pd.concat(
    (YantData, ZantData, costData),
    axis='columns',
)

In [29]:
# Split off the last column as the target and keep the rest as features.
X, Y = YZ_Labeled_antData.iloc[:, :-1], YZ_Labeled_antData.iloc[:, -1]

# Seeded 80/20 train/test split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=50,
)

In [30]:
# No scaling in this method: wrap the feature splits as DataFrames and
# re-attach the matching target as the last column (rows align by index).
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

YZantData_train = pd.concat((X_train, Y_train), axis='columns')
YZantData_test = pd.concat((X_test, Y_test), axis='columns')

In [34]:
# Write the unscaled splits to CSV without header row or index column.
YZantData_train.to_csv('YZantData_train.csv', header=False, index=False)
YZantData_test.to_csv('YZantData_test.csv', header=False, index=False)

## YZ-paired

In [11]:
# Load data and concatenate the data with the target
YantData = pd.read_csv('data/csv/YantData_padded.csv', header=None)
ZantData = pd.read_csv('data/csv/ZantData_padded.csv', header=None)
costData = pd.read_csv('data/csv/cost.csv', header=None)
# Column order is Y block, Z block, then cost as the last column.
YZ_Labeled_antData = pd.concat(
    (YantData, ZantData, costData),
    axis='columns',
)

In [12]:
# The last column is the target; all preceding columns are features.
X, Y = YZ_Labeled_antData.iloc[:, :-1], YZ_Labeled_antData.iloc[:, -1]

# Reproducible split with 20% of the rows held out for testing.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=50,
)

In [13]:
# Split the features into the Y-antenna block (first 1024 columns) and the
# Z-antenna block (remaining columns) and standardize each block separately.
Yant_train = X_train.iloc[:, :1024]
Zant_train = X_train.iloc[:, 1024:]

# Each scaler is fitted on training rows only and kept for the test data set.
Yant_scaler = StandardScaler()
scaled_Yant_train = pd.DataFrame(Yant_scaler.fit_transform(Yant_train))

Zant_scaler = StandardScaler()
scaled_Zant_train = pd.DataFrame(Zant_scaler.fit_transform(Zant_train))

# Reshape the cost Series to a single column before scaling it.
Y_train = Y_train.values.reshape(-1, 1)
cost_scaler = StandardScaler()
scaled_Cost_train = pd.DataFrame(cost_scaler.fit_transform(Y_train))

In [14]:
# Scale the test split with the training-fitted scalers (transform only).
Y_test = Y_test.values.reshape(-1, 1)
scaled_Cost_test = pd.DataFrame(cost_scaler.transform(Y_test))

# Same Y/Z column layout as the training split.
Yant_test = X_test.iloc[:, :1024]
Zant_test = X_test.iloc[:, 1024:]
scaled_Yant_test = pd.DataFrame(Yant_scaler.transform(Yant_test))
scaled_Zant_test = pd.DataFrame(Zant_scaler.transform(Zant_test))

In [46]:
def zigzag_combine(A, B):
    """Interleave the columns of two equally shaped 2-D arrays.

    The result has columns ordered ``A[:, 0], B[:, 0], A[:, 1], B[:, 1], ...``
    so each column of ``A`` is immediately followed by the matching column
    of ``B``.

    Args:
        A: 2-D array-like of shape ``(rows, features)``.
        B: 2-D array-like with the same shape as ``A``.

    Returns:
        A new ``numpy.ndarray`` of shape ``(rows, 2 * features)``.

    Raises:
        ValueError: If either input is not 2-D or the shapes differ.
    """
    A = np.asarray(A)
    B = np.asarray(B)
    if A.ndim != 2 or B.ndim != 2:
        raise ValueError(
            f"zigzag_combine expects 2-D inputs, got {A.ndim}-D and {B.ndim}-D"
        )
    # Without this check, extra columns in B would be dropped silently.
    if A.shape != B.shape:
        raise ValueError(
            f"A and B must have the same shape, got {A.shape} and {B.shape}"
        )

    rows, features = A.shape
    # Stacking on a new last axis pairs A[:, i] with B[:, i]; flattening the
    # last two axes then yields the interleaved column order in one pass,
    # instead of growing the result with one hstack per feature.
    return np.stack((A, B), axis=2).reshape(rows, 2 * features)


In [50]:
# Interleave the scaled Y and Z columns (Y0, Z0, Y1, Z1, ...) for each split.
finalized_train = pd.DataFrame(
    zigzag_combine(scaled_Yant_train.values, scaled_Zant_train.values)
)
finalized_test = pd.DataFrame(
    zigzag_combine(scaled_Yant_test.values, scaled_Zant_test.values)
)

In [52]:
# Append the scaled cost as the last column of each interleaved frame.
YZ_pair_Train = pd.concat((finalized_train, scaled_Cost_train), axis='columns')
YZ_pair_Test = pd.concat((finalized_test, scaled_Cost_test), axis='columns')

In [53]:
# Write the paired splits to CSV without header row or index column.
YZ_pair_Train.to_csv('YZ_pair_Train.csv', header=False, index=False)
YZ_pair_Test.to_csv('YZ_pair_Test.csv', header=False, index=False)