In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.io

# Loading .mat data

In [20]:
# Location of the MATLAB file, given relative to the notebook's working directory.
path = "../data/water_dataset.mat"
# loadmat returns a dict: MATLAB variable names map to arrays, alongside
# '__'-prefixed metadata entries (see the printed keys).
mat_data = scipy.io.loadmat(path)
# List the available variables so the cells below know which keys to read.
print(mat_data.keys())

dict_keys(['__header__', '__version__', '__globals__', 'X_tr', 'X_te', 'Y_tr', 'Y_te', 'location_group', 'features', 'location_ids'])


# Converting to DataFrame

In [21]:
# Collect only the real MATLAB variables. Entries whose name starts with a
# double underscore ('__header__', '__version__', '__globals__') are metadata,
# not data, and are left out.
data_dict = {}
for key, value in mat_data.items():
    if not key.startswith('__'):
        data_dict[key] = value

In [22]:
# 'X_tr' presumably arrives as a MATLAB cell array (an object array with a
# singleton dimension) holding one 2-D block per entry — TODO confirm.
# squeeze() drops the singleton axis so np.stack can iterate over the blocks.
X_train = data_dict['X_tr'].squeeze()
# Stack the per-entry blocks into a single 3-D array (recorded run: (423, 37, 11)).
X_train = np.stack(X_train, axis=0)
print(X_train.shape)
# Merge the two leading axes into rows, one row per (entry, block-row) pair.
# The column count is read from the data rather than hard-coded, so a file
# with a different number of features cannot be silently mis-shaped.
X_train = X_train.reshape(-1, X_train.shape[-1])
print(X_train.shape)

(423, 37, 11)
(15651, 11)


In [23]:
# 'X_te' presumably arrives as a MATLAB cell array (an object array with a
# singleton dimension) holding one 2-D block per entry — TODO confirm.
# squeeze() drops the singleton axis so np.stack can iterate over the blocks.
X_test = data_dict['X_te'].squeeze()
# Stack the per-entry blocks into a single 3-D array (recorded run: (282, 37, 11)).
X_test = np.stack(X_test, axis=0)
print(X_test.shape)
# Merge the two leading axes into rows, one row per (entry, block-row) pair.
# The column count is read from the data rather than hard-coded, so a file
# with a different number of features cannot be silently mis-shaped.
X_test = X_test.reshape(-1, X_test.shape[-1])
print(X_test.shape)

(282, 37, 11)
(10434, 11)


In [24]:
# Flatten the training targets into a 1-D vector.
# NOTE(review): the transpose presumably reorders 'Y_tr' so that the flattened
# targets line up row-for-row with the reshaped X_train — confirm against the
# stored layout of 'Y_tr'.
y_train = data_dict['Y_tr'].T.reshape(-1)
print(y_train.shape)

(15651,)


In [25]:
# Flatten the test targets into a 1-D vector.
# NOTE(review): the transpose presumably reorders 'Y_te' so that the flattened
# targets line up row-for-row with the reshaped X_test — confirm against the
# stored layout of 'Y_te'.
y_test = data_dict['Y_te'].T.reshape(-1)
print(y_test.shape)

(10434,)


In [26]:
# Append the test split after the train split to get one combined data set.
# X_train/X_test are 2-D and y_train/y_test are 1-D at this point, so joining
# along axis 0 appends rows of features and entries of targets respectively.
X = np.concatenate([X_train, X_test], axis=0)
y = np.concatenate([y_train, y_test], axis=0)

print(X.shape)
print(y.shape)

(26085, 11)
(26085,)


In [27]:
# Unwrap the nested 'features' array into a flat array of names: squeeze the
# outer container, stack its entries, then squeeze away the leftover
# singleton dimensions.
features = np.stack(data_dict['features'].squeeze(), axis=0).squeeze()
print(features.shape)
# Convert to a plain list and display the full feature names.
features = list(features)
features

(11,)


['Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)',
 'pH, water, unfiltered, field, standard units (Maximum)',
 'pH, water, unfiltered, field, standard units (Minimum)',
 'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)',
 'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)',
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum)',
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Mean)',
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum)',
 'Temperature, water, degrees Celsius (Mean)',
 'Temperature, water, degrees Celsius (Minimum)',
 'Temperature, water, degrees Celsius (Maximum)']

In [28]:
# Compact column labels, one per entry of `features` and in the same order.
# Each label is "<quantity> (<statistic>)"; the pairs below spell that out.
features_short = [
    f"{quantity} ({statistic})"
    for quantity, statistic in (
        ("Specific conductance", "Maximum"),
        ("pH, standard units", "Maximum"),
        ("pH, standard units", "Minimum"),
        ("Specific conductance", "Minimum"),
        ("Specific conductance", "Mean"),
        ("Dissolved oxygen, milligrams per liter", "Maximum"),
        ("Dissolved oxygen, milligrams per liter", "Mean"),
        ("Dissolved oxygen, milligrams per liter", "Minimum"),
        ("Temperature, degrees Celsius", "Mean"),
        ("Temperature, degrees Celsius", "Minimum"),
        ("Temperature, degrees Celsius", "Maximum"),
    )
]

In [29]:
# Wrap the combined feature matrix in a DataFrame labelled with the short
# column names, then preview the first five rows.
df = pd.DataFrame(data=X, columns=features_short)
df.head(n=5)

Unnamed: 0,Specific conductance (Maximum),"pH, standard units (Maximum)","pH, standard units (Minimum)",Specific conductance (Minimum),Specific conductance (Mean),"Dissolved oxygen, milligrams per liter (Maximum)","Dissolved oxygen, milligrams per liter (Mean)","Dissolved oxygen, milligrams per liter (Minimum)","Temperature, degrees Celsius (Mean)","Temperature, degrees Celsius (Minimum)","Temperature, degrees Celsius (Maximum)"
0,0.001131,0.884615,0.00112,0.001113,0.677632,0.841463,0.765152,0.787402,0.29375,0.298077,0.276163
1,0.00117,0.871795,0.001159,0.001152,0.703947,0.829268,0.772727,0.795276,0.29375,0.301282,0.276163
2,0.001326,0.884615,0.001198,0.00125,0.677632,0.853659,0.75,0.755906,0.3,0.298077,0.287791
3,0.014094,0.858974,0.001238,0.003926,0.697368,0.829268,0.772727,0.771654,0.296875,0.294872,0.27907
4,0.088109,0.858974,0.010766,0.029297,0.684211,0.853659,0.765152,0.755906,0.296875,0.291667,0.281977


In [30]:
# Optional subsampling: keep only the first 5000 rows of the frame.
# NOTE(review): these lines are disabled, yet the output recorded for this
# cell is "(5000, 11)", so it appears to be left over from a run in which
# they were active. If that kernel state was kept, the cells below operated
# on the 5000-row slice; Restart & Run All to confirm which one is intended.
#df = df.iloc[:5000]
#df.shape

(5000, 11)

# Save to CSV

In [31]:
# Export is disabled; uncomment to write the reshaped frame into the same
# ../data directory the .mat file was read from.
# NOTE(review): DataFrame.to_csv also writes the row index as an unnamed first
# column unless index=False is passed — confirm which form readers expect.
#df.to_csv('../data/water_dataset_reshaped.csv')

In [32]:
# Sanity check: show the dtype pandas assigned to each column.
df.dtypes

Specific conductance (Maximum)                      float64
pH, standard units (Maximum)                        float64
pH, standard units (Minimum)                        float64
Specific conductance (Minimum)                      float64
Specific conductance (Mean)                         float64
Dissolved oxygen, milligrams per liter (Maximum)    float64
Dissolved oxygen, milligrams per liter (Mean)       float64
Dissolved oxygen, milligrams per liter (Minimum)    float64
Temperature, degrees Celsius (Mean)                 float64
Temperature, degrees Celsius (Minimum)              float64
Temperature, degrees Celsius (Maximum)              float64
dtype: object