In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.io

# Loading .mat data

In [3]:
# Read the MATLAB file into a dict and list the variable names it contains.
path = "../data/water_dataset.mat"
mat_data = scipy.io.loadmat(file_name=path)
print(mat_data.keys())

dict_keys(['__header__', '__version__', '__globals__', 'X_tr', 'X_te', 'Y_tr', 'Y_te', 'location_group', 'features', 'location_ids'])


# Converting to DataFrame

In [None]:
# Copy every entry of the loaded .mat dict except those whose key starts
# with a double underscore (e.g. '__header__', '__version__', '__globals__').
data_dict = {
    name: value
    for name, value in mat_data.items()
    if not name.startswith('__')
}

In [5]:
# Stack the entries held in 'X_tr' into a single 3-D array, then collapse the
# two leading axes so that every row is one feature vector.
# NOTE(review): the leading axes are presumably (time step, location) — TODO confirm.
X_train = data_dict['X_tr'].squeeze()
X_train = np.stack(X_train, axis=0)
print(X_train.shape)
# Take the feature count from the data rather than hard-coding 11, so the
# cell keeps working if the number of features in the file changes.
X_train = X_train.reshape(-1, X_train.shape[-1])
print(X_train.shape)

(423, 37, 11)
(15651, 11)


In [6]:
# Stack the entries held in 'X_te' into a single 3-D array, then collapse the
# two leading axes so that every row is one feature vector.
# NOTE(review): the leading axes are presumably (time step, location) — TODO confirm.
X_test = data_dict['X_te'].squeeze()
X_test = np.stack(X_test, axis=0)
print(X_test.shape)
# Take the feature count from the data rather than hard-coding 11, so the
# cell keeps working if the number of features in the file changes.
X_test = X_test.reshape(-1, X_test.shape[-1])
print(X_test.shape)

(282, 37, 11)
(10434, 11)


In [7]:
# Flatten the transposed training targets into a 1-D vector.
# NOTE(review): this assumes the row-major order of Y_tr.T matches the row
# order produced by reshaping X_train — TODO confirm against the dataset layout.
y_train = data_dict['Y_tr'].T.ravel()
print(y_train.shape)

(15651,)


In [8]:
# Flatten the transposed test targets into a 1-D vector.
# NOTE(review): this assumes the row-major order of Y_te.T matches the row
# order produced by reshaping X_test — TODO confirm against the dataset layout.
y_test = data_dict['Y_te'].T.ravel()
print(y_test.shape)

(10434,)


In [9]:
# Append the test split after the train split, for both features and targets.
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test))

print(X.shape)
print(y.shape)

(26085, 11)
(26085,)


In [13]:
# Attach the 1-D target vector to the feature matrix as its last column.
data_merged = np.column_stack((X, y))
print(data_merged.shape)

(26085, 12)


In [14]:
# Unwrap the nested 'features' entry into a flat array of names, report its
# shape, and keep the names as a plain list for display.
feature_cells = data_dict['features'].squeeze()
feature_names = np.stack(feature_cells, axis=0).squeeze()
print(feature_names.shape)
features = list(feature_names)
features

(11,)


['Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)',
 'pH, water, unfiltered, field, standard units (Maximum)',
 'pH, water, unfiltered, field, standard units (Minimum)',
 'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)',
 'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)',
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum)',
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Mean)',
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum)',
 'Temperature, water, degrees Celsius (Mean)',
 'Temperature, water, degrees Celsius (Minimum)',
 'Temperature, water, degrees Celsius (Maximum)']

In [15]:
# Shortened column labels: one per entry of `features`, in the same order,
# followed by a label for the target column.
# NOTE(review): in the saved df.head() preview, the column labelled
# "pH, standard units (Minimum)" holds values of the same magnitude as the
# specific-conductance (Maximum)/(Minimum) columns, while
# "Specific conductance (Mean)" does not — verify that this label order
# really matches the column order of X.
_short_feature_labels = (
    "Specific conductance (Maximum)",
    "pH, standard units (Maximum)",
    "pH, standard units (Minimum)",
    "Specific conductance (Minimum)",
    "Specific conductance (Mean)",
    "Dissolved oxygen, milligrams per liter (Maximum)",
    "Dissolved oxygen, milligrams per liter (Mean)",
    "Dissolved oxygen, milligrams per liter (Minimum)",
    "Temperature, degrees Celsius (Mean)",
    "Temperature, degrees Celsius (Minimum)",
    "Temperature, degrees Celsius (Maximum)",
)
features_short = [*_short_feature_labels, "Target"]

In [16]:
# Wrap the merged array in a DataFrame labelled with the shortened names,
# then preview the first rows.
df = pd.DataFrame(data=data_merged, columns=features_short)
df.head()

Unnamed: 0,Specific conductance (Maximum),"pH, standard units (Maximum)","pH, standard units (Minimum)",Specific conductance (Minimum),Specific conductance (Mean),"Dissolved oxygen, milligrams per liter (Maximum)","Dissolved oxygen, milligrams per liter (Mean)","Dissolved oxygen, milligrams per liter (Minimum)","Temperature, degrees Celsius (Mean)","Temperature, degrees Celsius (Minimum)","Temperature, degrees Celsius (Maximum)",Target
0,0.001131,0.884615,0.00112,0.001113,0.677632,0.841463,0.765152,0.787402,0.29375,0.298077,0.276163,0.648148
1,0.00117,0.871795,0.001159,0.001152,0.703947,0.829268,0.772727,0.795276,0.29375,0.301282,0.276163,0.648148
2,0.001326,0.884615,0.001198,0.00125,0.677632,0.853659,0.75,0.755906,0.3,0.298077,0.287791,0.648148
3,0.014094,0.858974,0.001238,0.003926,0.697368,0.829268,0.772727,0.771654,0.296875,0.294872,0.27907,0.638889
4,0.088109,0.858974,0.010766,0.029297,0.684211,0.853659,0.765152,0.755906,0.296875,0.291667,0.281977,0.648148


In [24]:
# NOTE(review): the saved output of this cell reports count = 5000 for every
# column, although the frame has 26085 rows until the later cell that truncates
# df to its first 5000 rows. The execution counter (24) is also higher than
# that cell's (21), so this output appears to come from an out-of-order run;
# on a fresh top-to-bottom run it would summarise all rows.
df.describe()

Unnamed: 0,Specific conductance (Maximum),"pH, standard units (Maximum)","pH, standard units (Minimum)",Specific conductance (Minimum),Specific conductance (Mean),"Dissolved oxygen, milligrams per liter (Maximum)","Dissolved oxygen, milligrams per liter (Mean)","Dissolved oxygen, milligrams per liter (Minimum)","Temperature, degrees Celsius (Mean)","Temperature, degrees Celsius (Minimum)","Temperature, degrees Celsius (Maximum)",Target
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,0.042063,0.885954,0.019912,0.027874,0.595538,0.85428,0.643294,0.623628,0.516284,0.491627,0.519024,0.661537
std,0.11965,0.033377,0.091304,0.099606,0.103284,0.030586,0.12178,0.140951,0.158054,0.164508,0.148592,0.029496
min,0.000526,0.705128,0.000432,0.000508,0.236842,0.768293,0.212121,0.125984,0.1125,0.067308,0.145349,0.592593
25%,0.001852,0.871795,0.001513,0.001699,0.526316,0.829268,0.568182,0.551181,0.4,0.36859,0.409884,0.638889
50%,0.002515,0.884615,0.002063,0.002344,0.592105,0.853659,0.643939,0.629921,0.528125,0.503205,0.531977,0.657407
75%,0.004873,0.910256,0.002986,0.004063,0.671053,0.878049,0.727273,0.716535,0.628125,0.608974,0.627907,0.675926
max,0.814815,0.974359,0.768173,0.78125,0.953947,0.97561,0.939394,0.929134,0.94375,0.929487,0.953488,0.962963


In [20]:
# Show (rows, columns) of the frame before it is truncated in the next cell.
df.shape

(26085, 12)

In [21]:
# Number of leading rows to keep for the rest of the notebook.
N_ROWS = 5000
# Take an explicit copy: rebinding `df` to a bare iloc slice can raise
# SettingWithCopyWarning when columns are assigned on it later.
df = df.iloc[:N_ROWS].copy()
df.shape

(5000, 12)

# Save to CSV

In [None]:
# The export is currently commented out; uncomment the line below to write the frame to disk.
# Note that to_csv writes the row index as an extra leading column by default;
# pass index=False if that column is not wanted.
#df.to_csv('../data/water_dataset_reshaped.csv')