In [2]:
import numpy as np
import pandas as pd
import scipy.io

# Loading .mat data

In [3]:
# Location of the MATLAB file, relative to the notebook's working directory.
path = "../data/water_dataset.mat"

# loadmat returns a dict keyed by variable name; printing the keys shows
# which arrays are available (plus the '__'-prefixed metadata entries).
mat_data = scipy.io.loadmat(file_name=path)
print(mat_data.keys())

dict_keys(['__header__', '__version__', '__globals__', 'X_tr', 'X_te', 'Y_tr', 'Y_te', 'location_group', 'features', 'location_ids'])


# Converting to DataFrame

In [4]:
# Keep only the real variables: entries whose key starts with '__'
# (e.g. '__header__', '__version__', '__globals__') are dropped.
data_dict = {
    name: array
    for name, array in mat_data.items()
    if not name.startswith('__')
}

In [5]:
# 'X_tr' holds a sequence of equally shaped 2-D blocks; stacking them along a new
# leading axis gives one dense 3-D array (the recorded run shows (423, 37, 11)).
# NOTE(review): axes are presumably (time step, location, feature) -- confirm.
X_train = np.stack(data_dict['X_tr'].squeeze(), axis=0)
print(X_train.shape)

# Collapse the leading axes into rows, one feature vector per row. The column
# count is read from the array instead of being hard-coded, so a dataset with a
# different number of features cannot be silently split across rows.
X_train = X_train.reshape(-1, X_train.shape[-1])
print(X_train.shape)

(423, 37, 11)
(15651, 11)


In [6]:
# 'X_te' holds a sequence of equally shaped 2-D blocks; stacking them along a new
# leading axis gives one dense 3-D array (the recorded run shows (282, 37, 11)).
# NOTE(review): axes are presumably (time step, location, feature) -- confirm.
X_test = np.stack(data_dict['X_te'].squeeze(), axis=0)
print(X_test.shape)

# Collapse the leading axes into rows, one feature vector per row. The column
# count is read from the array instead of being hard-coded, so a dataset with a
# different number of features cannot be silently split across rows.
X_test = X_test.reshape(-1, X_test.shape[-1])
print(X_test.shape)

(282, 37, 11)
(10434, 11)


In [7]:
# Transpose first, then flatten to 1-D, so the flattened order follows the
# transposed layout of 'Y_tr'.
# NOTE(review): presumably this makes targets line up row-for-row with the
# flattened X_train -- confirm against the layout of Y_tr.
y_train = data_dict['Y_tr'].T.reshape(-1)
print(y_train.shape)

(15651,)


In [8]:
# Transpose first, then flatten to 1-D, so the flattened order follows the
# transposed layout of 'Y_te'.
# NOTE(review): presumably this makes targets line up row-for-row with the
# flattened X_test -- confirm against the layout of Y_te.
y_test = data_dict['Y_te'].T.reshape(-1)
print(y_test.shape)

(10434,)


In [9]:
# Put the train and test portions back together: feature rows are appended
# along axis 0, and the 1-D target vectors are joined end to end.
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

print(X.shape)
print(y.shape)

(26085, 11)
(26085,)


In [10]:
# Append the target vector as one extra column to the right of the features.
data_merged = np.column_stack((X, y))
print(data_merged.shape)

(26085, 12)


In [11]:
# The names are stored nested (presumably one small array per name -- confirm),
# so squeeze, stack, and squeeze again to obtain a flat 1-D array of names.
features = np.stack(data_dict['features'].squeeze(), axis=0).squeeze()
print(features.shape)

# Turn the array into a plain list and show it as the cell's output.
features = [*features]
features

(11,)


['Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)',
 'pH, water, unfiltered, field, standard units (Maximum)',
 'pH, water, unfiltered, field, standard units (Minimum)',
 'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)',
 'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)',
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum)',
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Mean)',
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum)',
 'Temperature, water, degrees Celsius (Mean)',
 'Temperature, water, degrees Celsius (Minimum)',
 'Temperature, water, degrees Celsius (Maximum)']

In [12]:
# Shortened column labels, in the same order as the full names in `features`;
# the label for the target column is appended last.
features_short = [
    "Specific conductance (Maximum)",
    "pH, standard units (Maximum)",
    "pH, standard units (Minimum)",
    "Specific conductance (Minimum)",
    "Specific conductance (Mean)",
    "Dissolved oxygen (Maximum)",
    "Dissolved oxygen (Mean)",
    "Dissolved oxygen (Minimum)",
    "Temperature (Mean)",
    "Temperature (Minimum)",
    "Temperature (Maximum)",
] + ["Target"]

In [13]:
# Wrap the merged array in a DataFrame, labelling the columns with the short
# names, then preview the first rows.
df = pd.DataFrame(
    data=data_merged,
    columns=features_short,
)
df.head()

Unnamed: 0,Specific conductance (Maximum),"pH, standard units (Maximum)","pH, standard units (Minimum)",Specific conductance (Minimum),Specific conductance (Mean),Dissolved oxygen (Maximum),Dissolved oxygen (Mean),Dissolved oxygen (Minimum),Temperature (Mean),Temperature (Minimum),Temperature (Maximum),Target
0,0.001131,0.884615,0.00112,0.001113,0.677632,0.841463,0.765152,0.787402,0.29375,0.298077,0.276163,0.648148
1,0.00117,0.871795,0.001159,0.001152,0.703947,0.829268,0.772727,0.795276,0.29375,0.301282,0.276163,0.648148
2,0.001326,0.884615,0.001198,0.00125,0.677632,0.853659,0.75,0.755906,0.3,0.298077,0.287791,0.648148
3,0.014094,0.858974,0.001238,0.003926,0.697368,0.829268,0.772727,0.771654,0.296875,0.294872,0.27907,0.638889
4,0.088109,0.858974,0.010766,0.029297,0.684211,0.853659,0.765152,0.755906,0.296875,0.291667,0.281977,0.648148


In [14]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,Specific conductance (Maximum),"pH, standard units (Maximum)","pH, standard units (Minimum)",Specific conductance (Minimum),Specific conductance (Mean),Dissolved oxygen (Maximum),Dissolved oxygen (Mean),Dissolved oxygen (Minimum),Temperature (Mean),Temperature (Minimum),Temperature (Maximum),Target
count,26085.0,26085.0,26085.0,26085.0,26085.0,26085.0,26085.0,26085.0,26085.0,26085.0,26085.0,26085.0
mean,0.071648,0.887349,0.030986,0.047079,0.550942,0.857237,0.585189,0.557359,0.588324,0.569588,0.581686,0.663508
std,0.168408,0.035386,0.122239,0.136686,0.119689,0.031167,0.146308,0.172836,0.200402,0.208671,0.18654,0.029385
min,0.000526,0.410256,0.000255,0.000508,0.118421,0.719512,0.068182,0.031496,0.059375,0.022436,0.090116,0.574074
25%,0.00191,0.871795,0.001591,0.001777,0.480263,0.841463,0.507576,0.472441,0.421875,0.400641,0.427326,0.648148
50%,0.002632,0.884615,0.00222,0.002422,0.546053,0.853659,0.583333,0.566929,0.6,0.580128,0.59593,0.657407
75%,0.005497,0.910256,0.003713,0.004902,0.631579,0.878049,0.689394,0.677165,0.753125,0.740385,0.738372,0.675926
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
# (rows, columns) of the assembled frame.
df.shape

(26085, 12)

In [16]:
# Optional (disabled): uncomment to keep only the first 5000 rows before saving.
#df = df.iloc[:5000]
#df.shape

# Save to CSV

In [17]:
# Write the frame next to the source data; index=False leaves the row index
# out of the file so only the 12 named columns are written.
df.to_csv('../data/water_dataset_reshaped.csv', index=False)