In [9]:
import numpy as np 
import pandas as pd
# Read the raw data matrix from the repository's raw-data folder
# (path is relative to the notebook's working directory).
dataset = np.load(file='../..//data/raw/cstr_rawdata.npy')

In [10]:
# Dimensions of the loaded array as a (rows, columns) tuple.
np.shape(dataset)

(2860, 1404)

In [11]:
# Element type of the loaded array.
dataset.dtype

dtype('float64')

In [13]:
# Wrap only the first five rows in a DataFrame so the preview stays cheap.
df_head = pd.DataFrame(data=dataset[:5])
df_head.info()      # prints the index/column/dtype/memory summary
df_head.head(n=5)   # last expression -> rendered as the cell's table output

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Columns: 1404 entries, 0 to 1403
dtypes: float64(1404)
memory usage: 55.0 KB


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1394,1395,1396,1397,1398,1399,1400,1401,1402,1403
0,0.999719,0.999854,1.000517,0.999764,0.999982,1.000978,1.000314,0.999953,0.999051,1.000165,...,10.097724,10.042699,9.963228,10.084302,10.54874,10.032248,0.0,0.0,0.0,1.0
1,1.002411,1.00082,1.001,0.999042,0.9992,1.001231,1.00147,1.001782,0.999543,0.999201,...,9.731097,10.080027,9.896257,9.088898,9.798987,10.418807,0.0,0.0,0.0,1.0
2,0.999197,0.998922,1.000095,0.999389,0.999128,0.999978,0.999889,1.001926,1.000072,0.999326,...,10.041097,9.745254,10.877219,9.59261,10.046595,10.262627,0.0,0.0,0.0,1.0
3,1.00077,1.000499,0.99983,1.000854,1.001071,1.001708,0.999969,0.998799,1.000446,0.999382,...,10.57038,9.610376,10.546058,10.679968,10.282365,10.035451,0.0,0.0,0.0,1.0
4,1.001674,1.000058,0.999202,0.998686,0.999539,0.999194,1.000795,1.000007,0.999421,1.000157,...,9.787158,9.954904,10.416254,9.866833,10.053383,10.025643,0.0,0.0,0.0,1.0


In [15]:
# Split the columns at index 1400: everything before is treated as features,
# the trailing columns as metadata (noted in this notebook as: label, domain, eps, N).
X, meta = dataset[:, :1400], dataset[:, 1400:]
print(f"{X.shape} {meta.shape}")

(2860, 1400) (2860, 4)


In [17]:
import os

# Persist the first 100 rows as a small sample file for quick experiments.
sample = dataset[:100, :]

# np.save does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs("../..//data/processed", exist_ok=True)
np.save("../..//data/processed/cstr_sample.npy", sample)


In [18]:
import matplotlib.pyplot as plt

# Trailing columns (from index 1400 on) carry metadata; the rest are features.
meta = dataset[:, 1400:]
X = dataset[:, :1400]

# Metadata column 0 is used as the class label (0–12) and column 1 as the
# domain id (0 = source, 1–6 = targets); both are stored as floats, so cast.
y, domain = (meta[:, col].astype(int) for col in (0, 1))
print(dataset.shape, X.shape, y.shape)


(2860, 1404) (2860, 1400) (2860,)


In [24]:
# Everything except the last four columns is the feature matrix.
X = dataset[:, :-4]

# Transposing the last four columns gives a (4, n_rows) view, so unpacking it
# yields one 1-D array per metadata column, in column order.
class_label, domain_label, parameter_noise, reaction_order = dataset[:, -4:].T

In [25]:
# Row indices whose domain label is 0 (treated as the source domain here).
ind_s = np.flatnonzero(domain_label == 0)

# Features and class labels restricted to those rows.
Xs, ys = X[ind_s], class_label[ind_s]

In [26]:
# Class balance of the source-domain samples: `ys` only contains the labels of
# rows selected with domain_label == 0, so the title must say so.
# Labels are stored as floats; cast to int so the bar ticks read 0, 1, 2, ...
# instead of 0.0, 1.0, 2.0, ...
counts = pd.Series(ys).astype(int).value_counts().sort_index()

fig, ax = plt.subplots(figsize=(6, 4))
counts.plot(kind="bar", ax=ax)
ax.set_xlabel("Class label (0 = normal, 1–12 = faults)")
ax.set_ylabel("Count")
ax.set_title("Class counts in the source domain")
fig.tight_layout()
fig.savefig("../..//figures/class_balance.png")
plt.close(fig)  # figure is only written to disk, not shown inline


In [27]:
# Histogram of the first feature column over the source-domain rows.
feature0 = Xs[:, 0]

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(feature0, bins=40, edgecolor='k')
ax.set_xlabel("Feature 0 (t=0, var=1)")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of a key process variable")
fig.tight_layout()
fig.savefig("../..//figures/feature0_distribution.png")
plt.close(fig)  # figure is only written to disk, not shown inline


In [29]:
# Scatter of the first two feature columns of the source-domain rows,
# coloured by class label.
f0 = Xs[:, 0]   # noted in this notebook as var1 at t0
f1 = Xs[:, 1]   # noted in this notebook as var2 at t0

fig, ax = plt.subplots(figsize=(5, 5))
sc = ax.scatter(f0, f1, c=ys, cmap="tab20", s=8)
ax.set_xlabel("Feature 0 (t=0)")
ax.set_ylabel("Feature 1 (t=0)")
ax.set_title("Relationship between two variables (colored by class)")
# Without a key the colour encoding is unreadable; the scatter handle was
# previously assigned but never used, so attach a colorbar to it.
fig.colorbar(sc, ax=ax, label="Class label")
fig.tight_layout()
fig.savefig("../..//figures/feature_scatter_by_class.png")
plt.close(fig)  # figure is only written to disk, not shown inline


In [31]:
# Per-column NaN count, restricted to the first 50 feature columns of the
# source-domain rows (the remaining columns are not inspected here).
first_50 = pd.DataFrame(Xs[:, :50])
nan_counts = first_50.isna().sum(axis=0)

fig, ax = plt.subplots(figsize=(7, 3))
nan_counts.plot(kind="bar", ax=ax)
ax.set_xlabel("Column index (first 50)")
ax.set_ylabel("NaN count")
ax.set_title("Missing values check (none expected)")
fig.tight_layout()
fig.savefig("../..//figures/missing_values_check.png")
plt.close(fig)  # figure is only written to disk, not shown inline
