In [None]:
import numpy as np

from pygrinder import (
    mcar,
    calc_missing_rate
)

👉 To simulate real-world data beans with missingness, the ecosystem library PyGrinder was created: a toolkit that helps grind your coffee beans into incomplete ones. Missing patterns fall into three categories according to Rubin's theory [44]: MCAR (missing completely at random), MAR (missing at random), and MNAR (missing not at random). PyGrinder supports all of them, along with additional functionality related to missingness. With PyGrinder, you can introduce synthetic missing values into your datasets with a single line of code.

In [4]:
# Seed NumPy's global RNG so the synthetic dataset below is identical on every Restart & Run All.
# NOTE(review): this also pins the missing mask only if mcar draws from NumPy's global RNG — confirm
np.random.seed(0)

# given a time-series dataset with 128 samples, each sample with 10 time steps and 36 data features
ts_dataset = np.random.randn(128, 10, 36)

# grind the dataset with the MCAR pattern at 10% missing probability;
# no fill value is passed, and removed entries show up as NaN in the returned array (not as 0)
X_with_mcar_data = mcar(ts_dataset, p=0.1)

In [8]:
# given a 2-D dataset with 128 samples and 36 data features (no time-step axis in this cell;
# the name ts_dataset is kept even though the array is no longer 3-D)
ts_dataset = np.random.randn(128, 36)

# grind the dataset with MCAR pattern and 10% missing probability
# NOTE(review): no fill value is passed, so missing entries presumably come back as NaN rather than 0 — confirm
X_with_mcar_data = mcar(ts_dataset, p=0.1)

In [9]:
# calculate the missing rate of the dataset currently bound to X_with_mcar_data
# (expected to land close to the p=0.1 requested from mcar; presumably a fraction in [0, 1] — confirm)
missing_rate = calc_missing_rate(X_with_mcar_data)
# bare last expression so the notebook displays the value
missing_rate

np.float64(0.09418402777777778)

# Test my data

In [11]:
import os
import sys

# root path: the parent of the current working directory
ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append so that re-running this cell does not pile up duplicate sys.path entries.
# (ROOT is presumably needed on sys.path for the `src` import below to resolve.)
if ROOT not in sys.path:
    sys.path.append(ROOT)

import polars as pl
import matplotlib.pyplot as plt
from src.config import CONFIG

# Dataset locations are read from the project config object
RAW_DATA_PATH = CONFIG.RAW_DATA_PATH
PRO_DATA_PATH_v1 = CONFIG.PRO_DATA_PATH_v1

# Load the parquet file at PRO_DATA_PATH_v1 into a Polars DataFrame
df = pl.read_parquet(PRO_DATA_PATH_v1)

In [13]:
# Drop the "Date Time" column, convert what is left to a NumPy array,
# then mask entries with the MCAR pattern at p=0.1
feature_matrix = df.drop("Date Time").to_numpy()
X_with_mcar_data = mcar(feature_matrix, p=0.1)

In [14]:
# display the masked array for a quick visual check; NaN entries are presumably the values
# removed by mcar (or NaNs that were already present in the source data — confirm)
X_with_mcar_data

array([[ 9.9652e+02, -8.0200e+00,  2.6540e+02, ...,  1.0300e+00,
         1.7500e+00,  1.5230e+02],
       [ 9.9657e+02, -8.4100e+00,  2.6501e+02, ...,  7.2000e-01,
         1.5000e+00,         nan],
       [        nan, -8.5100e+00,  2.6491e+02, ...,  1.9000e-01,
         6.3000e-01,  1.7160e+02],
       ...,
       [ 9.9982e+02, -3.1600e+00,  2.7001e+02, ...,  1.0800e+00,
         2.0000e+00,  2.1520e+02],
       [        nan,         nan,  2.6894e+02, ...,  1.4900e+00,
         2.1600e+00,         nan],
       [ 9.9982e+02, -4.8200e+00,  2.6836e+02, ...,  1.2300e+00,
         1.9600e+00,  1.8490e+02]], shape=(420551, 14), dtype=float32)