In [32]:
import pandas as pd

def get_vpvs_samples(
    vpvs_path,
    sample: int = 100,
    random_state=None,
    std_factor: float = None
    ):
    """
    Sample rows from vpvs file, optionally filtering within mean Â± std_factor * std of v_ij.
    """

    data = pd.read_csv(vpvs_path)

    # Filter within standard deviation range if requested
    if std_factor is not None:
        mean_v = data["v_ij"].mean()
        std_v = data["v_ij"].std()
        lower = mean_v - std_factor * std_v
        upper = mean_v + std_factor * std_v
        data = data[(data["v_ij"] >= lower) & (data["v_ij"] <= upper)]

    # Adjust sample size if fewer rows remain
    if len(data) < sample:
        sample = len(data)

    return data.sample(n=sample, random_state=random_state)

path = "/groups/igonin/ecastillo/CMEZ-SPHighResCatalog/data/vpvs/WB03_20.csv"
iterations = 100
df = pd.read_csv(path)
vpvs_df = get_vpvs_samples(path,sample=iterations)
vpvs_df.describe()

Unnamed: 0,v_ij
count,100.0
mean,2.040841
std,2.272159
min,0.431534
25%,1.440607
50%,1.609675
75%,1.765857
max,18.219118


In [8]:
import pandas as pd

def get_vpvs_samples(
    vpvs_path,
    sample: int = 100,
    random_state=None,
    iqr_factor: float = None
):
    """
    Sample rows from vpvs file, optionally filtering within IQR-based range of v_ij.

    If iqr_factor is provided, values of v_ij outside:
        [Q1 - iqr_factor * IQR,  Q3 + iqr_factor * IQR]
    will be removed.
    """

    data = pd.read_csv(vpvs_path)

    # Filter within IQR range if requested
    if iqr_factor is not None:
        q1 = data["v_ij"].quantile(0.25)
        q3 = data["v_ij"].quantile(0.75)
        iqr = q3 - q1

        lower = q1 - iqr_factor * iqr
        upper = q3 + iqr_factor * iqr

        data = data[(data["v_ij"] >= lower) & (data["v_ij"] <= upper)]

    # Adjust sample size if fewer rows remain
    if len(data) < sample:
        sample = len(data)

    return data.sample(n=sample, random_state=random_state)

path = "/groups/igonin/ecastillo/CMEZ-SPHighResCatalog/data/vpvs/WB03_20.csv"
iterations = 100
df = pd.read_csv(path)
vpvs_df = get_vpvs_samples(path,sample=iterations)
vpvs_df.describe()

Unnamed: 0,v_ij
count,100.0
mean,1.932724
std,2.905277
min,0.292587
25%,1.453072
50%,1.621714
75%,1.771035
max,30.108197


In [33]:
path = "/groups/igonin/ecastillo/CMEZ-SPHighResCatalog/data/vpvs/WB03_20.csv"
df = pd.read_csv(path)
df.describe()

Unnamed: 0,v_ij
count,614331.0
mean,2.529879
std,79.34314
min,1.2e-05
25%,1.470215
50%,1.616239
75%,1.760172
max,38283.296296
