<span style="color:#333333; font-size:24px; font-weight:bold"> Compiled by <a href=https://github.com/cyterat style="color:#00b2b7;">cyterat</a></span>

# Practical considerations:

- __Data Preparation__: Ensure your data is clean and properly formatted before sampling.

- __Sample Size__: Determine an appropriate sample size based on statistical power calculations or practical constraints.

- __Randomization__: Seed the random number generator (e.g., `random_state` in pandas or `np.random.default_rng(seed)` in NumPy) so that samples are consistent and reproducible.

- __Documentation__: Always document your sampling method and parameters for reproducibility.

- __Validation__: Check if your sample is representative of the population using descriptive statistics.

In [None]:
import numpy as np
import pandas as pd

# 1. Simple Random Sampling

- Randomly sample users for general behavioral analysis.
- Select events for performance debugging or anomaly checks.
- Create a quick baseline dataset without subgroup constraints.

In [None]:
def simple_random_sample(data, sample_size, random_state=42):
    """
    Draws `sample_size` rows at random via `data.sample` and renumbers them.

    `random_state` seeds the draw, so repeated calls with the same inputs
    return the same rows. The original row labels are discarded.
    """
    return (
        data
        .sample(n=sample_size, random_state=random_state)
        .reset_index(drop=True)
    )

# 2. Stratified Sampling

- Ensure proportional representation of user tiers (e.g., free vs. paid).
- Maintain proportional device/platform split (e.g., iOS vs. Android).
- Sample feedback forms while preserving demographic diversity.

In [None]:
def stratified_sample(data, strata_col, frac=0.1, random_state=42):
    """
    Performs stratified sampling. Each group defined by `strata_col` will be sampled with the same fraction.

    Parameters:
        data: DataFrame to sample from.
        strata_col: Column (or groupby key) whose distinct values define the strata.
        frac: Fraction of rows drawn from every stratum.
        random_state: Seed passed to each per-stratum `sample` call.

    Returns:
        DataFrame with the sampled rows of every stratum, concatenated in
        groupby (sorted key) order, with all original columns (including
        `strata_col`) and a fresh 0..n-1 index. An input without any strata
        yields an empty frame with the same columns.

    Note:
        The same seed is reused for every stratum, so strata of equal size
        select the same row positions within their group.
    """
    # Iterate over the groups instead of using `groupby.apply`: newer pandas
    # versions exclude the grouping column from `apply`, which would silently
    # drop `strata_col` from the result.
    pieces = [
        stratum.sample(frac=frac, random_state=random_state)
        for _, stratum in data.groupby(strata_col)
    ]

    if not pieces:
        # `pd.concat` rejects an empty list; keep the schema instead.
        return data.iloc[0:0].reset_index(drop=True)

    return pd.concat(pieces).reset_index(drop=True)

# 3. Cluster Sampling

- Sample complete user journeys using user_id or session_id.
- Analyze full A/B test groups or experiment variants.
- Select entire support cases or transaction threads for audit.

In [None]:
def cluster_sample(data, cluster_col, n_clusters, random_state=42):
    """
    Picks `n_clusters` distinct cluster labels at random and returns every row
    belonging to one of them.

    Rows keep their original relative order; the index is renumbered from 0.
    """
    generator = np.random.default_rng(random_state)

    # Distinct labels present in the data form the pool to draw from.
    available_labels = data[cluster_col].unique()
    chosen_labels = generator.choice(available_labels, n_clusters, replace=False)

    # Keep a row only when its cluster label was drawn.
    in_chosen_cluster = data[cluster_col].isin(chosen_labels)
    return data.loc[in_chosen_cluster].reset_index(drop=True)

# 4. Systematic Sampling

- Sample every nth event from an event stream for trend monitoring.
- Periodically review logs or metrics from a large dataset.
- Analyze recurring patterns in page view sequences.

In [None]:
def systematic_sample(data, step, start=0):
    """
    Takes the row at position `start` and then every `step`-th row after it.

    Selection is positional (`iloc`); the result is renumbered from 0.
    """
    every_step = slice(start, None, step)
    return data.iloc[every_step].reset_index(drop=True)

# 5. Convenience Sampling

- Quickly inspect the latest telemetry events or logs.
- Pull the first 100 rows for schema or transformation testing.
- Prototype analysis without waiting for large data loads.

In [None]:
def convenience_sample(data, sample_size=100):
    """
    Returns the leading `sample_size` rows of the dataset, renumbered from 0.

    No randomness is involved: the rows are taken in their stored order.
    """
    leading_rows = data.iloc[:sample_size]
    return leading_rows.reset_index(drop=True)

# 6. Quota Sampling

- Ensure exact counts from each user segment (e.g., 50 paid, 50 free).
- Construct samples with fixed proportions of product categories.
- Balance country-specific quotas for market surveys.

In [None]:
def quota_sample(data, group_col, quotas, random_state=42):
    """
    Samples a fixed number of rows from each group as specified in `quotas`.

    Parameters:
        data: DataFrame to sample from.
        group_col: Column whose values identify the groups.
        quotas: Mapping of group value -> number of rows wanted from that group.
        random_state: Seed passed to each per-group `sample` call.

    Returns:
        DataFrame with the sampled rows, groups in the iteration order of
        `quotas`, indexed 0..n-1. A group with fewer rows than its quota
        contributes all of its rows; a group that is absent from the data
        contributes none. If nothing is sampled at all, an empty frame with the
        columns of `data` is returned.
    """
    pieces = []

    for group, quota in quotas.items():
        group_data = data[data[group_col] == group]

        # Cap the request in case the group has fewer rows than the quota.
        n_rows = min(len(group_data), quota)
        if n_rows:
            pieces.append(group_data.sample(n=n_rows, random_state=random_state))

    if not pieces:
        # `pd.concat` rejects an empty list; keep the schema instead.
        return data.iloc[0:0].reset_index(drop=True)

    # Concatenate once at the end rather than re-copying the accumulated
    # result on every loop iteration.
    return pd.concat(pieces, ignore_index=True)

# 7. Weighted Sampling

- Oversample high-value customers or heavy users.
- Emphasize rare but important actions (e.g., cancellations).
- Prioritize events with higher business impact for model training.

In [None]:
def weighted_sample(data, weights_col, n, replace=True, random_state=42):
    """
    Draws `n` rows where each row's selection weight comes from `weights_col`.

    Sampling is with replacement unless `replace=False` is passed. The result
    is renumbered from 0.
    """
    sampling_options = {
        "n": n,
        "weights": data[weights_col],
        "replace": replace,
        "random_state": random_state,
    }
    drawn_rows = data.sample(**sampling_options)
    return drawn_rows.reset_index(drop=True)

# 8. Time-based Sampling

- Extract daily or hourly snapshots from event logs.
- Sample data aligned with marketing campaign periods.
- Analyze rolling user behavior across time windows.

In [None]:
def time_based_sample(data, freq):
    """
    Samples the first row of each time bin of width `freq`.

    Parameters:
        data: pandas object indexed by a `DatetimeIndex`.
        freq: Resampling frequency understood by `resample` (e.g. "D", "h").

    Returns:
        The first row (in time order) of every non-empty bin, in chronological
        bin order. Rows are returned exactly as stored: missing values and
        column dtypes are preserved. The index is renumbered from 0, so the
        timestamps themselves are not part of the result.

    Raises:
        ValueError: if `data` is not indexed by a `DatetimeIndex`.
    """
    if not isinstance(data.index, pd.DatetimeIndex):
        raise ValueError("Data must have a DatetimeIndex for time-based sampling.")

    # Resample the row positions rather than the data itself. `first()` takes
    # the first non-null value per column, so applied to the data it could
    # stitch a "row" together from several records, and empty bins would
    # upcast integer columns to float.
    positions = pd.Series(range(len(data)), index=data.index)
    first_positions = positions.resample(freq).first()

    # Empty bins have no first position (NaN); drop them and restore ints.
    first_positions = first_positions.dropna().astype("int64")

    return data.iloc[first_positions.to_numpy()].reset_index(drop=True)

# 9. Reservoir Sampling

- Sample from streaming data like live logs or events.
- Maintain a representative subset from a large or unknown-size dataset.
- Create memory-efficient random samples from data pipelines.

In [None]:
def reservoir_sample(iterator, k, random_state=42):
    """
    Keeps at most `k` items from a single pass over `iterator`.

    The first `k` items fill the reservoir. Each later item draws a random
    slot among all positions seen so far and overwrites a kept item only when
    that slot falls inside the reservoir. The input is never materialised, so
    this works for streams of unknown length.
    """
    generator = np.random.default_rng(random_state)
    kept = []

    for seen, element in enumerate(iterator):
        if seen >= k:
            # Reservoir is full: maybe replace one of the kept items.
            slot = generator.integers(0, seen + 1)
            if slot < k:
                kept[slot] = element
            continue

        # Still filling the reservoir.
        kept.append(element)

    return kept

# Representativeness Checks

## 1. Descriptive Summary Comparison
Compare the key statistics between the full dataset and the sample.

In [None]:
def compare_descriptive_stats(full, sample, numeric_cols=None):
    """
    Prints mean and standard deviation of each column for the full dataset
    and for the sample, side by side.

    When `numeric_cols` is not given, every numeric column of `full` is used.
    Nothing is returned; the report goes to standard output.
    """
    columns = numeric_cols
    if columns is None:
        columns = full.select_dtypes(include='number').columns.tolist()

    print("Descriptive Statistics Comparison:\n")
    for name in columns:
        print(f"--- {name} ---")

        population = full[name]
        print(f"Full     -> Mean: {population.mean():.3f}, Std: {population.std():.3f}")

        subset = sample[name]
        print(f"Sample   -> Mean: {subset.mean():.3f}, Std: {subset.std():.3f}\n")

## 2. Group Proportion Comparison
Use this to verify how well your sample maintains proportions of categorical groups (useful for stratified/quota sampling).

In [None]:
def compare_group_proportions(full, sample, group_col):
    """
    Compares the relative frequency of each group in a column between full dataset and sample.

    Parameters:
        full: DataFrame holding the full dataset.
        sample: DataFrame holding the sample drawn from it.
        group_col: Categorical column whose group shares are compared.

    Prints a table with one row per group: its share in the full dataset, its
    share in the sample, and the absolute difference (rounded to 3 decimals).
    A group that is missing on one side is reported with a proportion of 0 on
    that side. Nothing is returned.
    """
    full_props = full[group_col].value_counts(normalize=True)
    sample_props = sample[group_col].value_counts(normalize=True)

    # Align both series on the union of group labels. A label missing on one
    # side would otherwise become NaN and hide the largest discrepancies, so
    # treat "not present" as a share of 0.
    comparison = pd.concat(
        [full_props, sample_props],
        axis=1,
        keys=['Full Proportion', 'Sample Proportion'],
    ).fillna(0.0)
    comparison['Difference'] = (comparison['Sample Proportion'] - comparison['Full Proportion']).abs()

    print(f"Group Proportion Comparison for '{group_col}':\n")
    print(comparison.round(3))

## 3. Distribution Plot Comparison
Use histograms or KDE plots to compare numerical distributions visually.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def compare_distributions(full, sample, numeric_col, bins=30):
    """
    Overlays density histograms (with KDE curves) of `numeric_col` for the
    full dataset and the sample on a single figure, then shows it.

    `bins` sets the number of histogram bins used for both layers.
    """
    plt.figure(figsize=(10, 5))

    # One semi-transparent layer per dataset, drawn in this order.
    layers = (
        (full, 'blue', 'Full'),
        (sample, 'orange', 'Sample'),
    )
    for frame, colour, legend_name in layers:
        sns.histplot(
            frame[numeric_col],
            color=colour,
            label=legend_name,
            bins=bins,
            kde=True,
            stat='density',
            alpha=0.5,
        )

    plt.title(f'Distribution Comparison: {numeric_col}')
    plt.xlabel(numeric_col)
    plt.ylabel('Density')
    plt.legend()
    plt.tight_layout()
    plt.show()

## 4. Sample Integrity Checker
Basic utility to confirm sample shape and NaN counts.

In [None]:
def check_sample_integrity(sample):
    """
    Reports the sample's dimensions and the number of missing values per column.

    Output is printed; nothing is returned.
    """
    dimensions = sample.shape
    print("Sample Shape:", dimensions)

    missing_per_column = sample.isna().sum()
    print("Missing Values:\n", missing_per_column)

## Usage Example

In [None]:
# Run all four representativeness checks on one full-dataset / sample pair.
# NOTE(review): `data` and `sample` are not defined in the cells shown here;
# they must already exist (e.g. a loaded DataFrame and the output of one of
# the sampling functions) before this cell is executed.
# The column names 'age', 'income' and 'region' are placeholders for columns
# of your own dataset.
compare_descriptive_stats(data, sample, numeric_cols=['age', 'income'])
compare_group_proportions(data, sample, group_col='region')
compare_distributions(data, sample, numeric_col='income')
check_sample_integrity(sample)