In [5]:
import json
import pandas as pd
import numpy as np
from pathlib import Path

# Root of the mock database. The generation loop fills it with nested dicts keyed,
# in order, by database, analysis type, column, aggregation, reference, grouping
# and target.
data_dict = {}

# Seed NumPy's global RNG so that re-running this cell regenerates the same mock
# data. Every draw below goes through the legacy `np.random.*` functions, which
# all share this global state.
SEED = 42
np.random.seed(SEED)

# Configuration for mock data: one entry of the database is generated for every
# combination of the values listed here.
databases = ['db1', 'db2']
analysis_types = ['analysis1', 'analysis2']
columns_to_analyse = ['column1', 'column2']
agg_functions = ['sum', 'mean']
references = ['ref1', 'ref2']
# NOTE: the spelling "grouppings" is kept on purpose; the generation loop refers
# to this exact name.
grouppings = ['all', 'group1', 'group2']
targets = ['RP_total', 'MP']

# Parameters for log-normal distribution
mu_y = 2       # Mean of the underlying normal distribution for 'y'
sigma_y = 0.5  # Standard deviation of the underlying normal distribution for 'y'

mu_var_y = 1.5     # Mean of the underlying normal distribution for 'var_y'
sigma_var_y = 0.4  # Standard deviation of the underlying normal distribution for 'var_y'

num_points = 100  # Number of data points
num_categories = 3  # Number of categories per group1 and group2

# Function to slightly vary 'y' values
def vary_y(y_values, variation_factor=0.1, rng=None):
    """
    Slightly varies the y_values by scaling each one with small random noise.

    Every value is multiplied by ``1 + e``, where ``e`` is drawn independently
    from a normal distribution with mean 0 and standard deviation
    ``variation_factor``. The perturbation is therefore relative to each
    value's magnitude, not an additive offset.

    Parameters:
        y_values (array-like): Original y values (NumPy array, pandas Series,
            or a plain sequence of numbers).
        variation_factor (float): Standard deviation of the relative noise;
            0.1 means the values vary by roughly 10%.
        rng (optional): Object exposing ``normal(loc, scale, size)``, such as a
            ``numpy.random.Generator``. When omitted, NumPy's global random
            state is used, as before.

    Returns:
        Varied y values, of the same shape as ``y_values``. A pandas Series
        input yields a pandas Series; other inputs yield a NumPy array.
    """
    # Fall back to the module-level legacy API so existing callers (and a global
    # np.random.seed) keep behaving exactly as they did.
    sampler = np.random if rng is None else rng
    # np.shape also accepts objects without a `.shape` attribute, such as lists.
    noise = sampler.normal(loc=0.0, scale=variation_factor, size=np.shape(y_values))
    return y_values * (1 + noise)

# Number of bins for every density histogram. Bound once, before the loops, so
# that no grouping depends on the 'all' grouping having been processed first.
num_bins = 20


def _draw_base_frame():
    """Draw a fresh frame of num_points rows: x = 1..num_points, log-normal 'y' and 'var_y', group 'all'."""
    # 'y' is sampled before 'var_y'; keeping this order keeps a seeded run stable.
    y_values = np.random.lognormal(mean=mu_y, sigma=sigma_y, size=num_points)
    var_y_values = np.random.lognormal(mean=mu_var_y, sigma=sigma_var_y, size=num_points)
    return pd.DataFrame({
        'x': np.arange(1, num_points + 1),
        'y': y_values,
        'var_y': var_y_values,
        'group': ['all'] * num_points
    })


def _histogram_frame(y_values):
    """Return the density histogram of y_values as a frame of bin centers ('x') and densities ('y')."""
    counts, bin_edges = np.histogram(y_values, bins=num_bins, density=True)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    return pd.DataFrame({'x': bin_centers, 'y': counts})


def _build_frames(group):
    """
    Build the (df, dfh) pair stored for one grouping.

    A new base frame is sampled on every call, so nothing is shared between
    groupings or targets. For 'all' the base frame is returned as is. For any
    other grouping the result stacks num_categories copies of the base frame,
    each with 'y' perturbed by vary_y and 'group' set to 'category<N>'.
    The histogram is always computed from the unperturbed base 'y' values.
    """
    base = _draw_base_frame()
    dfh = _histogram_frame(base['y'].to_numpy())
    if group == 'all':
        return base, dfh

    category_dfs = [
        # Vary 'y' by 10% and relabel the rows; assign() returns a new frame,
        # so the base frame stays untouched for the next category.
        base.assign(
            y=vary_y(base['y'], variation_factor=0.1),
            group=f'category{cat_num}'
        )
        for cat_num in range(1, num_categories + 1)
    ]
    return pd.concat(category_dfs, ignore_index=True), dfh


# Fill data_dict level by level. Each level is reset to a fresh dict so that
# re-running the cell replaces earlier content instead of merging into it.
for db in databases:
    db_node = data_dict[db] = {}
    for analysis in analysis_types:
        analysis_node = db_node[analysis] = {}
        for column in columns_to_analyse:
            column_node = analysis_node[column] = {}
            for agg in agg_functions:
                agg_node = column_node[agg] = {}
                for ref in references:
                    ref_node = agg_node[ref] = {}
                    for group in grouppings:
                        group_node = ref_node[group] = {}
                        for target in targets:
                            df, dfh = _build_frames(group)
                            # Frames are stored as JSON strings (orient='split')
                            # so the whole structure can be dumped with json.
                            group_node[target] = {
                                'df': df.to_json(orient='split'),
                                'dfh': dfh.to_json(orient='split')
                            }

# Save the mock database to a JSON file
output_path = Path('../data/mock_database.json')
output_path.parent.mkdir(parents=True, exist_ok=True)  # Create directories if they don't exist
# Explicit encoding so the file does not depend on the platform default.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(data_dict, f, indent=4)

print(f"Mock database generated and saved to {output_path}")


Mock database generated and saved to ../data/mock_database.json


In [8]:
# Display the current `df` frame.
# NOTE(review): this cell's execution count is 8 while the generation cell above
# is 5, and the saved output contains an 'Unnamed: 0' column that the generation
# cell never creates. `df` was presumably rebound by cells that are not shown
# here (e.g. a CSV read) -- TODO confirm the notebook survives
# Restart Kernel -> Run All and shows the intended frame.
df

Unnamed: 0,x,y,var_y,group
0,1,7.097292,3.441363,category1
1,2,3.988063,3.176856,category1
2,3,5.892178,2.525887,category1
3,4,11.389559,2.885746,category1
4,5,10.502352,1.633054,category1
...,...,...,...,...
295,96,12.869369,3.720633,category3
296,97,4.339172,6.316192,category3
297,98,9.641009,3.842508,category3
298,99,4.124013,2.705399,category3
