In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

# Container for the mock database; populated by the nested generation loop below
data_dict = {}

# Configuration for mock data: one entry is generated for every combination
# of the keys listed here (database -> analysis -> column -> agg -> ref -> group -> target)
databases = ['db1', 'db2']
analysis_types = ['analysis1', 'analysis2']
columns_to_analyse = ['column1', 'column2']
agg_functions = ['sum', 'mean']
references = ['ref1', 'ref2']
grouppings = ['all', 'group1', 'group2']  # spelling kept: the generation loop references this name
targets = ['RP_total', 'MP']

# Parameters for log-normal distribution
mu_y = 2       # Mean of the underlying normal distribution for 'y'
sigma_y = 0.5  # Standard deviation of the underlying normal distribution for 'y'

mu_var_y = 1.5     # Mean of the underlying normal distribution for 'var_y'
sigma_var_y = 0.4  # Standard deviation of the underlying normal distribution for 'var_y'

num_points = 100  # Number of data points
num_categories = 3  # Number of categories per group1 and group2

# Seed NumPy's global RNG so that re-running the notebook from a fresh kernel
# regenerates exactly the same mock database (all draws below use np.random.*)
SEED = 42
np.random.seed(SEED)

# Helper that applies a small multiplicative jitter to 'y' values
def vary_y(y_values, variation_factor=0.1):
    """
    Return y_values perturbed by relative Gaussian noise.

    Each element is multiplied by (1 + e), where e is drawn from a normal
    distribution with mean 0 and standard deviation `variation_factor`.
    One noise value is drawn per element (the noise has the shape of
    `y_values`), using NumPy's global random state.
    """
    relative_noise = np.random.normal(0.0, variation_factor, y_values.shape)
    factor = 1 + relative_noise
    return y_values * factor

def _build_mock_frames(group):
    """
    Generate the three mock DataFrames stored under one (..., group, target) key.

    Parameters
    ----------
    group : str
        Grouping name. 'all' yields a single series labelled 'all'; any other
        value yields `num_categories` perturbed copies of the base series,
        labelled 'category1', 'category2', ...

    Returns
    -------
    tuple of pandas.DataFrame
        (df, dfh, dfhg) where
        df   has columns x, y, var_y, group;
        dfh  has columns x, y with one random "histogram-like" value per x;
        dfhg has columns x, group, y with one random value per x and per group label.

    Notes
    -----
    Reads the module-level settings (mu_y, sigma_y, mu_var_y, sigma_var_y,
    num_points, num_categories) and draws from NumPy's global random state.
    """
    # Base series: log-normal 'y' and 'var_y' over x = 1..num_points
    x_values = np.arange(1, num_points + 1)
    y_base = np.random.lognormal(mean=mu_y, sigma=sigma_y, size=num_points)
    var_y_base = np.random.lognormal(mean=mu_var_y, sigma=sigma_var_y, size=num_points)
    base = pd.DataFrame({
        'x': x_values,
        'y': y_base,
        'var_y': var_y_base,
        'group': ['all'] * num_points
    })

    if group == 'all':
        labels = ['all']
        df = base
    else:
        labels = [f'category{cat_num}' for cat_num in range(1, num_categories + 1)]
        # Each category is an independent perturbation of the same base 'y'
        # (not cumulative), relabelled with its category name
        df = pd.concat(
            [
                base.assign(y=vary_y(base['y'], variation_factor=0.1), group=label)
                for label in labels
            ],
            ignore_index=True
        )

    # "Histogram-like" values: uniform draws scaled by the mean of the base 'y'
    hist_scale = np.mean(y_base)
    dfh = pd.DataFrame({
        'x': x_values,
        'y': np.random.rand(len(x_values)) * hist_scale
    })

    # Same idea, but one independent random series per group label present in df
    dfhg = pd.concat(
        [
            pd.DataFrame({
                'x': x_values,
                'group': label,
                'y': np.random.rand(len(x_values)) * hist_scale
            })
            for label in labels
        ],
        ignore_index=True
    )
    return df, dfh, dfhg


# Fill the nested dictionary level by level, keeping a reference to each node
# instead of re-indexing from the root on every assignment
for db in databases:
    db_node = data_dict[db] = {}
    for analysis in analysis_types:
        analysis_node = db_node[analysis] = {}
        for column in columns_to_analyse:
            column_node = analysis_node[column] = {}
            for agg in agg_functions:
                agg_node = column_node[agg] = {}
                for ref in references:
                    ref_node = agg_node[ref] = {}
                    for group in grouppings:
                        group_node = ref_node[group] = {}
                        for target in targets:
                            # Bound at top level on purpose: a later cell inspects
                            # the `dfhg` left over from the final iteration
                            df, dfh, dfhg = _build_mock_frames(group)

                            # DataFrames are stored as JSON strings so the whole
                            # dictionary can be dumped with the json module
                            group_node[target] = {
                                'df': df.to_json(orient='split'),
                                'dfh': dfh.to_json(orient='split'),
                                'dfhg': dfhg.to_json(orient='split')
                            }

# Persist the mock database as pretty-printed JSON (4-space indent)
output_path = Path('../data/mock_database_new.json')
target_dir = output_path.parent
target_dir.mkdir(parents=True, exist_ok=True)  # make sure the destination folder exists
serialized = json.dumps(data_dict, indent=4)
output_path.write_text(serialized)

print(f"Mock database generated and saved to {output_path}")


Mock database generated and saved to ../data/mock_database_new.json


In [2]:
# Inspect the `dfhg` frame left bound by the final iteration of the generation loop
# (last database / analysis / column / aggregation / reference, group 'group2', target 'MP').
dfhg

Unnamed: 0,x,group,y
0,1,category1,3.566541
1,2,category1,8.375816
2,3,category1,0.359563
3,4,category1,7.846924
4,5,category1,6.958379
...,...,...,...
295,96,category3,1.703621
296,97,category3,2.888706
297,98,category3,1.135730
298,99,category3,2.458625
