# Honduras anaerobic data generation

In [1]:
import pandas as pd
import numpy as np
from scipy import stats


# Measurement columns used throughout the notebook
numeric_columns = ['width', 'height', 'depth', 'weight']

# Load the sheet and keep only the first 40 rows, which hold the measured data
data = pd.read_excel("../coffeeDataSynthesized.xlsx", sheet_name='honduras-anaerobic')
data = data[numeric_columns][:40]

# Fit a normal distribution to every column and record how well it matches.
# Result layout: {column: {'mean': ..., 'std': ..., 'p_value': ...}}
distribution_params = {}
for column in numeric_columns:
    values = data[column]

    # Sample mean and standard deviation (pandas default: ddof=1)
    mean, std = values.mean(), values.std()

    # Kolmogorov-Smirnov test against a normal with the fitted parameters.
    # NOTE(review): the parameters are estimated from the very sample being
    # tested, which tends to inflate the KS p-value — treat it as a rough
    # screen rather than a strict normality test.
    p_value = stats.kstest(values, 'norm', args=(mean, std)).pvalue

    summary = {'mean': mean, 'std': std, 'p_value': p_value}
    distribution_params[column] = summary

    print(f"\nDistribution analysis for {column}:")
    print(f"Mean: {mean:.3f}")
    print(f"Standard Deviation: {std:.3f}")
    print(f"p-value for normality test: {p_value:.3f}")


Distribution analysis for width:
Mean: 0.849
Standard Deviation: 0.143
p-value for normality test: 0.112

Distribution analysis for height:
Mean: 1.208
Standard Deviation: 0.195
p-value for normality test: 0.237

Distribution analysis for depth:
Mean: 0.454
Standard Deviation: 0.103
p-value for normality test: 0.205

Distribution analysis for weight:
Mean: 0.150
Standard Deviation: 0.008
p-value for normality test: 0.323


In [2]:
def generate_new_points(data, n_points, columns=None, params=None, seed=44):
    """Generate synthetic rows that mimic the per-column distributions of ``data``.

    Each column is sampled independently (correlations between columns are
    not preserved):

    * if the stored normality p-value is below 0.05, values are bootstrapped
      (resampled with replacement) from the observed data;
    * otherwise values are drawn from a normal distribution with the stored
      mean/std and then clipped to the observed [min, max] range. Clipping
      moves every out-of-range draw onto the boundary, so the minimum and
      maximum can appear repeatedly in the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Measured data; must contain every column listed in ``columns``.
    n_points : int
        Number of synthetic rows to generate.
    columns : sequence of str, optional
        Columns to generate. Defaults to the notebook-level ``numeric_columns``.
    params : dict, optional
        Mapping ``column -> {'mean', 'std', 'p_value'}``. Defaults to the
        notebook-level ``distribution_params``.
    seed : int or None, optional
        Seed passed to ``np.random.seed`` before sampling (default 44, the
        value previously hard-coded). Note that this reseeds NumPy's global
        generator. Pass ``None`` to leave the global generator untouched.

    Returns
    -------
    pandas.DataFrame
        ``n_points`` rows with one column per entry in ``columns``.
    """
    # Fall back to the notebook-level configuration when not given explicitly,
    # so existing calls `generate_new_points(data, n)` behave as before.
    if columns is None:
        columns = numeric_columns
    if params is None:
        params = distribution_params

    if seed is not None:
        np.random.seed(seed)

    generated = {}
    for column in columns:
        column_params = params[column]
        # Ignore missing measurements so NaN is never resampled into the
        # synthetic data; min()/max() below already skip NaN in pandas.
        observed = data[column].dropna()

        if column_params['p_value'] < 0.05:
            # Normality rejected: empirical resampling (bootstrap)
            new_values = np.random.choice(observed.to_numpy(), n_points, replace=True)
        else:
            # Normality not rejected: sample from the fitted normal ...
            new_values = np.random.normal(column_params['mean'], column_params['std'], n_points)
            # ... and keep the values inside the observed range
            new_values = np.clip(new_values, observed.min(), observed.max())

        generated[column] = new_values

    # Build the frame once instead of growing an empty DataFrame column by column
    return pd.DataFrame(generated, columns=list(columns))

In [4]:
# Draw 90 synthetic rows from the distributions fitted to the measured data
new_points = generate_new_points(data, 90)


def decimal_comma(value):
    """Render a float with three decimals and a comma as the decimal separator."""
    return '{:.3f}'.format(value).replace('.', ',')


# Show every generated row as plain text using the decimal-comma formatter
print("\nGenerated new points:")
print(new_points[numeric_columns].to_string(float_format=decimal_comma))


Generated new points:
    width  height  depth  weight
0   0,741   0,904  0,512   0,164
1   1,038   1,445  0,237   0,144
2   1,027   0,800  0,487   0,149
3   0,619   1,251  0,454   0,146
4   0,638   1,197  0,493   0,152
5   0,603   1,262  0,386   0,157
6   1,115   1,324  0,566   0,137
7   0,861   1,408  0,356   0,156
8   0,841   1,209  0,577   0,149
9   0,928   0,866  0,411   0,159
10  0,711   1,510  0,262   0,153
11  0,823   1,082  0,402   0,137
12  0,679   1,379  0,638   0,140
13  0,936   1,145  0,284   0,160
14  0,712   1,349  0,342   0,157
15  0,901   1,195  0,487   0,148
16  1,001   1,120  0,350   0,155
17  0,832   1,374  0,407   0,145
18  0,967   0,800  0,370   0,158
19  0,675   1,132  0,519   0,148
20  0,678   1,203  0,427   0,143
21  0,880   1,269  0,578   0,151
22  0,818   1,085  0,284   0,148
23  0,646   0,821  0,601   0,154
24  0,779   1,261  0,533   0,146
25  1,021   0,966  0,472   0,149
26  0,748   1,319  0,493   0,165
27  0,852   1,270  0,401   0,158
28  0,793   1,192  0