# robusta data generation

In [8]:
import pandas as pd
import numpy as np
from scipy import stats

# Columns that hold the numeric measurements analysed below
numeric_columns = ['width', 'height', 'depth', 'weight']

# Load the 'robusta' sheet and keep only the first 100 rows of the numeric
# columns (the original notebook states these rows are the measured data).
data = pd.read_excel("../coffeeDataSynthesized.xlsx", sheet_name='robusta')
data = data[numeric_columns].iloc[:100]

# For every numeric column, record the sample mean, the sample standard
# deviation and the p-value of a Kolmogorov-Smirnov test against a normal
# distribution parameterised with that same mean and standard deviation.
#
# NOTE(review): the reference distribution's parameters are estimated from the
# very sample being tested, which the plain KS test does not account for
# (cf. the Lilliefors correction) - treat the p-values as approximate.
distribution_params = {}
for column in numeric_columns:
    series = data[column]
    mean, std = series.mean(), series.std()

    # Only the p-value of the test result is used downstream
    p_value = stats.kstest(series, 'norm', args=(mean, std)).pvalue

    distribution_params[column] = dict(mean=mean, std=std, p_value=p_value)

    # Report the fitted parameters for this column, one item per line
    print(
        f"\nDistribution analysis for {column}:",
        f"Mean: {mean:.3f}",
        f"Standard Deviation: {std:.3f}",
        f"Normality p-value: {p_value:.3f}",
        sep="\n",
    )


Distribution analysis for width:
Mean: 0.357
Standard Deviation: 0.347
Normality p-value: 0.000

Distribution analysis for height:
Mean: 1.080
Standard Deviation: 1.019
Normality p-value: 0.000

Distribution analysis for depth:
Mean: 0.502
Standard Deviation: 0.090
Normality p-value: 0.036

Distribution analysis for weight:
Mean: 0.139
Standard Deviation: 0.016
Normality p-value: 0.166


In [9]:
def generate_new_points(data, n_points, seed=44, columns=None, params=None):
    """Generate synthetic rows that mimic the per-column distributions of ``data``.

    For each column, one of two strategies is used depending on the stored
    normality p-value:

    * ``p_value < 0.05`` (normality rejected): bootstrap, i.e. sample the
      observed values of ``data[column]`` with replacement.
    * otherwise: draw from a normal distribution with the stored mean and
      standard deviation, then clip the draws to the observed
      ``[min, max]`` range of ``data[column]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Observed data; must contain every column listed in ``columns``.
    n_points : int
        Number of synthetic rows to generate.
    seed : int or None, default 44
        Seed for the private random generator. The default reproduces the
        values obtained with the previously hard-coded ``np.random.seed(44)``.
        ``None`` lets NumPy pick a fresh, non-reproducible seed.
    columns : sequence of str, optional
        Columns to generate. Defaults to the notebook-level
        ``numeric_columns``.
    params : dict, optional
        Mapping ``column -> {'mean', 'std', 'p_value'}``. Defaults to the
        notebook-level ``distribution_params``.

    Returns
    -------
    pandas.DataFrame
        ``n_points`` rows with one column per entry of ``columns``.
    """
    # Fall back to the notebook-level definitions so existing calls of the
    # form generate_new_points(data, n_points) keep working unchanged.
    if columns is None:
        columns = numeric_columns
    if params is None:
        params = distribution_params

    # A private legacy generator yields the same stream as
    # np.random.seed(seed) followed by np.random.* calls, but without
    # reseeding the process-wide RNG behind the caller's back.
    rng = np.random.RandomState(seed)

    generated = {}
    for column in columns:
        column_params = params[column]
        observed = data[column]

        if column_params['p_value'] < 0.05:
            # Normality rejected: empirical resampling (bootstrap)
            values = rng.choice(observed.to_numpy(), n_points, replace=True)
        else:
            # Normality not rejected: parametric draw from the fitted normal,
            # kept inside the observed range of the original data
            values = rng.normal(column_params['mean'], column_params['std'], n_points)
            values = np.clip(values, observed.min(), observed.max())

        generated[column] = values

    return pd.DataFrame(generated, columns=list(columns))

In [11]:
# Draw 180 synthetic rows modelled on the original dataset
new_points = generate_new_points(data, n_points=180)


def _format_decimal_comma(value):
    """Render a float with three decimals, using a comma as the decimal mark."""
    return '{:.3f}'.format(value).replace('.', ',')


# Print the numeric columns as plain text; every float goes through the
# comma-decimal formatter above instead of the default dot notation.
print(new_points[numeric_columns].to_string(float_format=_format_decimal_comma))

     width  height  depth  weight
0    0,800   1,300  0,425   0,144
1    0,780   0,918  0,400   0,157
2    0,070  11,100  0,550   0,146
3    0,070   1,100  0,450   0,159
4    0,685   0,995  0,500   0,107
5    0,080   0,985  0,420   0,139
6    0,080   1,000  0,315   0,146
7    0,685   1,000  0,550   0,144
8    0,875   1,000  0,600   0,140
9    0,080   1,000  0,400   0,136
10   0,900   0,985  0,450   0,161
11   0,070   1,100  0,420   0,138
12   0,090   0,900  0,500   0,140
13   0,080   1,025  0,500   0,133
14   0,090   1,045  0,400   0,139
15   0,080   0,900  0,450   0,120
16   0,080   1,150  0,645   0,156
17   0,725   0,805  0,550   0,134
18   0,705  11,100  0,600   0,135
19   0,650   1,000  0,350   0,135
20   0,080   1,000  0,510   0,151
21   0,090   0,900  0,500   0,142
22   0,080   0,900  0,500   0,161
23   0,080   1,025  0,550   0,109
24   0,060   0,975  0,500   0,137
25   0,650   0,980  0,535   0,144
26   0,785   0,925  0,500   0,131
27   0,740   1,000  0,395   0,155
28   0,080   0