# Brazil pinkstar data generation

In [4]:
import pandas as pd
import numpy as np
from scipy import stats

# Names of the numeric measurement columns analysed in this notebook.
numeric_columns = ['width', 'height', 'depth', 'weight']

# Read the sheet, then keep only the measurement columns of the first 40 rows
# (those rows are the measured data).
data = pd.read_excel("../coffeeDataSynthesized.xlsx", sheet_name='brazil-pinkstar')
data = data[numeric_columns].head(40)

# For every column, record the sample mean, the sample standard deviation and
# the p-value of a Kolmogorov-Smirnov test against a normal distribution that
# uses those two estimates as its parameters.
distribution_params = {}
for column in numeric_columns:
    series = data[column]
    mean, std = series.mean(), series.std()

    # NOTE(review): the normal parameters are estimated from the same sample
    # that is being tested, which tends to inflate a plain KS p-value; treat
    # the result as a rough indication only.
    ks_statistic, p_value = stats.kstest(series, 'norm', args=(mean, std))

    # Keep the per-column summary for later cells.
    distribution_params[column] = {
        'mean': mean,
        'std': std,
        'p_value': p_value,
    }

    # Report the summary for this column.
    print(f"\nDistribution analysis for {column}:")
    print(f"Mean: {mean:.3f}")
    print(f"Standard Deviation: {std:.3f}")
    print(f"Normality test p-value: {p_value:.3f}")



Distribution analysis for width:
Mean: 1.005
Standard Deviation: 0.176
Normality test p-value: 0.032

Distribution analysis for height:
Mean: 1.086
Standard Deviation: 0.132
Normality test p-value: 0.436

Distribution analysis for depth:
Mean: 0.399
Standard Deviation: 0.089
Normality test p-value: 0.005

Distribution analysis for weight:
Mean: 0.156
Standard Deviation: 0.010
Normality test p-value: 0.570


In [5]:
def generate_new_points(data, n_points, columns=None, params=None, seed=44):
    """Generate ``n_points`` synthetic rows modelled on the columns of ``data``.

    Each column is generated independently of the others:

    * if its stored normality p-value is below 0.05, values are drawn with
      replacement from the observed values (empirical bootstrap);
    * otherwise values are drawn from a normal distribution with the stored
      mean and standard deviation and then clipped to the observed
      ``[min, max]`` range, so out-of-range draws land exactly on a boundary.

    Parameters
    ----------
    data : pandas.DataFrame
        Observed data; must contain every requested column.
    n_points : int
        Number of synthetic rows to generate.
    columns : sequence of str, optional
        Columns to generate. Defaults to the notebook-level ``numeric_columns``.
    params : dict, optional
        Mapping ``column -> {'mean': ..., 'std': ..., 'p_value': ...}``.
        Defaults to the notebook-level ``distribution_params``.
    seed : int or None, optional
        Seed of the private random generator. The default (44) reproduces the
        values this function has always returned; ``None`` gives a fresh,
        non-reproducible draw.

    Returns
    -------
    pandas.DataFrame
        ``n_points`` rows with one column per requested column, in the order
        given by ``columns``.

    Raises
    ------
    KeyError
        If a requested column is missing from ``data`` or from ``params``.
    """
    # Fall back to the notebook-level definitions so existing two-argument
    # calls keep working unchanged.
    if columns is None:
        columns = numeric_columns
    if params is None:
        params = distribution_params

    # A private legacy RandomState yields exactly the stream that
    # ``np.random.seed(seed)`` followed by ``np.random.*`` calls would, but
    # without reseeding the global generator used by the rest of the notebook.
    rng = np.random.RandomState(seed)

    generated = {}
    for column in columns:
        column_params = params[column]
        observed = data[column]

        if column_params['p_value'] < 0.05:
            # Normality rejected: resample the observed values with replacement.
            values = rng.choice(observed.to_numpy(), n_points, replace=True)
        else:
            # Normality not rejected: sample from the fitted normal, then keep
            # the values inside the observed range.
            values = rng.normal(column_params['mean'], column_params['std'], n_points)
            values = np.clip(values, observed.min(), observed.max())

        generated[column] = values

    # Build the frame once instead of growing it column by column.
    return pd.DataFrame(generated)

In [7]:
# Draw 90 synthetic observations modelled on the original dataset.
new_points = generate_new_points(data, n_points=90)


def _comma_decimal(x):
    """Format a float with three decimals and ',' as the decimal separator."""
    return '{:.3f}'.format(x).replace('.', ',')


# Print the numeric columns of the synthetic dataset as plain text, with
# commas instead of dots as the decimal separator.
print(new_points[numeric_columns].to_string(float_format=_comma_decimal))

    width  height  depth  weight
0   0,950   1,350  0,300   0,159
1   0,900   1,267  0,300   0,160
2   0,800   1,226  0,350   0,145
3   0,900   1,276  0,400   0,160
4   0,950   1,093  0,400   0,160
5   0,800   1,098  0,350   0,158
6   0,900   1,023  0,600   0,145
7   0,850   1,126  0,600   0,147
8   0,850   1,233  0,600   0,151
9   0,800   1,084  0,350   0,156
10  1,000   1,086  0,350   0,158
11  0,800   1,002  0,300   0,152
12  0,900   1,296  0,350   0,146
13  0,950   1,089  0,400   0,151
14  0,950   1,160  0,300   0,162
15  1,100   1,060  0,450   0,147
16  1,300   1,147  0,300   0,165
17  0,900   0,847  0,400   0,161
18  0,950   1,104  0,400   0,146
19  0,900   1,005  0,350   0,167
20  0,950   1,258  0,400   0,160
21  1,200   1,188  0,350   0,167
22  0,900   1,049  0,450   0,161
23  1,300   0,991  0,350   0,145
24  0,950   0,962  0,350   0,141
25  0,950   1,160  0,400   0,152
26  1,300   0,995  0,600   0,163
27  1,300   1,310  0,450   0,141
28  0,900   0,948  0,350   0,143
29  1,300 