# Indian Robusta data generation

In [7]:
import pandas as pd
import numpy as np
from scipy import stats


# Columns holding the physical measurements analysed in this notebook.
numeric_columns = ['width', 'height', 'depth', 'weight']

# Load the 'indian-robusta' sheet and keep only the measurement columns of the
# first 100 rows (per the original author: these rows are the measured data).
data = pd.read_excel("../coffeeDataSynthesized.xlsx", sheet_name='indian-robusta')
data = data[numeric_columns].head(100)

# Per-column summary: sample mean, sample standard deviation and the p-value of
# a Kolmogorov-Smirnov test against a normal distribution with those parameters.
distribution_params = {}
for column in numeric_columns:
    column_values = data[column]
    mean, std = column_values.mean(), column_values.std()

    # NOTE(review): the reference normal is parameterised with the mean/std
    # estimated from the very sample being tested; the plain KS p-value is
    # presumably optimistic in that setting -- confirm whether a corrected
    # test (e.g. Lilliefors) or Shapiro-Wilk is more appropriate here.
    p_value = stats.kstest(column_values, 'norm', args=(mean, std)).pvalue

    # Keep the fitted parameters so later cells can sample from them.
    distribution_params[column] = dict(mean=mean, std=std, p_value=p_value)

    # Report the fit for this column, one line per statistic.
    report_lines = (
        f"\nDistribution analysis for {column}:",
        f"Mean: {mean:.3f}",
        f"Standard Deviation: {std:.3f}",
        f"Normality test p-value: {p_value:.3f}",
    )
    print(*report_lines, sep="\n")


Distribution analysis for width:
Mean: 0.788
Standard Deviation: 0.054
Normality test p-value: 0.237

Distribution analysis for height:
Mean: 1.012
Standard Deviation: 0.099
Normality test p-value: 0.713

Distribution analysis for depth:
Mean: 0.439
Standard Deviation: 0.068
Normality test p-value: 0.754

Distribution analysis for weight:
Mean: 0.159
Standard Deviation: 0.020
Normality test p-value: 0.449


In [8]:
def generate_new_points(data, n_points, columns=None, params=None, seed=44):
    """Generate synthetic rows that mimic the per-column distribution of ``data``.

    Each column is sampled independently (correlations between columns are
    not modelled):

    * if the column's normality-test p-value is below 0.05, values are
      bootstrapped (resampled with replacement) from the observed column;
    * otherwise values are drawn from a normal distribution with the stored
      mean / standard deviation and clipped to the observed [min, max] range.

    Parameters
    ----------
    data : pandas.DataFrame
        Observed data; must contain every column listed in ``columns``.
    n_points : int
        Number of synthetic rows to generate.
    columns : sequence of str, optional
        Columns to generate. Defaults to the notebook-level ``numeric_columns``.
    params : dict, optional
        Mapping ``column -> {'mean': ..., 'std': ..., 'p_value': ...}``.
        Defaults to the notebook-level ``distribution_params``.
    seed : int or None, optional
        Seed passed to ``np.random.seed`` before sampling (default 44, the
        value that was previously hard-coded). Note that this reseeds NumPy's
        global legacy generator. Pass ``None`` to leave the global generator
        state untouched and continue from wherever it currently is.

    Returns
    -------
    pandas.DataFrame
        ``n_points`` rows with one column per entry of ``columns``.
    """
    # Fall back to the notebook-level definitions so existing calls of the
    # form generate_new_points(data, n_points) keep working unchanged.
    if columns is None:
        columns = numeric_columns
    if params is None:
        params = distribution_params

    # Reseeding makes repeated calls reproducible (same seed -> same output).
    if seed is not None:
        np.random.seed(seed)

    generated = {}
    for column in columns:
        column_params = params[column]
        observed = data[column]

        if column_params['p_value'] < 0.05:
            # Normality rejected: resample the empirical values (bootstrap).
            generated[column] = np.random.choice(observed, n_points, replace=True)
        else:
            # Normality not rejected: sample from the fitted normal ...
            samples = np.random.normal(
                column_params['mean'], column_params['std'], n_points
            )
            # ... and keep the samples inside the observed range.
            generated[column] = np.clip(samples, observed.min(), observed.max())

    # Build the frame once instead of growing it column by column.
    return pd.DataFrame(generated)

In [10]:
# Generate 180 new synthetic data points based on the original dataset
new_points = generate_new_points(data, n_points=180)


def _format_decimal_comma(value):
    """Render a float with three decimals, using a comma as the decimal separator."""
    return f"{value:.3f}".replace('.', ',')


# Print every generated row as plain text; floats use a decimal comma instead
# of a decimal point.
print(new_points[numeric_columns].to_string(float_format=_format_decimal_comma))

     width  height  depth  weight
0    0,748   1,068  0,438   0,130
1    0,859   0,805  0,376   0,172
2    0,855   1,045  0,600   0,137
3    0,702   1,013  0,419   0,130
4    0,709   1,050  0,568   0,190
5    0,696   0,948  0,488   0,154
6    0,888   1,120  0,473   0,166
7    0,793   0,919  0,394   0,167
8    0,785   1,131  0,452   0,163
9    0,818   0,971  0,388   0,162
10   0,736   0,829  0,492   0,167
11   0,778   0,963  0,432   0,147
12   0,724   1,189  0,443   0,163
13   0,820   0,850  0,593   0,147
14   0,737   0,906  0,373   0,176
15   0,807   1,044  0,356   0,161
16   0,845   0,913  0,410   0,160
17   0,782   0,967  0,481   0,168
18   0,832   0,932  0,556   0,183
19   0,723   1,075  0,412   0,180
20   0,724   0,987  0,503   0,171
21   0,800   1,131  0,394   0,154
22   0,776   0,850  0,440   0,189
23   0,712   1,154  0,315   0,143
24   0,762   1,088  0,345   0,146
25   0,852   1,029  0,503   0,190
26   0,750   1,050  0,515   0,142
27   0,789   0,962  0,507   0,171
28   0,767   0