### Introduction

This notebook produces distributions that are randomly sampled from the space of
distributions whose entropy is fixed at a given percentage of the maximum entropy.

In [None]:
import os
import numpy as np
from scipy import stats
import pickle
import probability_distributions
import limited_entropy_distributions as limited_entropy

In [None]:
# --- where the generated files are stored ---------------------------------
PATH = "/home/joboti/azumi_derkjan/master_thesis/code/"
TOP_FOLDER = "system_distributions/limited_entropy/"

# --- which distributions are generated ------------------------------------
# Target entropy, expressed as a percentage of the maximum entropy.
PERCENTAGE_MAX_ENTROPY = 75
NUMBER_OF_STATES = 3
MIN_NUMBER_VARIABLES = 2
MAX_NUMBER_VARIABLES = 7

# --- sample bookkeeping ----------------------------------------------------
# Sample numbers in [START_NUMBER_SAMPLES, END_NUMBER_SAMPLES) are processed.
# NOTE(review): NUMBER_OF_SAMPLES is not referenced by the visible cells;
# it presumably mirrors END - START -- confirm before relying on it.
NUMBER_OF_SAMPLES = 100
START_NUMBER_SAMPLES = 200
END_NUMBER_SAMPLES = 300

# Third argument handed to get_dist_percentage_max_entropy_exactly.
MAX_CHANGE = 10 ** -3

# --- folder and file name templates ----------------------------------------
FOLDER_PERCENTAGE_MAX_ENTROPY = "entropy%d/" % PERCENTAGE_MAX_ENTROPY
FOLDER_DISTRIBUTION_SHAPE_FORMAT = "{}var_{}states/"
FILENAME_DISTRIBUTION_FORMAT = "dist_{}_exact.npy"
FILENAME_INPUT_FORMAT = "input_dist_exact_{}.npy"
FILENAME_COND_OUTPUT_FORMAT = "cond_output_dist_exact_{}.npy"



#### generate the folder structure (if needed)

In [None]:
# Collect every folder the notebook writes to, parents first, then create
# whichever ones are still missing.
entropy_root = PATH + TOP_FOLDER + FOLDER_PERCENTAGE_MAX_ENTROPY
required_directories = [PATH + TOP_FOLDER, entropy_root]

# One sub-folder per distribution shape (number of variables x states).
for number_of_variables in range(MIN_NUMBER_VARIABLES, MAX_NUMBER_VARIABLES + 1):
    required_directories.append(
        entropy_root
        + FOLDER_DISTRIBUTION_SHAPE_FORMAT.format(number_of_variables, NUMBER_OF_STATES)
    )

for directory in required_directories:
    if not os.path.exists(directory):
        os.makedirs(directory)


In [None]:
# Imported under its full name as well: the cells below call it that way.
import limited_entropy_distributions

# Quick sanity check on a small 3x3 distribution at 80% of maximum entropy:
# print the target entropy, then the total probability mass and the entropy
# (in bits) of one generated distribution.
check_shape = [3]*2
check_fraction = 0.8
print(np.log2(3**2) * check_fraction)

dist = limited_entropy_distributions.get_dist_percentage_max_entropy_exactly(
    check_shape, check_fraction, 10**(-3)
)
total_mass = np.sum(dist)
print(total_mass)
entropy_in_bits = stats.entropy(dist.flatten(), base=2)
print(entropy_in_bits)

In [None]:

# Generate and store the joint distributions: for every number of variables,
# draw one distribution per sample number and keep redrawing until its
# entropy lies within 5% (relative) of the goal entropy.
RUN = True
if RUN:
    for number_of_variables in range(MIN_NUMBER_VARIABLES, MAX_NUMBER_VARIABLES + 1):
        print("number of variables {}".format(number_of_variables))
        directory = "".join([
            PATH, TOP_FOLDER, FOLDER_PERCENTAGE_MAX_ENTROPY,
            FOLDER_DISTRIBUTION_SHAPE_FORMAT.format(number_of_variables, NUMBER_OF_STATES),
        ])
        dist_shape = [NUMBER_OF_STATES] * number_of_variables
        # goal entropy in bits: the configured percentage of log2(#joint states)
        goal_entropy = np.log2(NUMBER_OF_STATES**number_of_variables) * PERCENTAGE_MAX_ENTROPY/100.0
        print("goal entropy {}".format(goal_entropy))

        for sample_number in range(START_NUMBER_SAMPLES, END_NUMBER_SAMPLES):
            # rejection loop: leave only once a draw is close enough to the goal
            while True:
                dist = limited_entropy_distributions.get_dist_percentage_max_entropy_exactly(
                    dist_shape, PERCENTAGE_MAX_ENTROPY/100.0, MAX_CHANGE
                )
                dist_entropy = stats.entropy(dist.flatten(), base=2)
                if abs(dist_entropy/goal_entropy - 1) < 0.05:
                    break

            print("sample number {} entropy {}".format(sample_number, dist_entropy))

            # store the accepted distribution
            file_name = FILENAME_DISTRIBUTION_FORMAT.format(sample_number)
            with open(directory + file_name, 'wb') as f:
                np.save(f, dist)


#### From the joint distribution, take the input distribution (the marginal over the first N-1 axes) and the output distribution (the Nth axis) conditioned on the input

In [None]:
# Cell switches: RUN gates the processing loop of this cell, TEST additionally
# turns on the check that the stored pieces rebuild the original joint.
RUN = True
TEST = True
limited_entropy.PRINT = False

# Settings forwarded (via produce_conditional_states) to
# limited_entropy.get_dist_with_entropy when a conditional output
# distribution has to be generated.
evolutionary_params_generate_conditional_output_stats = dict(
    number_of_generations=700,
    population_size=20,
    number_of_children=80,
    generational=False,
    mutation_method="step_wise_after",
    number_of_mutations=3,
    mutation_size=0.01,
    parent_selection_mode="rank_exponential",
    early_stopping_criterium=0.01,
)

def produce_conditional_states(entropy_size, evolution_params, number_of_states=NUMBER_OF_STATES):
    """Yield an endless stream of distributions with roughly the given entropy.

    Every yielded item is the result of
    ``limited_entropy.get_dist_with_entropy`` for a target entropy drawn
    uniformly from ``entropy_size`` plus or minus 15%.

    Parameters
    ----------
    entropy_size : float
        Entropy around which the per-item target entropy is drawn.
    evolution_params : dict
        Passed through unchanged to ``limited_entropy.get_dist_with_entropy``.
    number_of_states : int, optional
        First argument of ``limited_entropy.get_dist_with_entropy``;
        defaults to the notebook-wide ``NUMBER_OF_STATES``.

    Yields
    ------
    Whatever ``limited_entropy.get_dist_with_entropy`` returns.
    """
    max_deviation = 0.15*entropy_size
    while True:
        new_entropy_size = entropy_size + np.random.uniform(-max_deviation, max_deviation)
        # Forward the caller-supplied number of states; the global constant
        # was previously used here, silently ignoring the parameter.
        yield limited_entropy.get_dist_with_entropy(
            number_of_states, new_entropy_size,
            evolution_params, verbose=False
        )

# For every stored joint distribution: save the marginal over the input
# variables and the output distribution conditioned on the input, and
# (when TEST is set) verify that the two pieces rebuild the joint.
if RUN:
    # never go below two variables, whatever MIN_NUMBER_VARIABLES says
    # (presumably: at least one input axis besides the output axis)
    min_number_of_variables = max(MIN_NUMBER_VARIABLES, 2)
    for number_of_variables in range(min_number_of_variables, MAX_NUMBER_VARIABLES+1, 1):
        print("number of variables {}".format(number_of_variables))
        directory = (
            PATH + TOP_FOLDER + FOLDER_PERCENTAGE_MAX_ENTROPY
            + FOLDER_DISTRIBUTION_SHAPE_FORMAT.format(number_of_variables, NUMBER_OF_STATES)
        )
        dist_shape = [NUMBER_OF_STATES]*number_of_variables
        # the last axis is the output, every axis before it belongs to the input
        input_labels = set(range(number_of_variables-1))
        output_label = set([number_of_variables-1])
        for sample_number in range(START_NUMBER_SAMPLES, END_NUMBER_SAMPLES, 1):
            print("sample number {}".format(sample_number))
            # load the joint distribution
            file_name = FILENAME_DISTRIBUTION_FORMAT.format(sample_number)
            with open(directory+file_name, 'rb') as f:
                joint = np.load(f)

            # produce the marginal over the input variables
            joint_dist = probability_distributions.ProbabilityArray(joint)
            input_dist = joint_dist.marginalize(input_labels)

            # save the marginal (representing the input distribution)
            file_name = FILENAME_INPUT_FORMAT.format(sample_number)
            with open(directory+file_name, 'wb') as f:
                np.save(f, input_dist)

            flat_input = input_dist.flatten()
            print("number of input states with zero probability {}".format(
                flat_input.shape[0]-np.count_nonzero(flat_input)
            ))

            # Entropy (in bits) of the output slice of the joint for every
            # input state with non-zero probability; stats.entropy normalises
            # each slice itself. The mean is the target entropy handed to the
            # generator below.
            entropies = [stats.entropy(joint[tuple(state)], base=2) for state in np.argwhere(input_dist != 0)]
            average_entropy = np.mean(entropies)
            print(average_entropy)
            generator = produce_conditional_states(
                average_entropy, evolutionary_params_generate_conditional_output_stats
            )
            # produce the conditional output; the generator is handed to the
            # helper (judging by its name, for input states with zero
            # marginal probability -- confirm in probability_distributions)
            cond_output, mar_labels, cond_labels = joint_dist.find_conditional_accounting_for_zero_marginals(
                output_label, input_labels, generator
            )

            # save cond_output
            file_name = FILENAME_COND_OUTPUT_FORMAT.format(sample_number)
            with open(directory+file_name, 'wb') as f:
                np.save(f, cond_output)

            if TEST:
                # the stored pieces must multiply back to the original joint
                computed_joint = probability_distributions.compute_joint(input_dist, cond_output, cond_labels)
                if not np.allclose(computed_joint, joint):
                    raise ValueError(
                        "joint rebuilt from the input and conditional output distributions "
                        "differs from the stored joint ({} variables, sample {})".format(
                            number_of_variables, sample_number
                        )
                    )
                
            
