# Cluster Size Distribution Analysis
This notebook showcases the analysis functions in `analysis.py` for cluster size distributions. They are demonstrated both on data simulated directly in this notebook and on data loaded from previously generated datasets.

## Setting up

In [None]:
import sys
from pathlib import Path
# Put the parent of the current working directory on the module search path
# so that the `src` package imported below can be found.
project_root = Path("..").resolve()
if str(project_root) not in sys.path:
    # only added once, and at the front so it is searched first
    sys.path.insert(0, str(project_root))

import src.CA_model as CA
import src.analysis as an
import src.utils as ut
import numpy as np
import matplotlib.pyplot as plt
import time
from importlib import reload

#### Reload our own modules in case they are updated

In [None]:
# Run this cell to re-import CA_model.py, analysis.py and utils.py, so that
# edits made to those files are picked up by the already imported modules.
reload(CA)
reload(an)
reload(ut)

## Distribution of a single dataset
The following cells show how to generate the cluster size distribution of a single simulation. For convenience, the data is generated directly in the notebook instead of being saved and loaded first.

In [None]:
# --- simulation parameters -------------------------------------------------
size = 100                            # width and height of the grid
p = 0.5                               # starting fraction of vegetation
update_rule = CA.update_Scanlon2007   # function containing the update rule
true_frac = 0.2                       # 'natural' (equilibrium) fraction of vegetation
k = 3                                 # strength of local interactions
M = 10                                # radius of neighbourhood
N_steps = 200                         # number of iterations
skip = 0                              # iterations to skip (equilibration period)
seed = 0                              # seed handed to the simulation

# collect the settings once, then run a single simulation with them
sim_kwargs = dict(
    size=size,
    p=p,
    update_rule=update_rule,
    true_frac=true_frac,
    k=k,
    M=M,
    N_steps=N_steps,
    skip=skip,
    seed=seed,
)
grids = CA.evolve_CA(**sim_kwargs)

### Fit results and statistics

In [None]:
# Collect the cluster sizes of the simulated grids and the fit object that
# belongs to them.
size_list, fit = an.cluster_sizes(grids)

# Parameters of the fitted truncated power law.
truncated_pl = fit.truncated_power_law
alpha = truncated_pl.alpha
s_char = 1 / truncated_pl.Lambda      # inverse of the fitted Lambda

# Compare the truncated power law against an exponential distribution.
# The significance is stored as `p_value` rather than `p`, so that it does not
# overwrite the notebook-wide parameter `p` (starting fraction of vegetation).
R, p_value = fit.distribution_compare("truncated_power_law", "exponential", normalized_ratio=True)

print("Scaling exponent: ", alpha)
print("Characteristic length: ", s_char)
print("Loglikelihood ratio (if positive, (truncated) power law more likely than exponential): ", R)
print("Significance value: ", p_value)

### Plot of the data and fit with truncated power law

In [None]:
# plot the cluster sizes of this single run together with their fit
fig = an.plot_cluster_size_distr(
    size_lists=[size_list],
    fits=[fit],
)

## Average distribution of several datasets
This section shows how data from different simulations (with different seeds, but the same set of parameters) can be combined into one set, which can then be passed on to `cluster_sizes()`. The main idea is that the grids of all simulations are combined into one list.

In [None]:
# --- simulation parameters (identical for every run) ------------------------
size = 100                            # width and height of the grid
p = 0.5                               # starting fraction of vegetation
update_rule = CA.update_Scanlon2007   # function containing update rule
true_frac = 0.2                       # 'natural' (equilibrium) fraction of vegetation
k = 3                                 # strength of local interactions
M = 10                                # radius of neighbourhood
N_steps = 200                         # number of iterations
skip = 100                            # iterations to skip (equilibration period)
starting_seed = 0                     # run i uses seed starting_seed + i

N_evolutions = 5                      # number of full evolutions to generate for this set of parameters
all_grids = []                        # one entry per run: the grids returned by that run

for i in range(N_evolutions):
    # perf_counter is a monotonic clock meant for measuring durations, so the
    # reported time cannot be distorted by system clock adjustments
    start = time.perf_counter()
    seed = starting_seed + i
    grids = CA.evolve_CA(
        size=size,
        p=p,
        update_rule=update_rule,
        true_frac=true_frac,
        k=k,
        M=M,
        N_steps=N_steps,
        skip=skip,
        seed=seed,
    )
    all_grids.append(grids)
    end = time.perf_counter()
    print(f"Grid evolution {i+1} out of {N_evolutions} completed in {end-start} seconds.")

In [None]:
# merge the per-run collections of grids into one flat list of grids
combined_grids = []
for run_grids in all_grids:
    combined_grids.extend(run_grids)

In [None]:
# Collect the cluster sizes of the combined grids of all runs and the fit
# object that belongs to them.
size_list, fit = an.cluster_sizes(combined_grids)

# Parameters of the fitted truncated power law.
truncated_pl = fit.truncated_power_law
alpha = truncated_pl.alpha
s_char = 1 / truncated_pl.Lambda      # inverse of the fitted Lambda

# Compare the truncated power law against an exponential distribution.
# The significance is stored as `p_value` rather than `p`, so that it does not
# overwrite the notebook-wide parameter `p` (starting fraction of vegetation).
R, p_value = fit.distribution_compare("truncated_power_law", "exponential", normalized_ratio=True)

print("Scaling exponent: ", alpha)
print("Characteristic length: ", s_char)
print("Loglikelihood ratio (if positive, (truncated) power law more likely than exponential): ", R)
print("Significance value: ", p_value)

In [None]:
# plot the cluster sizes of the combined runs together with their fit
fig = an.plot_cluster_size_distr(
    size_lists=[size_list],
    fits=[fit],
)

## Several cluster size distributions in one plot
In the previous cells, each plot showed a single cluster size distribution. The following cells show how to plot several distributions, corresponding to different parameter settings, together in a single plot. Since this quickly requires a lot of data, this part is based on loaded data instead of data generated in-line. To generate the required data, see `00_data_management.ipynb`.

In [None]:
# --- settings that identify the stored datasets to analyse ------------------
size = 500                                # width and height of the grid
p = 0.5                                   # starting fraction of vegetation
update_rule = CA.update_Scanlon2007       # function containing update rule
true_fracs = np.arange(0.05, 0.7, 0.05)   # 'natural' (equilibrium) fractions of vegetation
k = 3                                     # strength of local interactions
M = 20                                    # radius of neighbourhood
N_steps = 200                             # number of iterations
skip = 100                                # iterations to skip (equilibration period)
starting_seed = 0                         # dataset i is looked up with seed starting_seed + i

size_lists, fits = [], []

for i, frac in enumerate(true_fracs):
    # the fraction is rounded to two decimals before it is used for the lookup
    loaded_grids = ut.load_data_wo_phi(
        size, update_rule, np.round(frac, 2), k, M, N_steps, skip, starting_seed + i
    )
    # cluster sizes and fit for this value of the true fraction
    size_list, fit = an.cluster_sizes(loaded_grids)
    size_lists.append(size_list)
    fits.append(fit)

In [None]:
# all distributions in one figure, one entry per value of f*
fig = an.plot_cluster_size_distr(
    size_lists=size_lists,
    fits=fits,
    params=true_fracs,
    param_name=r"$f^{*}$",
)

# the file name encodes the parameter set the figure was made for
figname = f"DISTR_{update_rule.__name__}_size={size}_Nsteps={N_steps}_skip={skip}_truefrac={np.round(true_fracs[0],2)}to{np.round(true_fracs[-1],2)}_k={k}_M={M}_seed={starting_seed}.pdf"
out_dir = "../results/figures/"
fig.savefig(out_dir + figname)

In [None]:
# fitted alpha as a function of the true fraction
fig = an.plot_alpha_vs_true_frac(
    fits=fits,
    true_fracs=true_fracs,
)

# the file name encodes the parameter set the figure was made for
figname = f"ALPHA_{update_rule.__name__}_size={size}_Nsteps={N_steps}_skip={skip}_truefrac={np.round(true_fracs[0],2)}to{np.round(true_fracs[-1],2)}_k={k}_M={M}_seed={starting_seed}.pdf"
out_dir = "../results/figures/"
fig.savefig(out_dir + figname, bbox_inches='tight')

In [None]:
# save the fit statistics figure twice: once with plot_all=True, once with
# plot_all=False; the flag is appended to the file name to tell them apart
out_dir = "../results/figures/"
for plot_all in (True, False):
    fig = an.plot_fit_statistics_vs_true_frac(
        fits=fits,
        true_fracs=true_fracs,
        plot_all=plot_all,
    )
    figname = f"STATISTICS_{update_rule.__name__}_size={size}_Nsteps={N_steps}_skip={skip}_truefrac={np.round(true_fracs[0],2)}to{np.round(true_fracs[-1],2)}_k={k}_M={M}_seed={starting_seed}_{plot_all}.pdf"
    fig.savefig(out_dir + figname, bbox_inches='tight')

## Average of several datasets for different sets of parameters
As the icing on the cake, this section combines the two previous parts of the notebook. Here we plot different datasets in one plot, where each one is an average over several runs, obtained by pooling the grids of runs with different seeds for the same set of parameters. Again, the data is loaded from previously generated datasets.

In [None]:
# --- settings that identify the stored datasets to analyse ------------------
size = 500                                # width and height of the grid
p = 0.5                                   # starting fraction of vegetation
update_rule = CA.update_Scanlon2007       # function containing update rule
true_fracs = np.arange(0.05, 0.7, 0.05)   # 'natural' (equilibrium) fractions of vegetation
# (set e.g. true_fracs=[0.45] instead to analyse a single value)
k = 3                                     # strength of local interactions
M = 20                                    # radius of neighbourhood
N_steps = 200                             # number of iterations
skip = 100                                # iterations to skip (equilibration period)
starting_seeds = [0, 100, 200]            # starting seeds used for the different iterations per set of parameters

size_lists, fits = [], []

for i, frac in enumerate(true_fracs):
    # load one dataset per starting seed; dataset i is looked up with seed
    # starting_seed + i and the fraction rounded to two decimals
    all_grids = []
    for starting_seed in starting_seeds:
        loaded_grids = ut.load_data_wo_phi(
            size, update_rule, np.round(frac, 2), k, M, N_steps, skip, starting_seed + i
        )
        all_grids.append(loaded_grids)

    # merge the per-seed collections into one flat list of grids
    combined_grids = []
    for seed_grids in all_grids:
        combined_grids.extend(seed_grids)

    # cluster sizes and fit over the pooled grids for this true fraction
    size_list, fit = an.cluster_sizes(combined_grids)
    size_lists.append(size_list)
    fits.append(fit)

In [None]:
# all distributions in one figure, one entry per value of f*
fig = an.plot_cluster_size_distr(
    size_lists=size_lists,
    fits=fits,
    params=true_fracs,
    param_name=r"$f^{*}$",
)

# the file name encodes the parameter set and the number of seeds combined
figname = f"DISTR_{update_rule.__name__}_size={size}_Nsteps={N_steps}_skip={skip}_truefrac={np.round(true_fracs[0],2)}to{np.round(true_fracs[-1],2)}_k={k}_M={M}_Nseeds={len(starting_seeds)}.pdf"
out_dir = "../results/figures/"
fig.savefig(out_dir + figname)

In [None]:
# fitted alpha as a function of the true fraction
fig = an.plot_alpha_vs_true_frac(
    fits=fits,
    true_fracs=true_fracs,
)

# the file name encodes the parameter set and the number of seeds combined
figname = f"ALPHA_{update_rule.__name__}_size={size}_Nsteps={N_steps}_skip={skip}_truefrac={np.round(true_fracs[0],2)}to{np.round(true_fracs[-1],2)}_k={k}_M={M}_Nseeds={len(starting_seeds)}.pdf"
out_dir = "../results/figures/"
fig.savefig(out_dir + figname, bbox_inches='tight')

In [None]:
# save the fit statistics figure twice: once with plot_all=True, once with
# plot_all=False; the flag is appended to the file name to tell them apart
out_dir = "../results/figures/"
for plot_all in (True, False):
    fig = an.plot_fit_statistics_vs_true_frac(
        fits=fits,
        true_fracs=true_fracs,
        plot_all=plot_all,
    )
    figname = f"STATISTICS_{update_rule.__name__}_size={size}_Nsteps={N_steps}_skip={skip}_truefrac={np.round(true_fracs[0],2)}to{np.round(true_fracs[-1],2)}_k={k}_M={M}_Nseeds={len(starting_seeds)}_{plot_all}.pdf"
    fig.savefig(out_dir + figname, bbox_inches='tight')