# Import Required Libraries
Import the necessary libraries, including pandas and matplotlib.

In [None]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os import makedirs
from os.path import isdir
from gorgona.generator import BacteriaAmountGenerator

# Distribution of average amount for all bacteria

In [None]:
# Input file path and threshold handed to BacteriaAmountGenerator
input_file = 'data/input_files/average_day7.txt'
threshold = 0

# Build the generator with plot_figs=True, then work on a private copy of its
# num_bac_genomes table so the generator's own data stays untouched.
bacteria_amount_generator = BacteriaAmountGenerator(
    input_file,
    threshold,
    plot_figs=True,
    figures_path="figures",
)
bacteria_amount_df = bacteria_amount_generator.num_bac_genomes.copy()

# Limiting bacteria types for the workshop

In [None]:
# Input file path and threshold handed to BacteriaAmountGenerator
input_file = 'data/input_files/average_day7.txt'
threshold = 0

# Genomes kept for the workshop
list_of_bacteria = [
    'D300495:bin_000003',
    'D300472:bin_000003',
    'TG5_21:bin_000004',
    'D300519:bin_000004',
    'D300442:bin_000011',
    'D300492:bin_000002',
    'TG2_35:bin_000021',
    'D300433:bin_000009',
    'D300457:bin_000005',
    'D300444:bin_000010',
    'D300492:bin_000003',
]

# Build the generator with plot_figs=True and take a private copy of its table
bacteria_amount_generator = BacteriaAmountGenerator(
    input_file,
    threshold,
    plot_figs=True,
    figures_path="figures",
)
bacteria_amount_df = bacteria_amount_generator.num_bac_genomes.copy()

# Keep only the rows whose "genome" value is in the workshop list
bacteria_amount_df = bacteria_amount_df.loc[
    bacteria_amount_df["genome"].isin(list_of_bacteria)
].copy()

In [None]:
# Display the table restricted to the workshop genomes
bacteria_amount_df

In [None]:
# Display the total of the "count" column over the selected genomes
bacteria_amount_df['count'].sum()

The next step is to come up with a set of **multipliers** that brings the amount of bacteria closer to the density of bacteria observed in a real experiment. To calculate it, you have to consider both the **simulation box volume** and the **volume of a single bacterium**.

Since units in our simulations are relative, let's assume that 1 µm is equal to 0.04 of our simulation length units, that each bacterium is a sphere with radius $r$, and that the simulation box has size $(l, l, h)$, where $l$ is the width/depth of the simulation box and $h$ is its height, spanning the space where the bacteria are initially located.

Regarding the density of bacteria, Antton's calculation for the caecum of a 35-day-old chicken is **33,213,904 bacterial cells per 1 $mm^{3}$**.

Let's assume that a single bacterium is 1–3 µm in diameter (0.04–0.12 in simulation units).

Let's calculate an upper bound on the estimated simulation box size, assuming that $l=h$:


In [None]:
# Reference density: bacterial cells counted in one cubic millimeter
expected_bac_amount_per_mm3 = 33_213_904
# One cubic millimeter expressed in cubic micrometers (1 mm = 1000 micrometers)
expected_volume_in_micrometers3 = 1_000 ** 3

In [None]:
# Convert the reference density from cells per mm^3 to cells per micrometer^3
expected_density_per_micrometer3 = expected_bac_amount_per_mm3 / expected_volume_in_micrometers3
# Display the resulting density
expected_density_per_micrometer3

In [None]:
# Conversion factors between simulation length units and micrometers
sim_to_micrometer_factor = 25
micrometer_to_sim_factor = 0.04

# Simulation box edge lengths in simulation units
# (alternative kept for reference: (7,7,3))
observed_size_sim_units = (1, 1, 1)

# Bacterium diameter in micrometers (1 micrometer = 0.04 sim units)
bacterium_diameter_in_micrometers = 1

# Convert every edge to micrometers, then multiply the edges into a volume
observed_size_in_micrometers3 = tuple(
    edge * sim_to_micrometer_factor for edge in observed_size_sim_units
)
observed_volume_in_micrometers3 = np.prod(observed_size_in_micrometers3)

# Number of bacteria this box holds at the expected density
observed_bacterium_amount = (
    expected_density_per_micrometer3 * observed_volume_in_micrometers3
)
observed_bacterium_amount

In [None]:
# Volume available to each bacterium: the box volume divided by the number of
# bacteria placed in it.
allowed_space_for_one_bacterium = observed_volume_in_micrometers3 / observed_bacterium_amount
allowed_space_for_one_bacterium # in micrometers^3

In [None]:
# Volume of the cube reserved for one bacterium. The cube edge is the diameter
# increased by half a diameter (one radius) to avoid overlapping in simulation.
# NOTE: "qubic" is a misspelling of "cubic"; the name is kept because later
# cells reference it.
qubic_volume_of_bacterium = (bacterium_diameter_in_micrometers + bacterium_diameter_in_micrometers / 2) ** 3
# Display the reserved volume
qubic_volume_of_bacterium

In [None]:
# The cube reserved for one bacterium must be smaller than the space available
# per bacterium.
assert qubic_volume_of_bacterium < allowed_space_for_one_bacterium
# If qubic_volume_of_bacterium is only slightly smaller than
# allowed_space_for_one_bacterium, it is recommended to increase the simulation
# box size or decrease the bacterium diameter.

It's better to use a (10,10,10) simulation box with each bacterium having a diameter of 1 micrometer (0.04 simulation units). This gives a total of 518,967 bacteria.

Now, according to this number, we calculate multiplier:

In [None]:
# Factor that scales the summed "count" column up to the target number of
# bacteria for the simulation box.
multiplier = observed_bacterium_amount / bacteria_amount_df['count'].sum()
# Display the multiplier
multiplier

In [None]:
# Display the total of the "count" column (the divisor used for the multiplier)
bacteria_amount_df['count'].sum()

# Generate bacteria amount for calculated multiplier

In [None]:
# Directory that will receive the generated bacteria amounts
output_dir = 'data/01_bacteria_amount'

# Input file path and threshold handed to BacteriaAmountGenerator
input_file = 'data/input_files/average_day7.txt'
threshold = 0

# Rebuild the generator with plot_figs=False and copy its table
bacteria_amount_generator = BacteriaAmountGenerator(
    input_file,
    threshold,
    plot_figs=False,
    figures_path="figures",
)
bacteria_amount_df = bacteria_amount_generator.num_bac_genomes.copy()

# Keep only the rows whose "genome" value is in the workshop list
bacteria_amount_df = bacteria_amount_df.loc[
    bacteria_amount_df["genome"].isin(list_of_bacteria)
].copy()

# Scale factor from the summed counts to the target number of bacteria
# (alternative kept for reference: 25000 / bacteria_amount_df['count'].sum())
multiplier = observed_bacterium_amount / bacteria_amount_df['count'].sum()

def generate_bacteria_amount(bacteria_amount_df,
                             output_dir_path: str = "data/01_bacteria_amount/",
                             multiplier_on_bacteria_amount: float = 21):
    """Scale the per-genome counts, save them as a TSV file and return them.

    Args:
        bacteria_amount_df: DataFrame with a numeric ``count`` column. It is
            left unmodified; the scaling is applied to a copy, so calling the
            function repeatedly does not compound the multiplier.
        output_dir_path: Directory that receives ``bac_amount_workshop.tsv``.
            It is created (including parents) when it does not exist.
        multiplier_on_bacteria_amount: Factor applied to every count before
            rounding to the nearest integer.

    Returns:
        A new DataFrame whose ``count`` column holds the scaled, rounded
        integer counts.
    """
    from os.path import join

    scaled_df = bacteria_amount_df.copy()
    # Vectorised equivalent of round() per row; both round half to even.
    scaled_df['count'] = (
        (scaled_df['count'] * multiplier_on_bacteria_amount).round().astype("int64")
    )

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    makedirs(output_dir_path, exist_ok=True)
    # join() avoids a doubled separator when the directory ends with "/".
    scaled_df.to_csv(join(output_dir_path, "bac_amount_workshop.tsv"), sep='\t', index=False)
    return scaled_df

# Scale the selected genomes by the multiplier and write the table to output_dir
result_df = generate_bacteria_amount(bacteria_amount_df, output_dir, multiplier)

In [None]:
# Display the total number of bacteria after scaling
result_df['count'].sum()

In [None]:
# Every row is rounded independently, so the generated total can drift from the
# (non-integer) target by up to 0.5 per row. Exact equality with the truncated
# target is therefore not guaranteed; check the provable bound instead.
assert abs(result_df['count'].sum() - observed_bacterium_amount) <= 0.5 * len(result_df)

In [None]:
# Display the target number of bacteria for comparison with the generated total
observed_bacterium_amount

In [None]:
# Display the scaled per-genome table
result_df