In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.ensemble import RandomForestRegressor

In [3]:
from itertools import product

In [4]:
from commons import smoothen

# Preparations

## Loading the data

### Raw data

First we load the population size time series $N_i(t)$ :

In [5]:
# Read the raw curves from disk, then pass them through the project's
# `smoothen` helper. The second argument (10) is forwarded unchanged;
# presumably a smoothing window size — TODO confirm against `commons`.
raw_curves = np.load("curves_raw.npy")
Nt = smoothen(raw_curves, 10)

### Derivatives

We can compute the discrete derivatives, taking a unit time step $\Delta t = 1$ : $\frac{\Delta N_i(t)}{\Delta t} = \Delta N_i(t) = N_i(t+1) - N_i(t)$

In [6]:
# First-order forward difference along the last (time) axis:
# dNdt[..., t] = Nt[..., t+1] - Nt[..., t], so it is one sample shorter than Nt.
dNdt = np.diff(Nt, n = 1, axis = -1)

and the relative growth rates $\rho_i(t) = \frac{\Delta N_i(t)}{N_i(t)}$

In [7]:
# Relative growth rate: each difference is divided by the population size at
# the start of its time step (the last sample of Nt is never a denominator).
# NOTE(review): a population size of exactly 0 would give inf/nan here — confirm
# the smoothed curves are strictly positive.
size_at_step_start = Nt[..., :-1]
rho = dNdt / size_at_step_start

### Dimensions

We also get the dimensionality of our data :

In [8]:
# Sizes of the four axes of `rho` (plate, row, column, time). The time axis of
# `rho` is one sample shorter than that of `Nt`.
n_plates, n_rows, n_columns, n_points = rho.shape

# Matching 0-based index ranges, one per axis.
plates  = np.arange(n_plates)
rows    = np.arange(n_rows)
columns = np.arange(n_columns)
points  = np.arange(n_points)

## Train/Test split

We partition our population grids (32 x 48) into non-overlapping 2 x 2 lattices, from which we choose every bottom-right corner to be test data while the other ones are training data.
Here we create a 32 x 48 matrix of booleans representing whether a grid location is part of the test set :

In [9]:
# Boolean (n_rows, n_columns) mask of the test locations: a grid position is in
# the test set when both its 0-based row index and column index are odd, i.e.
# the bottom-right corner of every 2 x 2 tile.
# Built by broadcasting a column of row flags against a row of column flags,
# instead of casting an uninitialised float buffer to bool and filling it
# element by element in a Python loop.
odd_rows    = rows % 2 == 1
odd_columns = columns % 2 == 1
test = odd_rows[:, np.newaxis] & odd_columns[np.newaxis, :]

## Maximum growth

One of our regression problems involves predictions of maximum growth $\rho_\text{max}$ and its timing $t_{\rho_\text{max}}$ :

In [10]:
# Reduce over the last axis of `rho` (time; `rho` is 4-dimensional, so this is
# axis 3): the largest relative growth rate of each population and the index of
# the time step at which it occurs.
growth_max  = np.max(rho, axis = -1)
growth_tmax = np.argmax(rho, axis = -1)

## Combining everything into one `pd.DataFrame`

For convenience, let's convert everything into 1 data frame (rows : populations, columns : features) :

In [11]:
# One row per population, labelled with 1-based plate / row / column numbers.
# Every per-population array is flattened in the same (plate, row, column)
# order as the product index, so values and labels line up.
population_index = pd.MultiIndex.from_product(
    [plates + 1, rows + 1, columns + 1],
    names = ["plate", "row", "column"]
)

summary_columns = {
    # The (n_rows, n_columns) test mask is identical for every plate, so it is repeated.
    "test": np.tile(test.reshape(-1), n_plates),
    "final population size": Nt[..., -1].reshape(-1),
    "maximum growth rate": growth_max.reshape(-1),
    "maximum growth time": growth_tmax.reshape(-1)
}

# reset_index() turns the three index levels into ordinary columns.
factors = pd.DataFrame(summary_columns, index = population_index).reset_index()

# Append, for every time point t, the population size N(t), the discrete
# derivative dN/dt(t) and the relative growth rate rho(t), each flattened to one
# value per population in the same order as the rows of `factors`.
# `points` spans the time axis of `rho`, which is one sample shorter than `Nt`,
# so the last sample of `Nt` gets no "N(t)" column here.
# The columns are gathered first and attached with a single concat, because
# inserting them one at a time fragments the frame.
time_columns = {}
for t in points:
    time_columns[f"N({t})"] = Nt[..., t].reshape(-1)
    time_columns[f"dN/dt({t})"] = dNdt[..., t].reshape(-1)
    time_columns[f"rho({t})"] = rho[..., t].reshape(-1)

factors = pd.concat([factors, pd.DataFrame(time_columns, index = factors.index)], axis = 1)

In [12]:
# Persist the combined feature table (written with pandas' default CSV
# settings, so the row index is included as the first column).
factors_csv = "factors/original-nt.csv"
factors.to_csv(factors_csv)

# Time-dependent regression tasks

## $N_i(t) \;\rightarrow\; \rho_i(t)$

For every plate and every time point, we train a regressor on the training populations, using their $N_i(t)$ as input and $\rho_i(t)$ as target, and then predict $\rho_i(t)$ for all populations :

In [13]:
# Regress rho_i(t) on N_i(t) alone: one random forest per plate and per time
# point, fitted on the training populations and evaluated on every population
# of that plate.
#
# Results are collected in a plain array and wrapped in a DataFrame at the end.
# The chained form `predictions.loc[p].loc[:, t] = ...` assigns into the
# intermediate object returned by `predictions.loc[p]`, which pandas does not
# guarantee to be a view of `predictions` (and which never is one under
# copy-on-write), so the predictions could be silently discarded.
predicted = np.empty((n_plates, n_rows * n_columns, n_points))

for p in plates:
    # Plates are numbered from 1 in `factors`; the rows of one plate are in
    # (row, column) product order, matching the index built below.
    df = factors[factors["plate"] == p+1]
    # The train/test split does not depend on t, so select the training rows once.
    _df = df[~df["test"]]

    for t in points:
        # NOTE(review): no random_state is passed, so results differ between runs.
        model = RandomForestRegressor().fit(_df[[f"N({t})"]], _df[f"rho({t})"])
        predicted[p, :, t] = model.predict(df[[f"N({t})"]])

predictions = pd.DataFrame(
    data    = predicted.reshape(n_plates * n_rows * n_columns, n_points),
    index   = pd.MultiIndex.from_product((plates, rows, columns), names = ("plate", "row", "column"))
)

We then save the predicted growth rates for reuse in scoring the predictions :

In [14]:
# Restore the (plate, row, column, time) layout before writing the predicted
# growth rates to disk.
predicted_grid = predictions.to_numpy().reshape(n_plates, n_rows, n_columns, n_points)
np.save("predictions/level-1_Nt.npy", predicted_grid)

## $x_i \,,\, N_i(t) \;\rightarrow\; \rho_i(t)$

We then perform the regression with an additional input parameter, which is the location of a population (here its row and column) in the populations grid.
Additionally, we also use here RandomForestRegressor's `feature_importances_` attribute, which gives a set of linearised weights, one for each input feature :

In [15]:
# Regress rho_i(t) on the grid position (row, column) together with N_i(t):
# one random forest per plate and per time point, fitted on the training
# populations and evaluated on every population of that plate. The fitted
# model's `feature_importances_` are recorded for each (plate, time point).
#
# Results are collected in plain arrays and wrapped in DataFrames at the end.
# The chained form `predictions.loc[p].loc[:, t] = ...` assigns into the
# intermediate object returned by `predictions.loc[p]`, which pandas does not
# guarantee to be a view of `predictions` (and which never is one under
# copy-on-write), so the predictions could be silently discarded.
factor_names = ("row", "column", "N(t)")
predicted    = np.empty((n_plates, n_rows * n_columns, n_points))
weights      = np.empty((n_plates, len(factor_names), n_points))

for p in plates:
    # Plates are numbered from 1 in `factors`; the rows of one plate are in
    # (row, column) product order, matching the index built below.
    df = factors[factors["plate"] == p+1]
    # The train/test split does not depend on t, so select the training rows once.
    _df = df[~df["test"]]

    for t in points:
        # Input order matches `factor_names`, so the importances line up with it.
        inputs = ["row", "column", f"N({t})"]
        # NOTE(review): no random_state is passed, so results differ between runs.
        model = RandomForestRegressor().fit(_df[inputs], _df[f"rho({t})"])
        predicted[p, :, t] = model.predict(df[inputs])
        weights[p, :, t] = model.feature_importances_

predictions = pd.DataFrame(
    data    = predicted.reshape(n_plates * n_rows * n_columns, n_points),
    index   = pd.MultiIndex.from_product((plates, rows, columns), names = ("plate", "row", "column"))
)
importances = pd.DataFrame(
    data    = weights.reshape(n_plates * len(factor_names), n_points),
    index   = pd.MultiIndex.from_product((plates, factor_names), names = ("plate", "factor")),
    columns = pd.Index(points, name = "t")
)

We save the predicted growth rates and the importances of the factors ($N_i(t)$, row, column) :

In [16]:
# Restore the (plate, row, column, time) layout before writing the predicted
# growth rates to disk.
predicted_grid = predictions.to_numpy().reshape(n_plates, n_rows, n_columns, n_points)
np.save("predictions/level-1_position-Nt.npy", predicted_grid)

In [17]:
# Convert the (plate, factor) x t table to long format (one row per plate,
# factor and time point), then write it out as CSV.
importances_long = importances.stack().reset_index()
importances_long.to_csv("importances/position-Nt.csv")

# General growth properties regressions

## $x_i \,,\, N_i(0) \,,\, \frac{dN_i}{dt}(0) \;\rightarrow\; N_i(t_\text{final}) \,,\, \rho_\text{max} \,,\, t_{\rho_\text{max}}$

For this regression task, we don't need the predicted output values, so we compute the prediction scores immediately :

In [18]:
# Inputs (grid position and the initial state) and the three growth properties
# that are each predicted from them.
in_values    = ["row", "column", "N(0)", "dN/dt(0)"]
final_values = ["final population size", "maximum growth rate", "maximum growth time"]

# Result tables, filled entry by entry in the loop below:
#   scores      : model.score() per (plate, data set) and target
#   importances : feature_importances_ per (plate, input factor) and target
target_axis  = pd.Index(final_values, name = "final value")
scores       = pd.DataFrame(
    data     = np.empty((n_plates * 2, len(final_values))),
    index    = pd.MultiIndex.from_product((plates, ("train", "test")), names = ("plate", "data set")),
    columns  = target_axis
)
importances  = pd.DataFrame(
    data     = np.empty((n_plates * len(in_values), len(final_values))),
    index    = pd.MultiIndex.from_product((plates, in_values), names = ("plate", "factor")),
    columns  = target_axis
)

for p in plates:
    # Plates are numbered from 1 in `factors`.
    plate_df = factors[factors["plate"] == p+1]
    # Split this plate's populations once; the split is the same for every target.
    in_test  = plate_df["test"]
    train    = plate_df[~in_test]
    held_out = plate_df[in_test]

    for final_value in final_values:
        model = RandomForestRegressor().fit(train[in_values], train[final_value])

        train_score = model.score(train[in_values], train[final_value])
        test_score  = model.score(held_out[in_values], held_out[final_value])

        scores.loc[(p, "train"), final_value] = train_score
        scores.loc[(p, "test"), final_value] = test_score
        importances.loc[p, final_value] = model.feature_importances_

We then store the scores and linearised weights for each regression :

In [19]:
# Write the train/test scores of every plate and target to CSV.
scores_csv = "importances/multi-scores.csv"
scores.to_csv(scores_csv)

In [20]:
# Write the feature importances of every plate, input factor and target to CSV.
importances_csv = "importances/multi-importances.csv"
importances.to_csv(importances_csv)