In [1]:
%load_ext line_profiler
%load_ext snakeviz

In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import numba as nb
import numpy as np
import pandas as pd
import seaborn as sns
from src.config import BLD, SRC
from src.create_initial_states.task_build_full_params import (
    _convert_index_to_int_where_possible,
)
from src.simulation.plotting import plot_incidences, style_plot

In [38]:
# Seed NumPy's global generator so the synthetic columns below are identical after
# every kernel restart; otherwise the timings and the warnings further down depend
# on whichever random draw happened to be made.
np.random.seed(0)

states = pd.read_parquet(BLD / "data" / "initial_states.parquet")
states["date"] = pd.Timestamp("2021-03-01")

# Synthetic health states, drawn independently per individual with the
# probabilities given in ``p``.
states["infectious"] = np.random.choice([True, False], size=len(states), p=[0.99, 0.01])
states["symptomatic"] = np.random.choice(
    [True, False], size=len(states), p=[0.99, 0.01]
)
states["cd_infectious_true"] = np.random.choice(
    [-104, 0, -1, 3], size=len(states), p=[0.97, 0.01, 0.01, 0.01]
)
states["pending_test"] = np.random.choice(
    [True, False], size=len(states), p=[0.03, 0.97]
)
states["knows_immune"] = np.random.choice(
    [True, False], size=len(states), p=[0.07, 0.93]
)
# Derived column: anyone who is infectious, symptomatic or has a non-negative
# ``cd_infectious_true`` counts as currently infected.
states["currently_infected"] = states.eval(
    "(infectious | symptomatic | (cd_infectious_true >= 0))"
)

# Everyone starts with exactly one contact.
contacts = pd.Series(1, index=states.index)

# Reduce Contacts on Condition

**Klara**
- 35.5% go to reduce_recurrent_model
- in reduce_recurrent_model it looks as if 85% are on the last line of the docstring. That's strange.

**Tobi**
- 40% of the time is spent evaluating the condition. Why don't we precompute the conditions and pass the resulting boolean column as the condition?
- Replace ``np.random.choice([True, False], ...)`` with ``boolean_choices`` which halves runtime.

In [39]:
def reduce_contacts_on_condition(
    contacts, states, multiplier, condition, seed, is_recurrent
):
    """Reduce the contacts of the individuals for whom a condition is fulfilled.

    Individuals for whom the condition is False keep their original contacts. For
    the others the reduction depends on the type of the contact model:

    - non-recurrent: the number of contacts is scaled by ``multiplier``.
    - recurrent: ``reduce_recurrent_model`` decides randomly who keeps the contact.

    Args:
        contacts (pandas.Series): The series with contacts.
        states (pandas.DataFrame): The states of one day passed by sid.
        multiplier (float): Factor applied to the contacts (non-recurrent) or share
            of people who maintain their contacts (recurrent) despite the condition.
        condition (str or boolean pandas.Series / numpy.ndarray): Defines the subset
            of individuals who potentially reduce their contacts. A string is
            evaluated with ``states.eval``. Anything else is used directly as a
            boolean mask, which allows callers to precompute the condition once
            instead of re-evaluating it on every call.
        seed (int): Seed passed to ``np.random.seed``.
        is_recurrent (bool): Whether the contact model is recurrent.

    Returns:
        pandas.Series: The reduced contacts.

    """
    np.random.seed(seed)
    if is_recurrent:
        reduced = reduce_recurrent_model(states, contacts, seed, multiplier)
    else:
        reduced = multiplier * contacts

    if isinstance(condition, str):
        is_condition_true = states.eval(condition)
    else:
        # Already a boolean mask: skip the comparatively expensive ``eval``.
        is_condition_true = condition

    return reduced.where(is_condition_true, contacts)


def reduce_recurrent_model(states, contacts, seed, multiplier):
    """Reduce the number of recurrent contacts taking place by a multiplier.

    For recurrent contacts only whether the contacts Series is > 0 plays a role.
    Therefore, simply multiplying the number of contacts with it would not have
    an effect on the number of contacts taking place. Instead we make a random share of
    individuals scheduled to participate not participate.

    Args:
        states (pandas.DataFrame): The states. Supplies the index of the result and,
            if ``multiplier`` is a Series, the current date.
        contacts (pandas.Series): The scheduled contacts. Entries that are not > 0
            are passed through unchanged.
        seed (int): Seed passed to ``np.random.seed``.
        multiplier (float or pd.Series): Must be smaller or equal to one. If a
            Series is supplied the index must be dates.

    Returns:
        pandas.Series: Series with the index of ``states``. Positive contacts are
            replaced by a random draw of 1 (with probability ``multiplier``) or 0.

    """
    np.random.seed(seed)
    if isinstance(multiplier, pd.Series):
        # Imported locally so that the function does not rely on a later notebook
        # cell having been executed to bring ``get_date`` into scope.
        from sid.time import get_date

        date = get_date(states)
        multiplier = multiplier[date]

    contacts = contacts.to_numpy()
    resampled_contacts = np.random.choice(
        [1, 0], size=len(states), p=[multiplier, 1 - multiplier]
    )
    reduced = np.where(contacts > 0, resampled_contacts, contacts)
    return pd.Series(reduced, index=states.index)

In [40]:
%%timeit

reduce_contacts_on_condition(
    contacts,
    states,
    multiplier=0.5,
    condition="occupation == 'working'",
    seed=99,
    is_recurrent=False,
)

29.2 ms ± 1.81 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [41]:
%lprun -f reduce_contacts_on_condition reduce_contacts_on_condition(contacts,states,multiplier=0.5,condition="occupation == 'working'",seed=99,is_recurrent=False)

In [42]:
%%timeit

reduce_contacts_on_condition(
    contacts,
    states,
    multiplier=0.5,
    condition="occupation == 'working'",
    seed=99,
    is_recurrent=True,
)

50.5 ms ± 1.18 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
%lprun -f reduce_contacts_on_condition reduce_contacts_on_condition(contacts,states,multiplier=0.5,condition="occupation == 'working'",seed=99,is_recurrent=True)

In [9]:
%%timeit

reduce_recurrent_model(states, contacts, 323, 0.5)

23.6 ms ± 156 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
%lprun -f reduce_recurrent_model reduce_recurrent_model(states, contacts, 323, 0.5)

In [11]:
def boolean_choices(truth_probabilities):
    """Sample boolean values with the given probabilities of being ``True``.

    Args:
        truth_probabilities (numpy.ndarray or pandas.Series): One probability per
            draw. Each must be between 0 and 1.

    Returns:
        Boolean array-like with one entry per probability.

    Example:
        >>> boolean_choices(np.array([1, 0]))
        array([ True, False])

    """
    u = np.random.uniform(0, 1, size=len(truth_probabilities))
    # The draws come from the half-open interval [0, 1). The comparison has to be
    # strict so that a probability of 0 can never yield True (``0.0 <= 0`` would),
    # while a probability of 1 still always yields True.
    return u < truth_probabilities

In [12]:
%%timeit

np.random.choice([True, False], size=len(states), p=[0.5, 1 - 0.5])

21.1 ms ± 188 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
%%timeit

boolean_choices(np.full(len(states), 0.5))

10.2 ms ± 36.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
from numba import njit

@njit
def boolean_choices2(truth_probability, size=None):
    """Numba-compiled variant of ``boolean_choices``.

    Draws ``size`` uniform numbers between 0 and 1 and compares them against
    ``truth_probability``.

    Args:
        truth_probability: Probability of ``True``; compared against every draw.
        size: Passed on as ``size`` to ``np.random.uniform``.

    Returns:
        Result of ``u <= truth_probability`` for the uniform draws ``u``.

    """
    u = np.random.uniform(0, 1, size=size)
    return u <= truth_probability
    
        

In [19]:
%%timeit 


boolean_choices2(0.5, len(states))

6.61 ms ± 40.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Find Educ Workers With Zero Students

**Klara**
- Somehow the wrong lines ended up in the text file for `find_educ_workers_with_zero_students`;
  they do not belong to that function at all.
  
- In `_find_size_zero_classes`, 80% of the time goes to the `class_sizes = ... .groupby` line.

In [20]:
def _find_educ_workers_with_zero_students(contacts, states, group_id_column):
    """Flag educ_workers whose class / group has no attending children.

    Args:
        contacts (pandas.Series): Contacts, passed on to ``_find_size_zero_classes``.
        states (pandas.DataFrame): Must contain "educ_worker" and
            ``group_id_column``.
        group_id_column (str): Column holding the class / group ids.

    Returns:
        has_no_class (pandas.Series): boolean Series with the same index as
            states. True for educ_workers whose class / group id is among the
            ids returned by ``_find_size_zero_classes``.

    """
    empty_group_ids = _find_size_zero_classes(contacts, states, group_id_column)
    in_empty_group = states[group_id_column].isin(empty_group_ids)
    return states["educ_worker"] & in_empty_group


def _find_size_zero_classes(contacts, states, col):
    students_group_ids = states[col][~states["educ_worker"]]
    students_contacts = contacts[~states["educ_worker"]]
    # the .drop(-1) is needed because we use -1 instead of NaN to identify
    # individuals not participating in a recurrent contact model
    class_sizes = students_contacts.groupby(students_group_ids).sum().drop(-1)
    size_zero_classes = class_sizes[class_sizes == 0].index
    return size_zero_classes

In [21]:
# Boolean stand-in for school contacts: True for everyone whose occupation is
# "school" or "school_teacher". Used as the ``contacts`` argument in the timing
# cells below.
school_contacts = states["occupation"].isin(["school", "school_teacher"])

In [22]:
%%timeit

_find_educ_workers_with_zero_students(
    contacts=school_contacts, states=states, group_id_column="school_group_id_1"
)

44.4 ms ± 774 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [23]:
%lprun -f _find_educ_workers_with_zero_students _find_educ_workers_with_zero_students(contacts=school_contacts, states=states, group_id_column="school_group_id_1")

In [24]:
%%timeit

_find_size_zero_classes(
    contacts=school_contacts, states=states, col="school_group_id_2"
)

44.3 ms ± 1.73 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [25]:
%lprun -f _find_size_zero_classes _find_size_zero_classes(contacts=school_contacts, states=states, col="school_group_id_2")

# Reduce Work Model

**Klara**
- 1/3 went into asserting that the Länder in the states and thresholds fit together 

    $\Rightarrow$ I moved that check to check_initial_states and replaced it here with a check
    that there are no NaN after the `.map` operation. That takes 8% of the time
    and could be removed.
    
    
$\Rightarrow$ `.where` statements take ~40% of the time. the `.map` statement 30%.

In [26]:
from sid.time import get_date


def reduce_work_model(states, contacts, seed, multiplier, is_recurrent):  # noqa: U100
    """Reduce contacts for the working population.

    Only individuals whose "work_contact_priority" is above ``1 - multiplier`` keep
    their work contacts.

    Args:
        states (pandas.DataFrame): Must contain "work_contact_priority" and, for
            state-specific multipliers, "state".
        contacts (pandas.Series): The work contacts.
        seed (int): Unused.
        multiplier (float, pandas.Series, pandas.DataFrame):
            share of workers that have work contacts.
            If it is a Series or DataFrame, the index must be dates.
            If it is a DataFrame the columns must be the values of
            the "state" column in the states.
        is_recurrent (bool): True if the contact model is recurrent.

    Returns:
        pandas.Series: ``contacts`` where individuals at or below the threshold are
            set to 0 (non-recurrent) or False (recurrent).

    """
    if isinstance(multiplier, (pd.Series, pd.DataFrame)):
        date = get_date(states)
        multiplier = multiplier.loc[date]

    if isinstance(multiplier, (float, int)):
        is_valid = 0 <= multiplier <= 1
    else:
        is_valid = (multiplier >= 0).all() and (multiplier <= 1).all()
    # The message, including its date lookup, is only built if the check fails
    # instead of on every call.
    assert is_valid, f"Work multiplier not in [0, 1] on {get_date(states)}"

    threshold = 1 - multiplier
    if isinstance(threshold, pd.Series):
        threshold = states["state"].map(threshold.get)
        # this assert could be skipped because we check in
        # task_check_initial_states that the federal state names overlap.
        assert threshold.notnull().all()

    above_threshold = states["work_contact_priority"] > threshold

    # Recurrent models are switched off with False, non-recurrent ones with 0.
    fill_value = False if is_recurrent else 0
    return contacts.where(above_threshold, fill_value)

In [27]:
# Random boolean mask (each entry True with probability 0.5) of the same length as
# ``contacts``; used below to compare ``.where`` against ``.loc`` assignment.
is_true = boolean_choices(np.full(len(contacts), 0.5))

In [28]:
%timeit contacts.where(is_true, 0)

5.67 ms ± 74.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [29]:
# NOTE(review): the timed statement assigns into ``contacts`` in place, so after this
# cell the shared ``contacts`` Series holds 5 wherever ``is_true`` is True and every
# later cell that uses ``contacts`` sees the modified values. Consider timing on a
# copy instead.
%timeit contacts.loc[is_true] = 5

6.83 ms ± 97.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [30]:
%%timeit

reduce_work_model(
    states=states, contacts=contacts, seed=111, multiplier=0.4, is_recurrent=True
)

34.7 ms ± 88.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [31]:
%lprun -f reduce_work_model reduce_work_model(states=states, contacts=contacts, seed=111, multiplier=0.4, is_recurrent=True)

# Demand Test

- 70% of time goes to `_scale_demand_up_or_down`

- of `_scale_demand_up_or_down` >90% go to `_decrease_test_demand`

- of `_decrease_test_demand`: 93.7% go to `states[demanded].query(f"age_group_rki == '{group}'").index`

- of `_increase_test_demand`: >90% go to `infected_untested = states.index[states.eval(selection_string) & ~demanded]`

In [43]:
import warnings
from sid.time import get_date


def _decrease_test_demand(demanded, states, n_to_remove, group):
    """Decrease the number of tests demanded in an age group by a certain number.

    This is called when the endogenously demanded tests (symptomatics + educ workers)
    already exceed the designated number of positive tests in an age group.

    """
    demanded = demanded.copy(deep=True)
    
    is_candidate = demanded.to_numpy() & (states["age_group_rki"] == group).to_numpy()

    demanding_test_in_age_group = demanded.index.to_numpy()[is_candidate]
    
    drawn = np.random.choice(
        a=demanding_test_in_age_group, size=n_to_remove, replace=False
    )
    demanded.loc[drawn] = False
    return demanded


def _increase_test_demand(demanded, states, n_undemanded_tests, group):
    """Randomly increase the number of tests demanded in an age group.
    This is the case where we have additional positive tests to distribute.

    """
    demanded = demanded.copy(deep=True)

    right_age_group = states["age_group_rki"] == group
    untested = ~states["pending_test"] & ~states["knows_immune"]
    condition = right_age_group & untested & states["currently_infected"]    
    infected_untested = states.index[condition & ~demanded]

    if len(infected_untested) >= n_undemanded_tests:
        drawn = np.random.choice(infected_untested, n_undemanded_tests, replace=False)
    else:
        date = get_date(states)
        warnings.warn(
            f"\n\nThe implied share_known_cases for age group {group} is >1 "
            f"on date {date.date()} ({date.day_name()}).\n\n"
        )
        drawn = infected_untested
    demanded.loc[drawn] = True
    return demanded

In [33]:
# Everybody demands a test. With this input ``_increase_test_demand`` has no
# candidates left (it excludes individuals who already demand a test), which is why
# the cells below emit the "share_known_cases ... is >1" warning.
demanded = pd.Series(True, index=states.index)

In [34]:
%%timeit

_decrease_test_demand(demanded, states, n_to_remove=30, group="80-100")

3.93 ms ± 70.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [35]:
%lprun -f _decrease_test_demand _decrease_test_demand(demanded, states, n_to_remove=30, group="80-100")

In [44]:
%%timeit

_increase_test_demand(demanded, states, n_undemanded_tests=20, group="80-100")


The implied share_known_cases for age group 80-100 is >1 on date 2021-03-01 (Monday).




3.61 ms ± 90.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [45]:
%lprun -f _increase_test_demand _increase_test_demand(demanded, states, n_undemanded_tests=20, group="80-100")


The implied share_known_cases for age group 80-100 is >1 on date 2021-03-01 (Monday).




In [None]:

def convert_boolean_variable_to_target_frequency(
    df, col_name, groupby_var, target_frequency
):
    """Randomly flip a boolean column so its group means move to a target.

    Within each group only the values that are over-represented relative to the
    target are candidates for flipping. Each candidate is flipped with probability
    ``abs(target - current share) / share of candidates``, so the target is met in
    expectation, not exactly.

    Args:
        df (pandas.DataFrame): Must contain ``col_name`` and ``groupby_var``. Not
            modified.
        col_name (str): Name of the boolean column.
        groupby_var (str): Name of the column defining the groups.
        target_frequency: Target share of True values; subtracted from the
            per-group means, so presumably a Series indexed by the group values
            (a scalar also works arithmetically) - TODO confirm with callers.

    Returns:
        pandas.Series: The values of ``col_name`` with the drawn entries negated.

    """
    subset = df[[col_name, groupby_var]].copy()
    groups = subset[groupby_var]
    values = subset[col_name]

    current_share = subset.groupby(groupby_var)[col_name].mean()
    gap = target_frequency - current_share

    # Groups with too many True values (negative gap) flip True entries, all other
    # groups flip False entries.
    value_to_flip = (np.sign(gap) < 0).to_dict()
    is_switching_candidate = values == groups.map(value_to_flip.get)

    candidate_share = is_switching_candidate.groupby(groups).mean()
    prob_by_group = (np.abs(gap) / candidate_share).to_dict()
    switching_prob = groups.map(prob_by_group.get)

    switches = boolean_choices(switching_prob) & is_switching_candidate.to_numpy()

    return values.where(~switches, ~values)




In [None]:
%%timeit 
convert_boolean_variable_to_target_frequency(states, "educ_worker", "age_group_rki", sr)

In [None]:
# NOTE(review): ``sr`` (passed as ``target_frequency``) is not defined in any cell
# visible in this notebook, so this cell fails on a fresh kernel unless it is
# created elsewhere first - TODO define it explicitly above.
states["test"] = convert_boolean_variable_to_target_frequency(states, "educ_worker", "age_group_rki", sr)
states.groupby("age_group_rki")["test"].mean().round(2)


In [None]:
states["educ_worker"].groupby

In [None]:
states["educ_worker"].groupby

In [None]:
pd.Grouper(states["age_group_rki"])