In [1]:
import datetime
from pathlib import Path
from typing import Dict, List, Union
import warnings
warnings.filterwarnings("ignore")

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib.ticker as ticker
from matplotlib.patches import Patch
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

%matplotlib inline
pd.options.mode.chained_assignment = None  # default='warn'
plt.rcParams.update({'figure.max_open_warning': 0})
PARAMS = {'legend.fontsize': 'xx-large',
          'legend.title_fontsize': 'x-large',
          'figure.figsize': (15, 5),
         'axes.labelsize': 'x-large',
         'axes.titlesize':'x-large',
         'xtick.labelsize':'x-large',
         'ytick.labelsize':'x-large'}
pylab.rcParams.update(PARAMS)

TIME_FORMAT = '%Y-%m-%d'
INT_TYPE = 'int32'

In [2]:
def load_data(locus_weights_path: Union[str, Path],
              temperature_path: Union[str, Path],
              locus_group_matching_path: Union[str, Path],
              fresh_water_dates_path: Union[str, Path],
              sw_fw_matching_path: Union[str, Path],
              sw_fw_matching_path_with_cnt: Union[str, Path],
              final_locus_weighted_path: Union[str, Path],
              mortality_path: Union[str, Path]) -> Dict[str, pd.DataFrame]:
    
    locus_weights = pd.read_csv(locus_weights_path)
    locus_weights.starttime = pd.to_datetime(locus_weights.starttime, format=TIME_FORMAT)
    locus_weights.endtime = pd.to_datetime(locus_weights.endtime, format=TIME_FORMAT)
    
    temperature = pd.read_csv(temperature_path)
#     temperature.event_date = pd.to_datetime(temperature.event_date, format=TIME_FORMAT)
#     temperature.locus_group_id=temperature.locus_group_id.astype(INT_TYPE)
#     temperature['event_year'] = temperature['event_date'].dt.year

    locus_group_matching = pd.read_csv(locus_group_matching_path)
    locus_group_matching = locus_group_matching.astype(INT_TYPE)

    fresh_water_dates = pd.read_csv(fresh_water_dates_path)
    for d in ['first_movement_date', 'first_feeding_date', 'shipout_date']:
        fresh_water_dates[d] = pd.to_datetime(fresh_water_dates[d], format=TIME_FORMAT)

    sw_fw_matching = pd.read_csv(sw_fw_matching_path)
    sw_fw_matching_with_cnt = pd.read_csv(sw_fw_matching_path_with_cnt)

    sw_fw_cols = ['target_seawater_locus_id',
                  'transport_date',
                  'ponding_date',
                  'pretransfer_fw_locus_population_id',
                  'fish_count_shipped_out',
                  'avg_weight_g_stocked']
    sw_fw_matching = sw_fw_matching[sw_fw_matching.origin_site_type == 'Freshwater'][sw_fw_cols]
    sw_fw_matching.pretransfer_fw_locus_population_id = sw_fw_matching.\
    pretransfer_fw_locus_population_id.astype(INT_TYPE)

    final_locus_weighted = pd.read_csv(final_locus_weighted_path)
    final_locus_weighted.event_date = pd.to_datetime(final_locus_weighted.event_date)

    mortality = pd.read_csv(mortality_path)
    mortality['transfer_date'] = pd.to_datetime(mortality['transfer_date'], format=TIME_FORMAT)
    mortality['transport_year']= mortality['transfer_date'].dt.year


    dataframes = {
        'temperature': temperature,
        'locus_weigts': locus_weights,
        'locus_group_matching': locus_group_matching,
        'final_locus_weighted': final_locus_weighted,
        'fresh_water_dates': fresh_water_dates,
        'sw_fw_matching': sw_fw_matching,
        'sw_fw_matching_with_cnt': sw_fw_matching_with_cnt,
        'mortality': mortality
    }
    
    return dataframes

def weighted_avg(x, weight, factor):
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        tmp = x[[weight, factor]].dropna()
        weighted_sum = (tmp[weight] * tmp[factor]).sum()
        count_sum = tmp[weight].sum()
        return weighted_sum / count_sum
    
#this is to not calculate aggregates if at least one value on the main factor column is NaN
def apply_condition(group,main_factor,agg_function):
    if group[main_factor].isnull().any():
        return np.nan
    else:
        return agg_function(group[main_factor])


def create_ph_aggregated_factors(ph_df, main_factor, agg_function, function_name, env_type='ph'):
    """Creates features on final_locus_population_id_level"""
    
    after_ff_len = 4 * 7
    
    after_ff_period = ph_df['event_date'].between(
        ph_df['first_feeding_date'],
        ph_df['first_feeding_date'] + pd.to_timedelta(after_ff_len, unit='D')
    )

    before_transfer = ph_df['event_date'] < ph_df['shipout_date']

    after_vaccination = ph_df['event_date'] > ph_df['VAC_EVENT_DATE']
    before_vaccination = ph_df['event_date'] < ph_df['VAC_EVENT_DATE']

    after_first_feeding = ph_df['event_date'] > ph_df['first_feeding_date']

    week_before_vac = ph_df['event_date'] > ph_df['VAC_EVENT_DATE'] - pd.to_timedelta(1, unit='W')
    week_after_vac = ph_df['event_date'] < ph_df['VAC_EVENT_DATE'] + pd.to_timedelta(1, unit='W')

    agg_df = pd.DataFrame()
    
    agg_df[f'{env_type}-whole_period-{function_name}'] = ph_df\
    .groupby('final_locus_population_id').apply(lambda x: apply_condition(x, main_factor, agg_function))
    
    agg_df[f'{env_type}-after_first_feeding-{function_name}'] = ph_df[after_ff_period] \
    .groupby('final_locus_population_id').apply(lambda x: apply_condition(x, main_factor, agg_function))
    
    agg_df[f'{env_type}-transfer_feeding-{function_name}'] = ph_df[before_transfer & after_first_feeding] \
    .groupby('final_locus_population_id').apply(lambda x: apply_condition(x, main_factor, agg_function))

    agg_df[f'{env_type}-transfer_vaccination-{function_name}'] = ph_df[before_transfer & after_vaccination]\
    .groupby(['final_locus_population_id']).apply(lambda x: apply_condition(x, main_factor, agg_function))

    agg_df[f'{env_type}-vaccination-feeding-{function_name}'] = ph_df[before_vaccination & after_first_feeding]\
    .groupby(['final_locus_population_id']).apply(lambda x: apply_condition(x, main_factor, agg_function))

    agg_df[f'{env_type}-around_vaccination-{function_name}'] = ph_df[week_after_vac & week_before_vac]\
    .groupby(['final_locus_population_id']).apply(lambda x: apply_condition(x, main_factor, agg_function))

    
    mean_df = pd.concat([agg_df.mean(), agg_df.std()], axis=1).rename(columns={0: 'mean', 1: 'std'})

    for col in agg_df.columns[:]:
        agg_df[f'{col}-from_mean'] = agg_df[col] - mean_df.loc[col]['mean']
        agg_df[f'{col}-from_mean-abs'] = np.abs(agg_df[col] - mean_df.loc[col]['mean'])

    return agg_df


def create_degree_days_aggregated_factor(ph_df, main_factor, env_type='ph'):
    """Creates features on final_locus_population_id_level"""
    agg_df = pd.DataFrame()
    
    days = ph_df[before_transfer & after_first_feeding].groupby('final_locus_population_id')['event_date'] \
    .agg(['max','min'])
    days['diff'] = (days['max'] - days['min']).dt.days
    degrees = ph_df[before_transfer & after_first_feeding].groupby('final_locus_population_id')[main_factor] \
    .agg(['mean'])
    agg_df['fw_cycle_length'] = days['diff']
    agg_df[f'{env_type}-transfer_feeding-degree_days'] = days['diff'] * degrees['mean']

    days = ph_df[before_transfer & after_vaccination].groupby('final_locus_population_id')['event_date'] \
    .agg(['max','min'])
    days['diff'] = (days['max'] - days['min']).dt.days
    degrees = ph_df[before_transfer & after_vaccination].groupby('final_locus_population_id')[main_factor] \
    .agg(['mean'])
    agg_df[f'{env_type}-transfer_vaccination-degree_days'] = days['diff'] * degrees['mean']

    days = ph_df[before_vaccination & after_first_feeding].groupby('final_locus_population_id')['event_date'] \
    .agg(['max','min'])
    days['diff'] = (days['max'] - days['min']).dt.days
    degrees = ph_df[before_vaccination & after_first_feeding].groupby('final_locus_population_id')[main_factor] \
    .agg(['mean'])
    agg_df[f'{env_type}-vaccination_feeding-degree_days'] = days['diff'] * degrees['mean']

    for week in weeks_before_transfer:
        weeks_before_transfer_date = ph_df['shipout_date'] - pd.to_timedelta(week, unit='W')
        condition = ph_df['event_date'] > weeks_before_transfer_date

        days = ph_df[before_transfer & condition].groupby('final_locus_population_id')['event_date'] \
        .agg(['max','min'])
        days['diff'] = (days['max'] - days['min']).dt.days
        degrees = ph_df[before_transfer & condition].groupby('final_locus_population_id')[main_factor] \
        .agg(['mean'])
        agg_df[f'{env_type}-{week}_weeks_before_transfer-degree_days'] = days['diff'] * degrees['mean']

#     mean_df = pd.concat([agg_df.mean(), agg_df.std()], axis=1).rename(columns={0: 'mean', 1: 'std'})
    
#     for col in agg_df.columns[:]:
#         agg_df[f'{col}-from_mean'] = agg_df[col] - mean_df.loc[col]['mean']
#         agg_df[f'{col}-from_mean-abs'] = np.abs(agg_df[col] - mean_df.loc[col]['mean'])

    return agg_df


def create_ph_aggregated_factors_threshold(ph_df, main_factor, lower_bound, upper_bound, agg_function, condition_type, env_type='ph'):
    """Creates features on final_locus_population_id_level"""
    agg_df = pd.DataFrame()
    
    higher_max = f'{env_type}-{condition_type}-higher_max-{upper_bound}'
    lower_min = f'{env_type}-{condition_type}-lower_min-{lower_bound}'
    btw_min_max = f'{env_type}-{condition_type}-btw_min_max-{lower_bound}-{upper_bound}'
    
    relative_higher_max = f'{env_type}-{condition_type}-relative_higher_max-{upper_bound}'
    relative_lower_min = f'{env_type}-{condition_type}-relative_lower_min-{lower_bound}'
    relative_out_min_max = f'{env_type}-{condition_type}-relative_out_min_max-{lower_bound}-{upper_bound}'


    agg_df[higher_max] = ph_df[ph_df[main_factor] > upper_bound]\
    .groupby(['final_locus_population_id']).apply(lambda x: apply_condition(x,main_factor,agg_function))

    agg_df[lower_min] = ph_df[ph_df[main_factor] < lower_bound]\
    .groupby(['final_locus_population_id']).apply(lambda x: apply_condition(x,main_factor,agg_function))

    agg_df[btw_min_max] = ph_df[ph_df[main_factor].between(lower_bound, upper_bound)]\
    .groupby(['final_locus_population_id']).apply(lambda x: apply_condition(x,main_factor,agg_function))

    agg_df['measurements_count'] = ph_df.groupby(['final_locus_population_id']).apply(lambda x: apply_condition(x,main_factor,agg_function))

    agg_df[higher_max] = agg_df[higher_max].fillna(0).astype('int64')
    agg_df[lower_min] = agg_df[lower_min].fillna(0).astype('int64')
    agg_df[btw_min_max] = agg_df[btw_min_max].fillna(0).astype('int64')

    
    agg_df[relative_higher_max] = agg_df[higher_max] / agg_df['measurements_count']
    agg_df[relative_lower_min] =  agg_df[lower_min] /  agg_df['measurements_count']
    agg_df[relative_out_min_max] = 1 - (agg_df[btw_min_max] /agg_df['measurements_count'])
    agg_df.drop(columns='measurements_count', inplace=True)
        
    return agg_df

def create_factors_df(agg_ph_df, factors, key_columns, weight_column, weighted_func):
    """Creates factors df on key_columns level using weight_column to calculate weighted average"""
    factors_dfs = []
    print('Weighting factors')
    for factor in tqdm(factors):
        tmp = agg_ph_df.groupby(key_columns)\
        .apply(weighted_func, weight_column, factor).reset_index().rename(columns={0: factor})
        factors_dfs.append(tmp)

    factor_df = factors_dfs[0]
    for df in factors_dfs[1:]:
        factor_df = factor_df.merge(df, on=key_columns, how='inner')
    return factor_df

def plot_factors(factors_df, path):
    pairplot = sns.pairplot(factors_df[np.append(corr_df.index, 'mortality')], size=(20, 15))
    plt.title('Mortality vs feature pairplot')
    plt.show()

    for ax in pairplot.axes.flatten():
        # rotate x axis labels
        ax.set_xlabel(ax.get_xlabel(), rotation=45)
        # rotate y axis labels
        ax.set_ylabel(ax.get_ylabel(), rotation=0)
        # set y labels alignment
        ax.xaxis.get_label().set_horizontalalignment('right')
        ax.yaxis.get_label().set_horizontalalignment('right')
    pairplot.figure.savefig(path / 'mortality_vs_feature_pairplot.jpg')

def plot_swarm(df, factor, target, bins, y_scale=1, title=''):
    sns.set_style("whitegrid")

    arr_for_chart = df.copy()
    arr_for_chart[factor+'_binned'] = pd.cut(df[factor], bins=bins)
    arr_for_chart['mortality'] = arr_for_chart['mortality'] * 100
    arr_for_chart['TGC_SW'] = arr_for_chart['TGC_SW']
    x_axis_factor = factor + '_binned'

    plt.figure()
    f, axs = plt.subplots(1, 1, figsize=(18,8), sharex=True)
    box_plot = sns.swarmplot(data=arr_for_chart, x=x_axis_factor, y=target)

    mean_target = arr_for_chart.groupby(x_axis_factor)[target].mean()
    mean_main_factor = arr_for_chart.groupby(x_axis_factor)[main_factor].mean() 

#     vertical_offset = arr_for_chart[y_axis_factor].mean() * 0.00 # offset for display

    for xtick in box_plot.get_xticks():
        box_plot.text(xtick,
                      mean_target.iloc[xtick],
                      round(mean_target.iloc[xtick], 2),
                      horizontalalignment='center',
                      size='xx-large',
                      color='black',
                      weight='semibold')
        box_plot.text(xtick,
                      mean_main_factor.iloc[xtick] / y_scale,
                      round(mean_main_factor.iloc[xtick], 2),
                      horizontalalignment='center',
                      size='xx-large',
                      color='darkblue',
                      weight='semibold')
    
    plt.title(f'Swarmplot {title}: {target} vs. {factor}')
    axs.legend(
        arr_for_chart[x_axis_factor].value_counts().sort_index().apply(lambda x: 'Sample count: ' + str(x)),
        loc='upper right'
    )

    f.show()

#     f.savefig(f'swarmplot_{target}_vs_{factor}.jpg')

In [3]:
dataframes = load_data(locus_weights_path='../data/evt_movement_ratio_with_dates.csv',
                       temperature_path='../data/FW_temperature_cleared.csv',
                       locus_group_matching_path='../data/locus_locus_group_matching.csv',
                       fresh_water_dates_path='../data/FW_cycle_dates.csv',
                       sw_fw_matching_path='../data/seawater_freshwater_matching.csv',
                       sw_fw_matching_path_with_cnt='../data/sw_locus_fw_locus_population_with_counts.csv',
                       final_locus_weighted_path='../data/lw_alldates_final.csv', #_locus
                       mortality_path='../data/smolt_dataset_transfers.csv') #_until2023Feb28_narrow

temperature = dataframes['temperature']
locus_weights = dataframes['locus_weigts']
locus_group_matching = dataframes['locus_group_matching']
final_locus_weighted = dataframes['final_locus_weighted']
fresh_water_dates = dataframes['fresh_water_dates']
sw_fw_matching = dataframes['sw_fw_matching']
mortality = dataframes['mortality']
sw_fw_matching_with_cnt = dataframes['sw_fw_matching_with_cnt']

#tgc = pd.read_csv('../data/transfers_until2023Feb28_with_sw_growth_targets.csv')
vaccines = pd.read_csv('../data/vaccines_with_final_locus_population_id.csv') #_transfers_until_Jun2023
vaccines['VAC_EVENT_DATE'] = pd.to_datetime(vaccines['VAC_EVENT_DATE'] , format=TIME_FORMAT)
vaccines_agg = pd.DataFrame(vaccines.groupby('FINAL_LOCUS_POPULATION_ID')['VAC_EVENT_DATE'].max())
fresh_water_dates.rename(columns={'pretransfer_fw_locus_population_id': 'final_locus_population_id'}, inplace=True)
sw_fw_matching.transport_date = pd.to_datetime(sw_fw_matching.transport_date, format=TIME_FORMAT)
sw_fw_matching_with_cnt.transfer_date = pd.to_datetime(sw_fw_matching_with_cnt['transfer_date'], format=TIME_FORMAT)
sw_fw_matching_with_cnt['transfer_year'] = sw_fw_matching_with_cnt['transfer_date'].dt.year
sw_keys = ['from_locus_id', 'from_fish_group_id']

temperature.event_date = pd.to_datetime(temperature.event_date, format=TIME_FORMAT)
temperature['event_year'] = temperature['event_date'].dt.year

In [4]:
lab = pd.read_csv('../data/evt_lab_atpasa_upd.csv')
lab = lab[lab['fish_group_id'].notna()]
lab['fish_group_id'] = lab['fish_group_id'].astype(int)
lab['sampling_date'] = pd.to_datetime(lab['sampling_date'])

fw_keys = ['locus_id', 'fish_group_id']
factors = ['weight', 'length', 'k_factor', 'atpasa']

lab.head()

Unnamed: 0,locus_id,fish_group_id,sampling_date,weight,length,k_factor,atpasa
0,3046035,273,2020-07-13,111.1,20.51,1.287707,13.86
1,3046035,273,2020-07-20,111.9,20.85,1.234559,14.71
2,3046035,273,2020-07-24,124.1,21.74,1.207795,17.88
3,3046035,35,2020-09-28,114.9,20.79,1.278664,13.19
4,3046035,35,2020-10-05,125.7,21.83,1.208298,16.71


In [6]:
lab['sampling_date'].max()

Timestamp('2024-01-08 00:00:00')

In [7]:
lab = lab.sort_values(by=fw_keys + ['sampling_date']).reset_index(drop=True)
lab.head()

Unnamed: 0,locus_id,fish_group_id,sampling_date,weight,length,k_factor,atpasa
0,3046035,35,2020-09-28,114.9,20.79,1.278664,13.19
1,3046035,35,2020-10-05,125.7,21.83,1.208298,16.71
2,3046035,35,2020-10-13,126.48,21.74,1.230958,19.85
3,3046035,273,2020-07-13,111.1,20.51,1.287707,13.86
4,3046035,273,2020-07-20,111.9,20.85,1.234559,14.71


In [8]:
num_agg = lab.groupby(fw_keys).agg(['min', 'max', 'last'])[factors].reset_index()
cols = list(map('_'.join, num_agg.columns.values[2:]))
for i in range(len(cols)):
    if '<lambda_0>' in cols[i]:
        cols[i] = cols[i].replace('<lambda_0>', 'derivative')
num_agg.columns = fw_keys + cols
        
num_agg.head()

Unnamed: 0,locus_id,fish_group_id,weight_min,weight_max,weight_last,length_min,length_max,length_last,k_factor_min,k_factor_max,k_factor_last,atpasa_min,atpasa_max,atpasa_last
0,3046035,35,114.9,126.48,126.48,20.79,21.83,21.74,1.208298,1.278664,1.230958,13.19,19.85,19.85
1,3046035,273,111.1,124.1,124.1,20.51,21.74,21.74,1.207795,1.287707,1.207795,13.86,17.88,17.88
2,3046035,3792,170.3,199.8,191.0,23.56,25.47,25.35,1.172464,1.305827,1.172464,14.03,21.08,21.08
3,3046035,5673,138.15,177.3,138.15,22.15,24.5,22.89,1.151897,1.278148,1.151897,16.07,20.07,20.07
4,3046035,6149,127.1,170.8,150.6,21.01,23.5,23.03,1.232943,1.374094,1.232943,12.4,18.89,18.89


In [9]:
derrivative = lab.groupby(fw_keys).apply(
    lambda x: (x[factors].diff().divide(x['sampling_date'].diff().dt.days, axis=0)).mean()
).reset_index()

derrivative.rename(columns={col: f'{col}_derrivative' for col in factors}, inplace=True)
derrivative.head()

Unnamed: 0,locus_id,fish_group_id,weight_derrivative,length_derrivative,k_factor_derrivative,atpasa_derrivative
0,3046035,35,0.820179,0.068661,-0.00361,0.447679
1,3046035,273,1.582143,0.135536,-0.007142,0.456964
2,3046035,3792,0.580159,0.068016,-0.006379,0.342698
3,3046035,5673,-1.864286,-0.076667,-0.002558,0.190476
4,3046035,6149,1.915952,0.134619,-0.006563,0.249786


In [10]:
num_agg = num_agg.merge(derrivative, how='inner', on=fw_keys)
num_agg.head()

Unnamed: 0,locus_id,fish_group_id,weight_min,weight_max,weight_last,length_min,length_max,length_last,k_factor_min,k_factor_max,k_factor_last,atpasa_min,atpasa_max,atpasa_last,weight_derrivative,length_derrivative,k_factor_derrivative,atpasa_derrivative
0,3046035,35,114.9,126.48,126.48,20.79,21.83,21.74,1.208298,1.278664,1.230958,13.19,19.85,19.85,0.820179,0.068661,-0.00361,0.447679
1,3046035,273,111.1,124.1,124.1,20.51,21.74,21.74,1.207795,1.287707,1.207795,13.86,17.88,17.88,1.582143,0.135536,-0.007142,0.456964
2,3046035,3792,170.3,199.8,191.0,23.56,25.47,25.35,1.172464,1.305827,1.172464,14.03,21.08,21.08,0.580159,0.068016,-0.006379,0.342698
3,3046035,5673,138.15,177.3,138.15,22.15,24.5,22.89,1.151897,1.278148,1.151897,16.07,20.07,20.07,-1.864286,-0.076667,-0.002558,0.190476
4,3046035,6149,127.1,170.8,150.6,21.01,23.5,23.03,1.232943,1.374094,1.232943,12.4,18.89,18.89,1.915952,0.134619,-0.006563,0.249786


In [11]:
agg_funcs = ['min', 'max', 'last', 'derrivative']
columns = []
for factor in factors:
    for func in agg_funcs:
        columns.append(f'{factor}_{func}')
columns = fw_keys + columns

num_agg.columns = columns
num_agg.head()

Unnamed: 0,locus_id,fish_group_id,weight_min,weight_max,weight_last,weight_derrivative,length_min,length_max,length_last,length_derrivative,k_factor_min,k_factor_max,k_factor_last,k_factor_derrivative,atpasa_min,atpasa_max,atpasa_last,atpasa_derrivative
0,3046035,35,114.9,126.48,126.48,20.79,21.83,21.74,1.208298,1.278664,1.230958,13.19,19.85,19.85,0.820179,0.068661,-0.00361,0.447679
1,3046035,273,111.1,124.1,124.1,20.51,21.74,21.74,1.207795,1.287707,1.207795,13.86,17.88,17.88,1.582143,0.135536,-0.007142,0.456964
2,3046035,3792,170.3,199.8,191.0,23.56,25.47,25.35,1.172464,1.305827,1.172464,14.03,21.08,21.08,0.580159,0.068016,-0.006379,0.342698
3,3046035,5673,138.15,177.3,138.15,22.15,24.5,22.89,1.151897,1.278148,1.151897,16.07,20.07,20.07,-1.864286,-0.076667,-0.002558,0.190476
4,3046035,6149,127.1,170.8,150.6,21.01,23.5,23.03,1.232943,1.374094,1.232943,12.4,18.89,18.89,1.915952,0.134619,-0.006563,0.249786


In [23]:
lab['sampling_date'].max()

Timestamp('2024-01-08 00:00:00')

In [24]:
time_agg = lab.groupby(fw_keys)['sampling_date'].agg(
    ['count', lambda x: (x.max() - x.min()).days // 7, lambda x: np.ceil((x.max() - x.min()).days / 7).astype(int)]
).reset_index()
cols = time_agg.columns.values[2:]
for i in range(len(cols)):
    if '<lambda_0>' in cols[i]:
        cols[i] = cols[i].replace('<lambda_0>', 'span_floor_weeks')
    elif '<lambda_1>' in cols[i]:
        cols[i] = cols[i].replace('<lambda_1>', 'span_ceil_weeks')
    else:
        cols[i] = 'sample_count'
    
time_agg.columns = fw_keys + cols.tolist()
time_agg.head()

Unnamed: 0,locus_id,fish_group_id,sample_count,span_floor_weeks,span_ceil_weeks
0,3046035,35,3,2,3
1,3046035,273,3,1,2
2,3046035,3792,4,2,3
3,3046035,5673,4,3,3
4,3046035,6149,6,4,4


In [13]:
aggregates = np.union1d(time_agg.columns, num_agg.columns)\
[~np.isin(np.union1d(time_agg.columns, num_agg.columns), fw_keys)]

In [25]:
lab_agg = num_agg.merge(time_agg, on=fw_keys, how='inner')
assert lab_agg.shape[0] == time_agg.shape[0] == num_agg.shape[0]
assert lab_agg.shape[1] == time_agg.shape[1] + num_agg.shape[1] - 2
lab_agg.head()

Unnamed: 0,locus_id,fish_group_id,weight_min,weight_max,weight_last,weight_derrivative,length_min,length_max,length_last,length_derrivative,...,k_factor_max,k_factor_last,k_factor_derrivative,atpasa_min,atpasa_max,atpasa_last,atpasa_derrivative,sample_count,span_floor_weeks,span_ceil_weeks
0,3046035,35,114.9,126.48,126.48,20.79,21.83,21.74,1.208298,1.278664,...,13.19,19.85,19.85,0.820179,0.068661,-0.00361,0.447679,3,2,3
1,3046035,273,111.1,124.1,124.1,20.51,21.74,21.74,1.207795,1.287707,...,13.86,17.88,17.88,1.582143,0.135536,-0.007142,0.456964,3,1,2
2,3046035,3792,170.3,199.8,191.0,23.56,25.47,25.35,1.172464,1.305827,...,14.03,21.08,21.08,0.580159,0.068016,-0.006379,0.342698,4,2,3
3,3046035,5673,138.15,177.3,138.15,22.15,24.5,22.89,1.151897,1.278148,...,16.07,20.07,20.07,-1.864286,-0.076667,-0.002558,0.190476,4,3,3
4,3046035,6149,127.1,170.8,150.6,21.01,23.5,23.03,1.232943,1.374094,...,12.4,18.89,18.89,1.915952,0.134619,-0.006563,0.249786,6,4,4


In [26]:
lab_sw = lab_agg.merge(sw_fw_matching_with_cnt,
                   left_on=fw_keys,
                   right_on=sw_keys,
                   how='inner')

In [27]:
sw_fw_matching_with_cnt[sw_fw_matching_with_cnt['to_locus_id'] == 3046036][sw_fw_matching_with_cnt['to_fish_group_id'] == 6165]

Unnamed: 0,to_locus_id,to_fish_group_id,transfer_date,from_locus_population_id,from_count_stocking,from_locus_id,from_year_class,from_avg_weight,from_fish_group_id,transfer_year
6,3046036,6165,2023-01-21,194530774,6349,3046212,2022,149,6150,2023
7,3046036,6165,2023-01-24,194528638,1019,3049138,2022,130,6150,2023
8,3046036,6165,2023-01-24,194531503,8790,3049138,2022,130,6150,2023
9,3046036,6165,2023-01-24,194531495,8895,3049138,2022,130,6150,2023
10,3046036,6165,2023-01-24,194531491,9149,3049138,2022,130,6150,2023
11,3046036,6165,2023-01-24,194531514,9298,3049138,2022,130,6150,2023


In [20]:
agg_lab_sw = lab_sw.groupby(['to_locus_id', 'to_fish_group_id'])\
.apply(lambda x: (x[aggregates].multiply(x['from_count_stocking'], axis=0)).sum() / x['from_count_stocking'].sum()).reset_index()

In [21]:
agg_lab_sw

Unnamed: 0,to_locus_id,to_fish_group_id,atpasa_derrivative,atpasa_last,atpasa_max,atpasa_min,k_factor_derrivative,k_factor_last,k_factor_max,k_factor_min,...,length_last,length_max,length_min,sample_count,span_ceil_weeks,span_floor_weeks,weight_derrivative,weight_last,weight_max,weight_min
0,3046036,3918,0.400476,-0.010646,0.144286,1.533333,20.600000,20.600000,12.190000,1.192109,...,1.192109,22.950000,22.950000,4.000000,3.000000,3.000000,19.920000,144.100000,144.100000,111.900000
1,3046036,6165,0.312225,-0.012121,0.088516,0.267235,20.092745,20.092745,11.755463,1.058527,...,1.058527,23.263353,23.263353,4.854046,3.854046,3.854046,21.135463,133.280060,150.702598,123.182807
2,3046064,5736,0.446784,-0.009010,0.076011,0.409688,20.978383,20.978383,14.723407,1.153579,...,1.153579,21.471727,21.517457,3.000000,2.000000,2.000000,20.407567,114.534533,117.735633,108.189167
3,3046064,6217,0.293042,-0.010022,0.075463,0.243560,19.116392,19.116392,14.064322,1.164406,...,1.164406,24.743404,24.743404,3.401190,2.401190,2.401190,23.359286,176.441780,182.729283,167.374882
4,3046085,4053,0.306190,-0.007289,0.017698,-0.542698,18.650000,18.650000,12.460000,1.182893,...,1.182893,23.340000,23.340000,4.000000,3.000000,2.000000,22.560000,150.400000,159.540000,143.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,7205278,6914,0.352667,-0.011485,0.128029,1.159596,19.658771,19.658771,12.252758,1.160637,...,1.160637,23.750641,23.750641,4.000000,3.000000,3.000000,21.062034,155.470756,155.470756,128.440957
481,7205279,6909,0.291062,-0.005332,0.052857,0.339286,19.469749,19.469749,11.320000,1.115475,...,1.115475,22.020000,22.020000,5.000000,4.000000,4.000000,20.540000,119.100000,122.300000,109.600000
482,7205280,6909,0.475149,-0.010829,0.069579,0.045787,19.929777,19.929777,11.391940,1.136501,...,1.136501,23.701621,23.911063,4.000000,3.000000,2.000000,22.157821,151.322854,160.099469,146.866516
483,7205281,6909,0.288770,-0.007634,0.058486,0.211992,19.242433,19.242433,11.885740,1.100147,...,1.100147,22.165809,22.165809,5.000000,4.000000,4.000000,20.452514,119.799884,121.133526,109.658324


In [44]:
agg_lab_sw.to_csv('../data/lab_factors.csv', index=False)