## Imports and functions

In [40]:
import datetime
from pathlib import Path
from typing import Dict, List, Union
import warnings
warnings.filterwarnings("ignore")

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib.ticker as ticker
from matplotlib.patches import Patch
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

%matplotlib inline
# Silence pandas' chained-assignment (SettingWithCopy) warning; pandas' default is 'warn'.
pd.set_option('mode.chained_assignment', None)
# Do not warn when many figures are open at the same time.
plt.rcParams['figure.max_open_warning'] = 0

# Shared matplotlib font/figure sizes for every plot in this notebook.
PARAMS = {
    'legend.fontsize': 'xx-large',
    'legend.title_fontsize': 'x-large',
    'figure.figsize': (15, 5),
    'axes.labelsize': 'x-large',
    'axes.titlesize': 'x-large',
    'xtick.labelsize': 'x-large',
    'ytick.labelsize': 'x-large',
}
pylab.rcParams.update(PARAMS)

# Date format used when parsing date columns, and the integer dtype used for id columns.
TIME_FORMAT = '%Y-%m-%d'
INT_TYPE = 'int32'

In [41]:
def load_data(locus_weights_path: Union[str, Path],
              temperature_path: Union[str, Path],
              locus_group_matching_path: Union[str, Path],
              fresh_water_dates_path: Union[str, Path],
              sw_fw_matching_path: Union[str, Path],
              sw_fw_matching_path_with_cnt: Union[str, Path],
              final_locus_weighted_path: Union[str, Path],
              mortality_path: Union[str, Path]) -> Dict[str, pd.DataFrame]:
    """Read all input CSV files and apply basic dtype conversions.

    Args:
        locus_weights_path: CSV with ``starttime`` / ``endtime`` columns.
        temperature_path: temperature CSV; returned exactly as read.
        locus_group_matching_path: CSV whose columns are all cast to ``INT_TYPE``.
        fresh_water_dates_path: CSV with ``first_movement_date``,
            ``first_feeding_date`` and ``shipout_date`` columns.
        sw_fw_matching_path: CSV filtered to rows whose ``origin_site_type``
            is ``'Freshwater'`` and reduced to a fixed column subset.
        sw_fw_matching_path_with_cnt: CSV returned exactly as read.
        final_locus_weighted_path: CSV with an ``event_date`` column.
        mortality_path: CSV with a ``transfer_date`` column.

    Returns:
        Dict of DataFrames keyed by ``'temperature'``, ``'locus_weigts'``,
        ``'locus_group_matching'``, ``'final_locus_weighted'``,
        ``'fresh_water_dates'``, ``'sw_fw_matching'``,
        ``'sw_fw_matching_with_cnt'`` and ``'mortality'``.
    """
    locus_weights = pd.read_csv(locus_weights_path)
    for date_col in ('starttime', 'endtime'):
        locus_weights[date_col] = pd.to_datetime(locus_weights[date_col], format=TIME_FORMAT)

    # Returned unparsed: no date or dtype conversion is applied to temperature here.
    temperature = pd.read_csv(temperature_path)

    locus_group_matching = pd.read_csv(locus_group_matching_path).astype(INT_TYPE)

    fresh_water_dates = pd.read_csv(fresh_water_dates_path)
    for date_col in ('first_movement_date', 'first_feeding_date', 'shipout_date'):
        fresh_water_dates[date_col] = pd.to_datetime(fresh_water_dates[date_col], format=TIME_FORMAT)

    sw_fw_matching_with_cnt = pd.read_csv(sw_fw_matching_path_with_cnt)

    sw_fw_cols = ['target_seawater_locus_id',
                  'transport_date',
                  'ponding_date',
                  'pretransfer_fw_locus_population_id',
                  'fish_count_shipped_out',
                  'avg_weight_g_stocked']
    sw_fw_raw = pd.read_csv(sw_fw_matching_path)
    from_freshwater = sw_fw_raw['origin_site_type'] == 'Freshwater'
    # Explicit copy so the dtype cast below writes to an independent frame,
    # not to a view/slice of the raw table.
    sw_fw_matching = sw_fw_raw.loc[from_freshwater, sw_fw_cols].copy()
    sw_fw_matching['pretransfer_fw_locus_population_id'] = (
        sw_fw_matching['pretransfer_fw_locus_population_id'].astype(INT_TYPE)
    )

    final_locus_weighted = pd.read_csv(final_locus_weighted_path)
    # No explicit format here: pandas infers it for this file.
    final_locus_weighted['event_date'] = pd.to_datetime(final_locus_weighted['event_date'])

    mortality = pd.read_csv(mortality_path)
    mortality['transfer_date'] = pd.to_datetime(mortality['transfer_date'], format=TIME_FORMAT)
    mortality['transport_year'] = mortality['transfer_date'].dt.year

    return {
        'temperature': temperature,
        # NOTE: the key is misspelled ('locus_weigts'); it is kept as-is
        # because existing callers look the frame up under this exact key.
        'locus_weigts': locus_weights,
        'locus_group_matching': locus_group_matching,
        'final_locus_weighted': final_locus_weighted,
        'fresh_water_dates': fresh_water_dates,
        'sw_fw_matching': sw_fw_matching,
        'sw_fw_matching_with_cnt': sw_fw_matching_with_cnt,
        'mortality': mortality,
    }

def weighted_avg(x, weight, factor):
    """Return the weighted mean of ``x[factor]`` with ``x[weight]`` as weights.

    Rows where either of the two columns is NaN are left out. When nothing
    remains (or the weights sum to zero) the division yields NaN; the
    resulting numeric warning is suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        valid_rows = x[[weight, factor]].dropna()
        weights = valid_rows[weight]
        numerator = weights.mul(valid_rows[factor]).sum()
        denominator = weights.sum()
        return numerator / denominator
    
# Skip the aggregate (return NaN) when at least one value of the main factor column is NaN.
def apply_condition(group,main_factor,agg_function):
    """Apply ``agg_function`` to ``group[main_factor]`` unless it contains a null.

    Returns ``np.nan`` if any value of the column is null, otherwise the
    result of ``agg_function`` called on that column.
    """
    has_missing = group[main_factor].isnull().any()
    return np.nan if has_missing else agg_function(group[main_factor])


def create_ph_aggregated_factors(ph_df, main_factor, agg_function, function_name, env_type='ph'):
    """Aggregate ``main_factor`` per ``final_locus_population_id`` over time windows.

    Three windows are produced, each aggregated with ``agg_function`` through
    ``apply_condition`` (so a population with a missing value in the window
    gets NaN):

    * ``whole_period``         - every row of ``ph_df``;
    * ``transfer_vaccination`` - ``VAC_EVENT_DATE < event_date < shipout_date``;
    * ``around_vaccination``   - strictly within one week either side of
      ``VAC_EVENT_DATE``.

    For every window column two derived columns are added: the deviation from
    the across-population mean (``-from_mean``) and its absolute value
    (``-from_mean-abs``).

    Further windows (``after_first_feeding``, ``transfer_feeding``,
    ``vaccination-feeding``) only existed as commented-out code and are
    intentionally not produced here.

    Args:
        ph_df: frame with ``final_locus_population_id``, ``event_date``,
            ``shipout_date``, ``VAC_EVENT_DATE`` and ``main_factor`` columns.
        main_factor: name of the column to aggregate.
        agg_function: callable applied to each population's values.
        function_name: label of ``agg_function`` used in the column names.
        env_type: prefix of the generated column names.

    Returns:
        DataFrame indexed by ``final_locus_population_id``. The index is set
        by the ``whole_period`` column; the other columns are aligned to it.
    """
    group_key = 'final_locus_population_id'

    def _aggregate(frame):
        # One value per population; NaN when the population has a missing value.
        return frame.groupby(group_key).apply(
            lambda grp: apply_condition(grp, main_factor, agg_function)
        )

    event_date = ph_df['event_date']
    vac_date = ph_df['VAC_EVENT_DATE']
    one_week = pd.to_timedelta(1, unit='W')

    before_transfer = event_date < ph_df['shipout_date']
    after_vaccination = event_date > vac_date
    around_vaccination = (event_date < vac_date + one_week) & (event_date > vac_date - one_week)

    agg_df = pd.DataFrame()
    agg_df[f'{env_type}-whole_period-{function_name}'] = _aggregate(ph_df)
    agg_df[f'{env_type}-transfer_vaccination-{function_name}'] = _aggregate(
        ph_df[before_transfer & after_vaccination]
    )
    agg_df[f'{env_type}-around_vaccination-{function_name}'] = _aggregate(
        ph_df[around_vaccination]
    )

    # Snapshot the base columns first: the loop appends new columns to agg_df.
    for col in list(agg_df.columns):
        deviation = agg_df[col] - agg_df[col].mean()
        agg_df[f'{col}-from_mean'] = deviation
        agg_df[f'{col}-from_mean-abs'] = deviation.abs()

    return agg_df


def create_degree_days_aggregated_factor(ph_df, main_factor, env_type='ph',
                                         weeks_before_transfer=(2, 4, 6, 8)):
    """Compute degree-day style features per ``final_locus_population_id``.

    For each time window the feature is
    ``(last event_date - first event_date in days) * mean(main_factor)``
    over the rows of the population that fall inside the window.

    The window masks are built here from ``ph_df`` itself. Previously the
    function referred to mask variables and a ``weeks_before_transfer`` list
    that were not defined anywhere, so every call raised ``NameError``.

    Args:
        ph_df: frame with ``final_locus_population_id``, ``event_date``,
            ``first_feeding_date``, ``VAC_EVENT_DATE``, ``shipout_date`` and
            ``main_factor`` columns.
        main_factor: name of the column whose mean is multiplied by the
            window length.
        env_type: prefix of the generated column names.
        weeks_before_transfer: iterable of week counts; for each one a
            "last N weeks before shipout" window is added.

    Returns:
        DataFrame indexed by ``final_locus_population_id`` with
        ``fw_cycle_length`` (days between first and last event in the
        first-feeding -> shipout window) and one ``...-degree_days`` column per
        window. The index is set by the first-feeding -> shipout window; the
        other columns are aligned to it.
    """
    group_key = 'final_locus_population_id'
    event_date = ph_df['event_date']

    before_transfer = event_date < ph_df['shipout_date']
    after_first_feeding = event_date > ph_df['first_feeding_date']
    after_vaccination = event_date > ph_df['VAC_EVENT_DATE']
    before_vaccination = event_date < ph_df['VAC_EVENT_DATE']

    def _span_and_degree_days(mask):
        # Days between first and last event in the window, and that span
        # multiplied by the window's mean of main_factor.
        grouped = ph_df[mask].groupby(group_key)
        bounds = grouped['event_date'].agg(['max', 'min'])
        span_days = (bounds['max'] - bounds['min']).dt.days
        return span_days, span_days * grouped[main_factor].mean()

    agg_df = pd.DataFrame()

    cycle_days, feeding_dd = _span_and_degree_days(before_transfer & after_first_feeding)
    agg_df['fw_cycle_length'] = cycle_days
    agg_df[f'{env_type}-transfer_feeding-degree_days'] = feeding_dd

    _, vaccination_dd = _span_and_degree_days(before_transfer & after_vaccination)
    agg_df[f'{env_type}-transfer_vaccination-degree_days'] = vaccination_dd

    _, pre_vaccination_dd = _span_and_degree_days(before_vaccination & after_first_feeding)
    agg_df[f'{env_type}-vaccination_feeding-degree_days'] = pre_vaccination_dd

    for week in weeks_before_transfer:
        window_start = ph_df['shipout_date'] - pd.to_timedelta(week, unit='W')
        _, weeks_dd = _span_and_degree_days(before_transfer & (event_date > window_start))
        agg_df[f'{env_type}-{week}_weeks_before_transfer-degree_days'] = weeks_dd

    return agg_df


def create_ph_aggregated_factors_threshold(ph_df, main_factor, lower_bound, upper_bound, agg_function, condition_type, env_type='ph'):
    """Aggregate ``main_factor`` above / below / between two thresholds per population.

    For every ``final_locus_population_id`` in ``ph_df`` the function applies
    ``agg_function`` (through ``apply_condition``) to

    * the rows with ``main_factor > upper_bound``,
    * the rows with ``main_factor < lower_bound``,
    * the rows with ``lower_bound <= main_factor <= upper_bound``,

    and divides each result by the same aggregate over all rows of the
    population to obtain the ``relative_*`` columns
    (``relative_out_min_max`` is ``1 - between / all``).

    The result is indexed by *all* populations of ``ph_df``. Populations with
    no rows in a subset get 0 for that subset. (Previously the index was taken
    from the "above upper bound" subset only, which silently dropped every
    population that never exceeded ``upper_bound``.)

    Args:
        ph_df: frame with ``final_locus_population_id`` and ``main_factor``.
        main_factor: name of the measured column.
        lower_bound: lower threshold.
        upper_bound: upper threshold.
        agg_function: callable applied to each subset of values; its result is
            cast to ``int64``, so a count-like function is presumably
            intended - verify against callers.
        condition_type: label embedded in the generated column names.
        env_type: prefix of the generated column names.

    Returns:
        DataFrame with three absolute and three relative columns.
    """
    group_key = 'final_locus_population_id'

    def _aggregate(frame):
        # An empty subset has no groups; return an empty Series so the caller
        # can reindex it to the full population index.
        if frame.empty:
            return pd.Series(dtype='float64')
        return frame.groupby(group_key).apply(
            lambda grp: apply_condition(grp, main_factor, agg_function)
        )

    higher_max = f'{env_type}-{condition_type}-higher_max-{upper_bound}'
    lower_min = f'{env_type}-{condition_type}-lower_min-{lower_bound}'
    btw_min_max = f'{env_type}-{condition_type}-btw_min_max-{lower_bound}-{upper_bound}'

    relative_higher_max = f'{env_type}-{condition_type}-relative_higher_max-{upper_bound}'
    relative_lower_min = f'{env_type}-{condition_type}-relative_lower_min-{lower_bound}'
    relative_out_min_max = f'{env_type}-{condition_type}-relative_out_min_max-{lower_bound}-{upper_bound}'

    values = ph_df[main_factor]
    # Aggregate over all rows: fixes the output index and is the denominator
    # of the relative columns.
    measurements_count = _aggregate(ph_df)

    agg_df = pd.DataFrame(index=measurements_count.index)
    subsets = {
        higher_max: values > upper_bound,
        lower_min: values < lower_bound,
        btw_min_max: values.between(lower_bound, upper_bound),
    }
    for column, mask in subsets.items():
        agg_df[column] = (
            _aggregate(ph_df[mask])
            .reindex(agg_df.index)
            .fillna(0)
            .astype('int64')
        )

    agg_df[relative_higher_max] = agg_df[higher_max] / measurements_count
    agg_df[relative_lower_min] = agg_df[lower_min] / measurements_count
    agg_df[relative_out_min_max] = 1 - (agg_df[btw_min_max] / measurements_count)

    return agg_df

def create_factors_df(agg_ph_df, factors, key_columns, weight_column, weighted_func):
    """Build one frame of weighted factors at the ``key_columns`` level.

    Each name in ``factors`` is reduced per ``key_columns`` group by calling
    ``weighted_func(group, weight_column, factor)``; the per-factor results
    are then inner-merged on ``key_columns`` into a single DataFrame.
    """
    def _weighted_frame(factor_name):
        # One row per key combination, with the weighted value named after the factor.
        per_group = agg_ph_df.groupby(key_columns).apply(weighted_func, weight_column, factor_name)
        return per_group.reset_index().rename(columns={0: factor_name})

    print('Weighting factors')
    per_factor = [_weighted_frame(factor_name) for factor_name in tqdm(factors)]

    combined = per_factor[0]
    for extra in per_factor[1:]:
        combined = combined.merge(extra, on=key_columns, how='inner')
    return combined

def plot_factors(factors_df, path, columns=None):
    """Draw and save a pairplot of selected factor columns plus ``mortality``.

    Args:
        factors_df: frame holding the factor columns and a ``mortality`` column.
        path: directory (``str`` or ``Path``) where
            ``mortality_vs_feature_pairplot.jpg`` is written.
        columns: factor columns to plot. When omitted, the index of the
            notebook-level ``corr_df`` is used, as before; that global must
            then exist.
    """
    if columns is None:
        # Backward-compatible fallback to the global this function used to require.
        columns = corr_df.index

    pairplot = sns.pairplot(factors_df[np.append(columns, 'mortality')])
    # NOTE(review): the original passed size=(20, 15), which pairplot does not
    # accept; it is interpreted here as the intended overall figure size.
    pairplot.figure.set_size_inches(20, 15)
    pairplot.figure.suptitle('Mortality vs feature pairplot')

    for ax in pairplot.axes.flatten():
        # rotate x axis labels
        ax.set_xlabel(ax.get_xlabel(), rotation=45)
        # rotate y axis labels
        ax.set_ylabel(ax.get_ylabel(), rotation=0)
        # set labels alignment
        ax.xaxis.get_label().set_horizontalalignment('right')
        ax.yaxis.get_label().set_horizontalalignment('right')

    # Save and show only after styling, so both outputs carry the rotated labels.
    pairplot.figure.savefig(Path(path) / 'mortality_vs_feature_pairplot.jpg')
    plt.show()

def plot_swarm(df, factor, target, bins, y_scale=1, title='', main_factor=None):
    """Swarm plot of ``target`` against binned ``factor`` with per-bin means.

    Args:
        df: frame with ``factor``, ``target`` and ``mortality`` columns.
        factor: column that is binned (``pd.cut``) for the x axis.
        target: column plotted on the y axis.
        bins: bins passed to ``pd.cut``.
        y_scale: divisor for the y position of the per-bin factor-mean labels.
        title: text inserted into the plot title.
        main_factor: column whose per-bin mean is written in dark blue.
            Defaults to ``factor``. (Previously a notebook-level global named
            ``main_factor`` was read instead of an argument.)

    Note:
        ``mortality`` is multiplied by 100 on a copy of ``df`` before plotting.
    """
    if main_factor is None:
        main_factor = factor

    sns.set_style("whitegrid")

    x_axis_factor = factor + '_binned'
    arr_for_chart = df.copy()
    arr_for_chart[x_axis_factor] = pd.cut(df[factor], bins=bins)
    arr_for_chart['mortality'] = arr_for_chart['mortality'] * 100

    f, axs = plt.subplots(1, 1, figsize=(18, 8))
    box_plot = sns.swarmplot(data=arr_for_chart, x=x_axis_factor, y=target, ax=axs)

    # observed=False keeps empty bins, so positions line up with the x ticks.
    binned = arr_for_chart.groupby(x_axis_factor, observed=False)
    mean_target = binned[target].mean()
    mean_main_factor = binned[main_factor].mean()

    for xtick in box_plot.get_xticks():
        box_plot.text(xtick,
                      mean_target.iloc[xtick],
                      round(mean_target.iloc[xtick], 2),
                      horizontalalignment='center',
                      size='xx-large',
                      color='black',
                      weight='semibold')
        box_plot.text(xtick,
                      mean_main_factor.iloc[xtick] / y_scale,
                      round(mean_main_factor.iloc[xtick], 2),
                      horizontalalignment='center',
                      size='xx-large',
                      color='darkblue',
                      weight='semibold')

    plt.title(f'Swarmplot {title}: {target} vs. {factor}')
    axs.legend(
        arr_for_chart[x_axis_factor].value_counts().sort_index().apply(lambda x: 'Sample count: ' + str(x)),
        loc='upper right'
    )

    plt.show()

#     f.savefig(f'swarmplot_{target}_vs_{factor}.jpg')

## Data processing

In [42]:
# Read every input table in one go (see load_data for the per-file conversions).
dataframes = load_data(
    locus_weights_path='../data/evt_movement_ratio_with_dates.csv',
    temperature_path='../data/FW_temperature_cleared.csv',
    locus_group_matching_path='../data/locus_locus_group_matching.csv',
    fresh_water_dates_path='../data/FW_cycle_dates.csv',
    sw_fw_matching_path='../data/seawater_freshwater_matching.csv',
    sw_fw_matching_path_with_cnt='../data/sw_locus_fw_locus_population_with_counts.csv',
    final_locus_weighted_path='../data/lw_alldates_final.csv',  # _locus
    mortality_path='../data/smolt_dataset_transfers.csv',  # _until2023Feb28_narrow
)

# Unpack the frames into notebook-level names.
(temperature,
 locus_weights,
 locus_group_matching,
 final_locus_weighted,
 fresh_water_dates,
 sw_fw_matching,
 mortality,
 sw_fw_matching_with_cnt) = (
    dataframes[key] for key in (
        'temperature',
        'locus_weigts',
        'locus_group_matching',
        'final_locus_weighted',
        'fresh_water_dates',
        'sw_fw_matching',
        'mortality',
        'sw_fw_matching_with_cnt',
    )
)

#tgc = pd.read_csv('data/transfers_until2023Feb28_with_sw_growth_targets.csv')

# Latest vaccination date per final locus population.
vaccines = pd.read_csv('../data/vaccines_with_final_locus_population_id.csv')  # _transfers_until_Jun2023
vaccines['VAC_EVENT_DATE'] = pd.to_datetime(vaccines['VAC_EVENT_DATE'], format=TIME_FORMAT)
vaccines_agg = vaccines.groupby('FINAL_LOCUS_POPULATION_ID')['VAC_EVENT_DATE'].max().to_frame()

# Renamed in place so the frame shares the population key name used by the other tables.
fresh_water_dates.rename(
    columns={'pretransfer_fw_locus_population_id': 'final_locus_population_id'},
    inplace=True,
)

# Parse the remaining date columns and derive the year columns used later.
sw_fw_matching['transport_date'] = pd.to_datetime(sw_fw_matching['transport_date'], format=TIME_FORMAT)
sw_fw_matching_with_cnt['transfer_date'] = pd.to_datetime(
    sw_fw_matching_with_cnt['transfer_date'], format=TIME_FORMAT
)
sw_fw_matching_with_cnt['transfer_year'] = sw_fw_matching_with_cnt['transfer_date'].dt.year

temperature['event_date'] = pd.to_datetime(temperature['event_date'], format=TIME_FORMAT)
temperature['event_year'] = temperature['event_date'].dt.year

In [43]:
# Forward slashes only: the previous '..\\'-style separator resolved correctly on Windows only.
photoperiod = pd.read_csv('../data/photoperiod_final_lp_UPS.csv')

In [45]:
# Latest event date in the raw photoperiod table (event_date is still a string here).
photoperiod['event_date'].max()

'2023-12-10'

In [46]:
# First and last photoperiod event date per population. String aggregation
# names give a fixed column order (the previous set of builtins did not).
tmp_UPS_length = (
    photoperiod
    .groupby('final_locus_population_id')['event_date']
    .agg(['min', 'max'])
    .reset_index()
)
tmp_UPS_length['max'] = pd.to_datetime(tmp_UPS_length['max'])
tmp_UPS_length['min'] = pd.to_datetime(tmp_UPS_length['min'])

ups_span_days = (tmp_UPS_length['max'] - tmp_UPS_length['min']).dt.days
# UPS_length is blanked when the first record predates 2020-07-01
# (NOTE(review): the reason for this cutoff is not documented - confirm);
# UPS_length2 keeps the span for every population.
tmp_UPS_length['UPS_length'] = np.where(
    tmp_UPS_length['min'] < pd.Timestamp('2020-07-01'), np.nan, ups_span_days
)
tmp_UPS_length['UPS_length2'] = ups_span_days

# tmp_UPS_length[tmp_UPS_length.UPS_length==1]#.describe()
# tmp_UPS_length.rename(columns=[''],inplace=True)

In [47]:
# Summary statistics of the per-population first/last dates and the two span columns.
tmp_UPS_length.describe()

Unnamed: 0,final_locus_population_id,max,min,UPS_length,UPS_length2
count,4512.0,4512,4512,3116.0,4512.0
mean,166828700.0,2021-10-20 20:59:21.702127616,2020-11-03 04:44:02.553191424,359.207959,351.677305
min,36770650.0,2016-02-21 00:00:00,2016-01-01 00:00:00,202.0,47.0
25%,194269100.0,2020-11-22 00:00:00,2019-12-17 00:00:00,341.0,334.0
50%,194506900.0,2022-05-14 00:00:00,2021-07-01 00:00:00,358.0,354.0
75%,194547500.0,2023-02-19 00:00:00,2022-02-16 00:00:00,382.0,376.0
max,194761300.0,2023-12-10 00:00:00,2023-01-30 00:00:00,489.0,489.0
std,59834530.0,,,30.326057,37.590827


In [48]:
# Enrich the photoperiod rows with the UPS spans, the freshwater cycle dates
# (inner join: rows without cycle dates are dropped) and the latest
# vaccination date. The first two merges join on the columns the frames share.
photoperiod = (
    photoperiod
    .merge(
        tmp_UPS_length[['final_locus_population_id', 'UPS_length', 'UPS_length2']],
        how='left',
    )
    .merge(fresh_water_dates, how='inner')
    .merge(
        vaccines_agg,
        how='left',
        left_on='final_locus_population_id',
        right_on='FINAL_LOCUS_POPULATION_ID',
    )
)

photoperiod['shipout_year'] = photoperiod['shipout_date'].dt.year

In [49]:
# Re-check the latest event date after the merges above.
photoperiod['event_date'].max()

'2023-12-10'

In [50]:
# Columns taken from the mortality table.
mortality_cols = [
    'locus_id',
    'fish_group_id',
    'transfer_year',
    'transfer_month',
    'transfer_month_year',
    'transfer_season',
    'transfer_season2',
    'total_count',
    'total_mortality_perc_90',
]

# Columns taken from the seawater <- freshwater matching table.
sw_cols = [
    'to_locus_id',
    'to_fish_group_id',
    'transfer_date',
    'from_locus_population_id',
    'from_count_stocking',
    'transfer_year',
]

# Attach the source population of every transfer; transfers without a match
# keep their row and get population id 0.
mortality_final_locus = (
    mortality[mortality_cols]
    .merge(
        sw_fw_matching_with_cnt[sw_cols],
        how='left',
        left_on=['fish_group_id', 'locus_id', 'transfer_year'],
        right_on=['to_fish_group_id', 'to_locus_id', 'transfer_year'],
    )
    .assign(
        from_locus_population_id=lambda frame: frame['from_locus_population_id'].fillna(0).astype('int32')
    )
)

In [51]:
# six_weeks_condition = fw_mortality['event_date'] > (fw_mortality['shipout_date'] - pd.to_timedelta(6, unit='W'))
# eight_weeks_condition = fw_mortality['event_date'] > (fw_mortality['shipout_date'] - pd.to_timedelta(8, unit='W'))
# threshold_conditions = {
#     '6_weeks_before_transfer': before_transfer & six_weeks_condition,
#     '8_weeks_before_transfer': before_transfer & eight_weeks_condition,
#     'vaccination_transfer': before_transfer & after_vaccination,
#     'first_feeding_transfer': before_transfer & after_first_feeding,
# }

# weeks_before_transfer = [2, 4, 6, 8]
# Grouping keys used when weighting the factors and when averaging mortality.
key_columns = ['locus_id','fish_group_id','transfer_year']

## Factors aggregation

In [52]:
# Every photoperiod column whose name contains 'light' is treated as a light factor.
light_cols = [column_name for column_name in photoperiod.columns
              if 'light' in column_name]

In [53]:
# Aggregations applied to the light_* columns: label -> callable.
# Only uncommented entries run; the commented ones are kept as a menu of alternatives.
agg_functions = {
#     'mean': np.mean,
#     'min': np.min,
#     'max': np.max,
#     'std': np.std,
#     'cv': lambda x: np.std(x) / np.mean(x),  # coefficient of variation
# #     'threshold': np.count_nonzero,
#     'median': np.median,
    'sum': np.sum
}
# Aggregations applied to the UPS_length / UPS_length2 columns: label -> callable.
agg_functions2 = {
    'mean': np.mean,
#     'min': np.min,
#     'max': np.max,
#     'std': np.std,
#     'cv': lambda x: np.std(x) / np.mean(x),  # coefficient of variation
# #     'threshold': np.count_nonzero,
#     'median': np.median,
#     'sum': np.sum
}

In [54]:
# Show which photoperiod columns were picked up as light factors.
light_cols

['light_1', 'light_2', 'light_3']

In [26]:
# For every light column: aggregate per population, weight to the transfer
# level (key_columns) by stocked fish count, attach mortality and save to CSV.
for main_factor in tqdm(light_cols):
    agg_dfs = []

    for name, func in agg_functions.items():
        if name != 'threshold':
            agg_df = create_ph_aggregated_factors(photoperiod,
                                                  main_factor,
                                                  agg_function=func,
                                                  function_name=name,
                                                  env_type=main_factor.lower())
            agg_dfs.append(agg_df)
        else:
            # NOTE: this branch needs threshold_conditions, fw_mortality,
            # min_threshold and max_threshold. Their definitions are commented
            # out in this notebook, so enabling a 'threshold' aggregation
            # raises NameError until they are restored.
            for condition_type, condition in threshold_conditions.items():
                agg_df = create_ph_aggregated_factors_threshold(
                    fw_mortality[condition],
                    main_factor,
                    min_threshold,
                    max_threshold,
                    agg_function=func,
                    condition_type=condition_type,
                    env_type=main_factor.lower()
                )
                agg_dfs.append(agg_df)

    # agg_dfs.append(create_degree_days_aggregated_factor(temperature, main_factor, env_type=main_factor.lower()))

    agg_df = pd.concat(agg_dfs, axis=1)

    # Right join: every transfer row is kept, with NaN factors where the
    # source population has no aggregated values.
    agg_df = agg_df.merge(
        mortality_final_locus[['from_locus_population_id', 'locus_id','fish_group_id', 'transfer_year', 'from_count_stocking']],
        left_on=['final_locus_population_id',],
        right_on=['from_locus_population_id',],
        how='right')

    factors = agg_df.columns.difference(
        ['from_locus_population_id', 'locus_id', 'fish_group_id', 'transfer_year', 'from_count_stocking']
    )
    
    factors_photoperiod = create_factors_df(agg_df,
                                            factors,
                                            key_columns, 
                                            weight_column='from_count_stocking',
                                            weighted_func=weighted_avg)
    factors_photoperiod = factors_photoperiod.merge(
        pd.DataFrame(mortality_final_locus.groupby(key_columns)['total_mortality_perc_90'].mean()),
        on=key_columns,
        how='inner')

    # factors_photoperiod = factors_photoperiod.merge(
    #     pd.DataFrame(tgc.groupby(key_columns)['TGC_SW'].mean()),
    #     on=key_columns,
    #     how='inner'
    # )
    factors_photoperiod.rename(columns={'total_mortality_perc_90': 'mortality'}, inplace=True)
    # Forward slash: the previous backslash separator only resolved to the
    # data directory on Windows.
    factors_photoperiod.to_csv(f'../data/factors_{main_factor.lower()}_UPS.csv', index=False)
    

  0%|          | 0/3 [00:00<?, ?it/s]

Weighting factors



  0%|          | 0/9 [00:00<?, ?it/s][A
 11%|█         | 1/9 [00:02<00:22,  2.85s/it][A
 22%|██▏       | 2/9 [00:05<00:19,  2.84s/it][A
 33%|███▎      | 3/9 [00:08<00:17,  2.86s/it][A
 44%|████▍     | 4/9 [00:11<00:13,  2.79s/it][A
 56%|█████▌    | 5/9 [00:13<00:10,  2.74s/it][A
 67%|██████▋   | 6/9 [00:16<00:08,  2.71s/it][A
 78%|███████▊  | 7/9 [00:19<00:05,  2.78s/it][A
 89%|████████▉ | 8/9 [00:22<00:02,  2.80s/it][A
100%|██████████| 9/9 [00:25<00:00,  2.80s/it][A
 33%|███▎      | 1/3 [00:41<01:23, 41.60s/it]

Weighting factors



  0%|          | 0/9 [00:00<?, ?it/s][A
 11%|█         | 1/9 [00:03<00:26,  3.25s/it][A
 22%|██▏       | 2/9 [00:06<00:21,  3.06s/it][A
 33%|███▎      | 3/9 [00:09<00:17,  2.96s/it][A
 44%|████▍     | 4/9 [00:11<00:14,  2.85s/it][A
 56%|█████▌    | 5/9 [00:14<00:11,  2.82s/it][A
 67%|██████▋   | 6/9 [00:17<00:08,  2.78s/it][A
 78%|███████▊  | 7/9 [00:20<00:05,  2.80s/it][A
 89%|████████▉ | 8/9 [00:22<00:02,  2.80s/it][A
100%|██████████| 9/9 [00:25<00:00,  2.85s/it][A
 67%|██████▋   | 2/3 [01:24<00:42, 42.35s/it]

Weighting factors



  0%|          | 0/9 [00:00<?, ?it/s][A
 11%|█         | 1/9 [00:02<00:22,  2.84s/it][A
 22%|██▏       | 2/9 [00:05<00:20,  2.91s/it][A
 33%|███▎      | 3/9 [00:08<00:17,  2.92s/it][A
 44%|████▍     | 4/9 [00:11<00:14,  2.84s/it][A
 56%|█████▌    | 5/9 [00:14<00:11,  2.80s/it][A
 67%|██████▋   | 6/9 [00:16<00:08,  2.76s/it][A
 78%|███████▊  | 7/9 [00:19<00:05,  2.80s/it][A
 89%|████████▉ | 8/9 [00:22<00:02,  2.82s/it][A
100%|██████████| 9/9 [00:25<00:00,  2.84s/it][A
100%|██████████| 3/3 [02:06<00:00, 42.18s/it]


In [27]:
# Same pipeline as for the light columns, applied to the two UPS span columns
# with the aggregations of agg_functions2; also reports missing values per column.
for main_factor in tqdm(['UPS_length','UPS_length2']):
    agg_dfs = []

    for name, func in agg_functions2.items():
        if name != 'threshold':
            agg_df = create_ph_aggregated_factors(photoperiod,
                                                  main_factor,
                                                  agg_function=func,
                                                  function_name=name,
                                                  env_type=main_factor.lower())
            agg_dfs.append(agg_df)
        else:
            # NOTE: this branch needs threshold_conditions, fw_mortality,
            # min_threshold and max_threshold. Their definitions are commented
            # out in this notebook, so enabling a 'threshold' aggregation
            # raises NameError until they are restored.
            for condition_type, condition in threshold_conditions.items():
                agg_df = create_ph_aggregated_factors_threshold(
                    fw_mortality[condition],
                    main_factor,
                    min_threshold,
                    max_threshold,
                    agg_function=func,
                    condition_type=condition_type,
                    env_type=main_factor.lower()
                )
                agg_dfs.append(agg_df)

    # agg_dfs.append(create_degree_days_aggregated_factor(temperature, main_factor, env_type=main_factor.lower()))

    agg_df = pd.concat(agg_dfs, axis=1)

    # Right join: every transfer row is kept, with NaN factors where the
    # source population has no aggregated values.
    agg_df = agg_df.merge(
        mortality_final_locus[['from_locus_population_id', 'locus_id','fish_group_id', 'transfer_year', 'from_count_stocking']],
        left_on=['final_locus_population_id',],
        right_on=['from_locus_population_id',],
        how='right')

    factors = agg_df.columns.difference(
        ['from_locus_population_id', 'locus_id', 'fish_group_id', 'transfer_year', 'from_count_stocking']
    )

    factors_photoperiod2 = create_factors_df(agg_df,
                                            factors,
                                            key_columns, 
                                            weight_column='from_count_stocking',
                                            weighted_func=weighted_avg)
    factors_photoperiod2 = factors_photoperiod2.merge(
        pd.DataFrame(mortality_final_locus.groupby(key_columns)['total_mortality_perc_90'].mean()),
        on=key_columns,
        how='inner')
    
    factors_photoperiod2.rename(columns={'total_mortality_perc_90': 'mortality'}, inplace=True)

    display(factors_photoperiod2.isna().sum())
    # Forward slash: the previous backslash separator only resolved to the
    # data directory on Windows.
    factors_photoperiod2.to_csv(f'../data/factors_{main_factor.lower()}_UPS.csv', index=False)
    # factors_photoperiod2 = factors_photoperiod2.merge(
    #     pd.DataFrame(tgc.groupby(key_columns)['TGC_SW'].mean()),
    #     on=key_columns,
    #     how='inner'
    # )


  0%|          | 0/2 [00:00<?, ?it/s]

Weighting factors



  0%|          | 0/9 [00:00<?, ?it/s][A
 11%|█         | 1/9 [00:03<00:26,  3.28s/it][A
 22%|██▏       | 2/9 [00:06<00:22,  3.19s/it][A
 33%|███▎      | 3/9 [00:09<00:18,  3.08s/it][A
 44%|████▍     | 4/9 [00:12<00:14,  3.00s/it][A
 56%|█████▌    | 5/9 [00:15<00:11,  2.97s/it][A
 67%|██████▋   | 6/9 [00:18<00:08,  2.98s/it][A
 78%|███████▊  | 7/9 [00:21<00:06,  3.05s/it][A
 89%|████████▉ | 8/9 [00:24<00:03,  3.01s/it][A
100%|██████████| 9/9 [00:27<00:00,  3.02s/it][A


locus_id                                                 0
fish_group_id                                            0
transfer_year                                            0
ups_length-around_vaccination-mean                    1104
ups_length-around_vaccination-mean-from_mean          1104
ups_length-around_vaccination-mean-from_mean-abs      1104
ups_length-transfer_vaccination-mean                  1104
ups_length-transfer_vaccination-mean-from_mean        1104
ups_length-transfer_vaccination-mean-from_mean-abs    1104
ups_length-whole_period-mean                          1104
ups_length-whole_period-mean-from_mean                1104
ups_length-whole_period-mean-from_mean-abs            1104
mortality                                                0
dtype: int64

 50%|█████     | 1/2 [00:44<00:44, 44.74s/it]

Weighting factors



  0%|          | 0/9 [00:00<?, ?it/s][A
 11%|█         | 1/9 [00:02<00:20,  2.62s/it][A
 22%|██▏       | 2/9 [00:05<00:18,  2.62s/it][A
 33%|███▎      | 3/9 [00:07<00:16,  2.67s/it][A
 44%|████▍     | 4/9 [00:10<00:13,  2.66s/it][A
 56%|█████▌    | 5/9 [00:13<00:10,  2.69s/it][A
 67%|██████▋   | 6/9 [00:16<00:08,  2.74s/it][A
 78%|███████▊  | 7/9 [00:18<00:05,  2.69s/it][A
 89%|████████▉ | 8/9 [00:21<00:02,  2.68s/it][A
100%|██████████| 9/9 [00:24<00:00,  2.68s/it][A


locus_id                                                 0
fish_group_id                                            0
transfer_year                                            0
ups_length2-around_vaccination-mean                    330
ups_length2-around_vaccination-mean-from_mean          330
ups_length2-around_vaccination-mean-from_mean-abs      330
ups_length2-transfer_vaccination-mean                  391
ups_length2-transfer_vaccination-mean-from_mean        391
ups_length2-transfer_vaccination-mean-from_mean-abs    391
ups_length2-whole_period-mean                          177
ups_length2-whole_period-mean-from_mean                177
ups_length2-whole_period-mean-from_mean-abs            177
mortality                                                0
dtype: int64

100%|██████████| 2/2 [01:27<00:00, 43.66s/it]


In [None]:
# factors_photoperiod2[key_columns+['ups_length-whole_period-mean']].describe()#.groupby('transfer_year').agg(lambda x:x.isna().sum())
# # factors_photoperiod.merge(factors_photoperiod2[key_columns+['ups_length-whole_period-mean']])

In [None]:
# TODO: work in progress - continue from here.

In [55]:
# Cycle lengths per transfer, plus the first-feeding -> vaccination length
# derived as (first feeding -> shipout) minus (vaccination -> shipout).
cycles = pd.read_csv('../data/factors_cycle.csv').assign(
    fw_cycle_length_first_feeding_vac=lambda frame: (
        frame['fw_cycle_length_first_feeding_shipout'] - frame['fw_cycle_length_vac_shipout']
    )
)

In [56]:
# Inspect the cycle-length table, including the derived fw_cycle_length_first_feeding_vac column.
cycles

Unnamed: 0,locus_id,fish_group_id,transfer_year,fw_cycle_length_first_mvmt_shipout,fw_cycle_length_first_feeding_shipout,fw_cycle_length_vac_shipout,ups_length,fw_cycle_length_first_feeding_vac
0,3046036,11,2017,433.00,252.00,,174.000000,
1,3046036,1052,2019,384.00,238.00,56.00,122.000000,182.00
2,3046036,3918,2020,434.00,255.00,49.00,92.000000,206.00
3,3046036,6165,2023,446.00,287.00,59.63,140.723977,227.37
4,3046043,310,2017,432.00,262.00,,184.000000,
...,...,...,...,...,...,...,...,...
1257,7183889,6071,2022,429.00,308.00,77.00,171.000000,231.00
1258,7183890,6072,2022,414.00,293.00,65.54,143.131365,227.46
1259,7183891,6072,2022,414.00,293.00,64.00,127.000000,229.00
1260,7183892,6069,2022,408.57,286.94,71.20,142.487400,215.74


In [30]:
# Column holding the UPS length; used as denominator and as a "UPS known" mask.
UPS_COL = 'ups_length-whole_period-mean'
# Day counts of the fixed-length windows.
AFTER_FF_DAYS = 4 * 7 + 1   # inclusive 4-week window after first feeding
AROUND_VAC_DAYS = 13        # strictly within one week either side of vaccination

normalized_factors = []
non_normalized_factors = []
factors_photoperiod2 = pd.read_csv('../data/factors_ups_length_UPS.csv')

for main_factor in light_cols:
    factors_photoperiod = pd.read_csv(f'../data/factors_{main_factor.lower()}_UPS.csv')
    factors_photoperiod = factors_photoperiod.merge(factors_photoperiod2[key_columns + [UPS_COL]])
    non_normalized_factors.append(factors_photoperiod)

    factors_photoperiod_normalized = factors_photoperiod.merge(
        cycles, how='inner', on=key_columns
    )

    # Snapshot the denominators before touching any column, so normalising
    # one column can never alter the divisor of another.
    ups_length = factors_photoperiod_normalized[UPS_COL].copy()
    ups_known = ups_length.notna()
    first_feeding_to_shipout = factors_photoperiod_normalized['fw_cycle_length_first_feeding_shipout'].copy()
    vac_to_shipout = factors_photoperiod_normalized['fw_cycle_length_vac_shipout'].copy()
    first_feeding_to_vac = factors_photoperiod_normalized['fw_cycle_length_first_feeding_vac'].copy()

    for col in factors_photoperiod_normalized.columns:
        if col == UPS_COL:
            # The denominator itself is not a factor. It used to match the
            # 'whole_period' rule and was divided by itself (all 1.0 / NaN).
            continue

        values = factors_photoperiod_normalized[col]
        if 'after_first_feeding' in col:
            normalized = values / AFTER_FF_DAYS
        elif 'around_vaccination' in col:
            # Blank the value where the UPS length is unknown.
            normalized = (values / AROUND_VAC_DAYS).where(ups_known)
        elif 'transfer_feeding' in col:
            normalized = values / first_feeding_to_shipout
        elif 'transfer_vaccination' in col:
            # Not exactly correct: this induces NaNs wherever the whole-period
            # UPS length is NaN.
            normalized = values / vac_to_shipout.where(ups_known)
        elif 'vaccination-feeding' in col:
            # Window from first feeding to vaccination. A duplicated branch
            # used to divide by the vaccination -> shipout length instead,
            # leaving this denominator unreachable.
            normalized = values / first_feeding_to_vac
        elif 'whole_period' in col:
            normalized = values / ups_length
        else:
            continue

        factors_photoperiod_normalized[col] = normalized

    factors_photoperiod_normalized.to_csv(f'../data/factors_{main_factor.lower()}_normalized_UPS.csv', index=False)
    normalized_factors.append(factors_photoperiod_normalized)

In [31]:
# Place the per-factor tables side by side (column-wise): one wide frame for the
# normalized factors and one for the raw factors.
# NOTE(review): axis=1 aligns rows by index label, not by the key columns --
# confirm every per-factor frame lists the same rows in the same order.
factors_photoperiod_normalized_merged, factors_photoperiod_non_normalized_merged = (
    pd.concat(frames, axis=1)
    for frames in (normalized_factors, non_normalized_factors)
)

In [34]:
# Rebuild the wide normalized table from the per-factor frames so this cell can
# be re-run on its own, then remove the helper columns from the result.
factors_photoperiod_normalized_merged = pd.concat(normalized_factors, axis=1).drop(
    columns=[
        'fw_cycle_length_first_mvmt_shipout',
        'fw_cycle_length_first_feeding_shipout',
        'fw_cycle_length_vac_shipout',
        'fw_cycle_length_first_feeding_vac',
        # 'TGC_SW',
        'mortality',
    ]
)

# Every per-factor frame carries its own copy of the shared columns; keep only
# the first occurrence of each label.
factors_photoperiod_normalized_merged = factors_photoperiod_normalized_merged.loc[
    :, ~factors_photoperiod_normalized_merged.columns.duplicated()
]

# Tag every non-key column with a `_normalized` suffix in a single pass.
# (Renaming in place one column at a time is quadratic and can cascade if a
# renamed label happens to equal a column that is visited later.)
factors_photoperiod_normalized_merged = factors_photoperiod_normalized_merged.rename(
    columns=lambda col: col if col in key_columns else f'{col}_normalized'
)

factors_photoperiod_normalized_merged

Unnamed: 0,locus_id,fish_group_id,transfer_year,light_1-around_vaccination-sum_normalized,light_1-around_vaccination-sum-from_mean_normalized,light_1-around_vaccination-sum-from_mean-abs_normalized,light_1-transfer_vaccination-sum_normalized,light_1-transfer_vaccination-sum-from_mean_normalized,light_1-transfer_vaccination-sum-from_mean-abs_normalized,light_1-whole_period-sum_normalized,...,light_2-whole_period-sum-from_mean-abs_normalized,light_3-around_vaccination-sum_normalized,light_3-around_vaccination-sum-from_mean_normalized,light_3-around_vaccination-sum-from_mean-abs_normalized,light_3-transfer_vaccination-sum_normalized,light_3-transfer_vaccination-sum-from_mean_normalized,light_3-transfer_vaccination-sum-from_mean-abs_normalized,light_3-whole_period-sum_normalized,light_3-whole_period-sum-from_mean_normalized,light_3-whole_period-sum-from_mean-abs_normalized
0,3046036,11,2017,,,,,,,,...,,,,,,,,,,
1,3046036,1052,2019,,,,,,,,...,,,,,,,,,,
2,3046036,3918,2020,,,,,,,,...,,,,,,,,,,
3,3046036,6165,2023,,,,0.422427,-0.202192,0.205883,,...,,,,,0.534195,0.046480,0.046480,,,
4,3046043,310,2017,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1253,7183889,6071,2022,,,,0.441558,-0.042156,0.042156,,...,,,,,0.545455,0.167761,0.167761,,,
1254,7183890,6072,2022,,,,0.508155,-0.060139,0.060139,,...,,,,,0.476531,0.032795,0.032795,,,
1255,7183891,6072,2022,,,,0.484375,-0.097594,0.097594,,...,,,,,0.500000,0.045587,0.045587,,,
1256,7183892,6069,2022,,,,0.536498,0.013380,0.057012,,...,,,,,0.449438,0.040977,0.040977,,,


In [35]:
# Rebuild the wide raw-factor table from the per-factor frames instead of
# mutating the previously built frame in place: with an in-place drop, running
# this cell a second time raised KeyError because 'mortality' was already gone.
factors_photoperiod_non_normalized_merged = pd.concat(non_normalized_factors, axis=1)

# Every per-factor frame carries its own copy of the shared columns; keep only
# the first occurrence of each label.
factors_photoperiod_non_normalized_merged = factors_photoperiod_non_normalized_merged.loc[
    :, ~factors_photoperiod_non_normalized_merged.columns.duplicated()
]

factors_photoperiod_non_normalized_merged = factors_photoperiod_non_normalized_merged.drop(
    columns=[
        # 'TGC_SW',
        'mortality',
    ]
)

factors_photoperiod_non_normalized_merged

Unnamed: 0,locus_id,fish_group_id,transfer_year,light_1-around_vaccination-sum,light_1-around_vaccination-sum-from_mean,light_1-around_vaccination-sum-from_mean-abs,light_1-transfer_vaccination-sum,light_1-transfer_vaccination-sum-from_mean,light_1-transfer_vaccination-sum-from_mean-abs,light_1-whole_period-sum,...,light_2-whole_period-sum-from_mean-abs,light_3-around_vaccination-sum,light_3-around_vaccination-sum-from_mean,light_3-around_vaccination-sum-from_mean-abs,light_3-transfer_vaccination-sum,light_3-transfer_vaccination-sum-from_mean,light_3-transfer_vaccination-sum-from_mean-abs,light_3-whole_period-sum,light_3-whole_period-sum-from_mean,light_3-whole_period-sum-from_mean-abs
0,3046036,11,2017,,,,,,,,...,,,,,,,,,,
1,3046036,1052,2019,,,,0.000000,-37.246006,37.246006,,...,,,,,0.000000,-29.082428,29.082428,,,
2,3046036,3918,2020,,,,0.000000,-37.246006,37.246006,,...,,,,,36.000000,6.917572,6.917572,,,
3,3046036,6165,2023,,,,25.189310,-12.056696,12.276793,,...,,,,,31.854046,2.771618,2.771618,,,
4,3046043,310,2017,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1402,7205278,6914,2023,,,,32.142065,-5.103941,5.103941,,...,,,,,31.214484,2.132056,2.132056,,,
1403,7205279,6909,2023,,,,37.000000,-0.246006,0.246006,,...,,,,,34.000000,4.917572,4.917572,,,
1404,7205280,6909,2023,,,,36.976083,-0.269923,0.269923,,...,,,,,28.994685,-0.087743,0.087743,,,
1405,7205281,6909,2023,,,,38.166474,0.920467,1.125520,,...,,,,,35.166474,6.084045,6.084045,,,


In [36]:
# Combine raw and normalized factors into one table. The inner join keeps only
# key combinations that are present in both frames.
factors_photoperiod_UPS = pd.merge(
    factors_photoperiod_non_normalized_merged,
    factors_photoperiod_normalized_merged,
    on=key_columns,
    how='inner',
)

In [37]:
# Columns that are never treated as factor columns.
_excluded = key_columns + ['mortality', 'TGC_SW', 'fw_cycle_length']
_candidates = [name for name in factors_photoperiod_UPS.columns if name not in _excluded]


def _ending_with(suffix):
    """Return the candidate factor columns whose name ends with ``suffix``."""
    return [name for name in _candidates if name.endswith(suffix)]


# Raw "sum" factors, one list per period.
column_list_vac_transfer = _ending_with("-transfer_vaccination-sum")
column_list_around_vac = _ending_with("-around_vaccination-sum")
column_list_wholeper = _ending_with("-whole_period-sum")

# The same factors after normalization.
column_list_vac_transfer_norm = _ending_with("-transfer_vaccination-sum_normalized")
column_list_around_vac_norm = _ending_with("-around_vaccination-sum_normalized")
column_list_wholeper_norm = _ending_with("-whole_period-sum_normalized")

# Shortlist 2 holds the normalized factors only; the full shortlist puts the raw
# factors in front of them.
factors_column_shortlisted2 = (
    column_list_vac_transfer_norm
    + column_list_around_vac_norm
    + column_list_wholeper_norm
)
factors_column_shortlisted = (
    column_list_vac_transfer
    + column_list_around_vac
    + column_list_wholeper
    + factors_column_shortlisted2
)

In [38]:
# Summary statistics for the keys, the normalized shortlisted factors and the
# whole-period UPS length.
_summary_columns = [
    *key_columns,
    *factors_column_shortlisted2,
    'ups_length-whole_period-mean',
]
factors_photoperiod_UPS.loc[:, _summary_columns].describe()

Unnamed: 0,locus_id,fish_group_id,transfer_year,light_1-transfer_vaccination-sum_normalized,light_2-transfer_vaccination-sum_normalized,light_3-transfer_vaccination-sum_normalized,light_1-around_vaccination-sum_normalized,light_2-around_vaccination-sum_normalized,light_3-around_vaccination-sum_normalized,light_1-whole_period-sum_normalized,light_2-whole_period-sum_normalized,light_3-whole_period-sum_normalized,ups_length-whole_period-mean
count,1258.0,1258.0,1258.0,176.0,176.0,176.0,1.0,1.0,1.0,0.0,0.0,0.0,196.0
mean,3268427.0,2136.635135,2019.282194,0.55143,0.009357,0.422638,0.846154,0.0,0.153846,,,,360.494644
std,833232.4,2233.166819,1.678357,0.179352,0.027976,0.166972,,,,,,,28.724846
min,3046036.0,11.0,2017.0,0.0,0.0,0.072957,0.846154,0.0,0.153846,,,,283.099578
25%,3047617.0,424.0,2018.0,0.454998,0.0,0.295455,0.846154,0.0,0.153846,,,,338.0
50%,3049242.0,995.0,2019.0,0.578133,0.0,0.396955,0.846154,0.0,0.153846,,,,363.0
75%,3050842.0,3933.75,2020.0,0.68856,0.004566,0.5,0.846154,0.0,0.153846,,,,379.0
max,7183893.0,6179.0,2023.0,0.948444,0.241379,0.981818,0.846154,0.0,0.153846,,,,443.119375


In [39]:
# Persist the combined factor table. The path is built with pathlib because the
# previous '../data\\...' literal only resolves to the data folder on Windows.
factors_photoperiod_UPS.to_csv(Path('../data') / 'factors_photoperiod_UPS_DE.csv', index=False)