## Imports and functions

In [1]:
import datetime
from pathlib import Path
from typing import Dict, List, Union
import warnings
warnings.filterwarnings("ignore")

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib.ticker as ticker
from matplotlib.patches import Patch
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

%matplotlib inline
# Turn off pandas' chained-assignment check (its default mode is 'warn').
pd.options.mode.chained_assignment = None
# A value below one disables matplotlib's "too many open figures" warning.
plt.rcParams.update({'figure.max_open_warning': 0})

# Notebook-wide matplotlib defaults: wide figures and large fonts.
PARAMS = {
    'legend.fontsize': 'xx-large',
    'legend.title_fontsize': 'x-large',
    'figure.figsize': (15, 5),
    'axes.labelsize': 'x-large',
    'axes.titlesize': 'x-large',
    'xtick.labelsize': 'x-large',
    'ytick.labelsize': 'x-large',
}
pylab.rcParams.update(PARAMS)

# Layout of the date strings parsed with pd.to_datetime throughout the notebook.
TIME_FORMAT = '%Y-%m-%d'
# Integer dtype that identifier columns are cast to in load_data.
INT_TYPE = 'int32'

In [2]:
def load_data(locus_weights_path: Union[str, Path],
              temperature_path: Union[str, Path],
              locus_group_matching_path: Union[str, Path],
              fresh_water_dates_path: Union[str, Path],
              sw_fw_matching_path: Union[str, Path],
              sw_fw_matching_path_with_cnt: Union[str, Path],
              final_locus_weighted_path: Union[str, Path],
              mortality_path: Union[str, Path]) -> Dict[str, pd.DataFrame]:
    """Read the eight input CSV files and return them keyed by name.

    Besides reading, the function:
      * parses the date columns of the locus weights, fresh-water dates,
        final-locus weights and mortality frames;
      * casts the whole locus/locus-group matching frame to ``INT_TYPE``;
      * keeps only the 'Freshwater' origin rows and a fixed column subset of
        the sea-water/fresh-water matching frame;
      * adds ``transport_year`` (year of ``transfer_date``) to mortality.

    The temperature and "with counts" matching frames are returned exactly as
    read.

    Returns:
        Dict with the keys 'temperature', 'locus_weigts' (spelling kept on
        purpose, callers index the dict with it), 'locus_group_matching',
        'final_locus_weighted', 'fresh_water_dates', 'sw_fw_matching',
        'sw_fw_matching_with_cnt' and 'mortality'.
    """

    def parse_dates(frame: pd.DataFrame, columns: List[str]) -> None:
        # Convert the listed columns in place using the notebook date layout.
        for column in columns:
            frame[column] = pd.to_datetime(frame[column], format=TIME_FORMAT)

    locus_weights = pd.read_csv(locus_weights_path)
    parse_dates(locus_weights, ['starttime', 'endtime'])

    # Returned untouched; its date column is parsed by the calling cell.
    temperature = pd.read_csv(temperature_path)

    locus_group_matching = pd.read_csv(locus_group_matching_path).astype(INT_TYPE)

    fresh_water_dates = pd.read_csv(fresh_water_dates_path)
    parse_dates(fresh_water_dates,
                ['first_movement_date', 'first_feeding_date', 'shipout_date'])

    sw_fw_matching = pd.read_csv(sw_fw_matching_path)
    sw_fw_matching_with_cnt = pd.read_csv(sw_fw_matching_path_with_cnt)

    sw_fw_cols = [
        'target_seawater_locus_id',
        'transport_date',
        'ponding_date',
        'pretransfer_fw_locus_population_id',
        'fish_count_shipped_out',
        'avg_weight_g_stocked',
    ]
    from_freshwater = sw_fw_matching['origin_site_type'] == 'Freshwater'
    sw_fw_matching = sw_fw_matching.loc[from_freshwater, sw_fw_cols]
    sw_fw_matching['pretransfer_fw_locus_population_id'] = (
        sw_fw_matching['pretransfer_fw_locus_population_id'].astype(INT_TYPE)
    )

    final_locus_weighted = pd.read_csv(final_locus_weighted_path)
    # No explicit format here: pandas infers the layout of this column.
    final_locus_weighted['event_date'] = pd.to_datetime(final_locus_weighted['event_date'])

    mortality = pd.read_csv(mortality_path)
    parse_dates(mortality, ['transfer_date'])
    mortality['transport_year'] = mortality['transfer_date'].dt.year

    return {
        'temperature': temperature,
        'locus_weigts': locus_weights,
        'locus_group_matching': locus_group_matching,
        'final_locus_weighted': final_locus_weighted,
        'fresh_water_dates': fresh_water_dates,
        'sw_fw_matching': sw_fw_matching,
        'sw_fw_matching_with_cnt': sw_fw_matching_with_cnt,
        'mortality': mortality,
    }

def weighted_avg(x, weight, factor):
    """Weighted mean of column ``factor`` using column ``weight`` as weights.

    Rows where either of the two columns is NaN are left out. Warnings raised
    while computing (for example when the weights sum to zero) are silenced.

    Args:
        x: DataFrame holding both columns.
        weight: name of the weight column.
        factor: name of the value column.

    Returns:
        sum(weight * factor) / sum(weight) over the complete rows.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        complete_rows = x[[weight, factor]].dropna()
        weights = complete_rows[weight]
        numerator = (weights * complete_rows[factor]).sum()
        denominator = weights.sum()
        return numerator / denominator
    
# Aggregates are skipped (NaN) as soon as one value of the main factor column is missing.
def apply_condition(group, main_factor, agg_function):
    """Aggregate ``group[main_factor]`` unless the column contains a NaN.

    Args:
        group: DataFrame (typically one groupby group).
        main_factor: name of the column to aggregate.
        agg_function: callable applied to the column.

    Returns:
        ``np.nan`` when any value of the column is null, otherwise
        ``agg_function(group[main_factor])``.
    """
    values = group[main_factor]
    if values.isnull().any():
        return np.nan
    return agg_function(values)


def create_ph_aggregated_factors(ph_df, main_factor, agg_function, function_name, env_type='ph'):
    """Creates features on final_locus_population_id_level

    For every ``final_locus_population_id`` the column ``main_factor`` is
    aggregated with ``agg_function`` over three time windows:

      * ``whole_period``         - every row of ``ph_df``;
      * ``transfer_vaccination`` - rows strictly after ``VAC_EVENT_DATE`` and
        strictly before ``shipout_date``;
      * ``around_vaccination``   - rows strictly within one week either side
        of ``VAC_EVENT_DATE``.

    A window yields NaN for a population when any of its ``main_factor``
    values in that window is NaN (see ``apply_condition``).

    For each window column two companions are added: ``<col>-from_mean``
    (value minus the mean of that column over all populations) and
    ``<col>-from_mean-abs`` (its absolute value).

    Args:
        ph_df: frame with ``final_locus_population_id``, ``event_date``,
            ``shipout_date``, ``VAC_EVENT_DATE`` and ``main_factor`` columns.
        main_factor: name of the measured column.
        agg_function: callable reducing a Series to a scalar.
        function_name: label of ``agg_function`` used in the column names.
        env_type: prefix of the generated column names.

    Returns:
        DataFrame indexed by ``final_locus_population_id`` with columns named
        ``{env_type}-{window}-{function_name}`` plus the from-mean companions.
    """
    event_date = ph_df['event_date']
    vaccination_date = ph_df['VAC_EVENT_DATE']
    one_week = pd.to_timedelta(1, unit='W')

    before_transfer = event_date < ph_df['shipout_date']
    after_vaccination = event_date > vaccination_date
    near_vaccination = (
        (event_date > vaccination_date - one_week)
        & (event_date < vaccination_date + one_week)
    )

    # None means "no filter". Further windows that were disabled in this
    # notebook (after_first_feeding, transfer_feeding, vaccination-feeding)
    # can be re-enabled by adding their mask here.
    windows = {
        'whole_period': None,
        'transfer_vaccination': before_transfer & after_vaccination,
        'around_vaccination': near_vaccination,
    }

    agg_df = pd.DataFrame()
    for window_name, mask in windows.items():
        rows = ph_df if mask is None else ph_df[mask]
        agg_df[f'{env_type}-{window_name}-{function_name}'] = (
            rows
            .groupby('final_locus_population_id')
            .apply(lambda grp: apply_condition(grp, main_factor, agg_function))
        )

    # Means are taken before the companion columns are appended, so they only
    # cover the window columns themselves.
    column_means = agg_df.mean()
    for col in list(agg_df.columns):
        deviation = agg_df[col] - column_means[col]
        agg_df[f'{col}-from_mean'] = deviation
        agg_df[f'{col}-from_mean-abs'] = np.abs(deviation)

    return agg_df


def create_degree_days_aggregated_factor(ph_df, main_factor, env_type='ph',
                                         weeks_before_transfer=(2, 4, 6, 8)):
    """Creates features on final_locus_population_id_level

    For several time windows the "degree days" of ``main_factor`` are computed
    per ``final_locus_population_id`` as

        (last event_date - first event_date in days) * mean(main_factor)

    The windows are: first feeding -> transfer, vaccination -> transfer,
    first feeding -> vaccination, and the last N weeks before transfer for
    every N in ``weeks_before_transfer``. All window bounds are exclusive.

    The date masks used to be read from undefined module-level names, which
    made the function raise ``NameError``; they are now derived from ``ph_df``
    with the same definitions ``create_ph_aggregated_factors`` uses.

    Args:
        ph_df: frame with ``final_locus_population_id``, ``event_date``,
            ``first_feeding_date``, ``VAC_EVENT_DATE``, ``shipout_date`` and
            ``main_factor`` columns.
        main_factor: name of the measured column.
        env_type: prefix of the generated column names.
        weeks_before_transfer: week counts for the "N weeks before transfer"
            windows.

    Returns:
        DataFrame indexed by ``final_locus_population_id`` with
        ``fw_cycle_length`` (days between first and last event in the
        first-feeding -> transfer window) and one
        ``{env_type}-<window>-degree_days`` column per window. Rows are those
        of the first-feeding -> transfer window; other windows are aligned
        to it.
    """
    event_date = ph_df['event_date']
    before_transfer = event_date < ph_df['shipout_date']
    after_first_feeding = event_date > ph_df['first_feeding_date']
    after_vaccination = event_date > ph_df['VAC_EVENT_DATE']
    before_vaccination = event_date < ph_df['VAC_EVENT_DATE']

    def window_stats(mask):
        # Returns (window length in days, degree days) per population.
        grouped = ph_df[mask].groupby('final_locus_population_id')
        span = grouped['event_date'].agg(['max', 'min'])
        length_days = (span['max'] - span['min']).dt.days
        return length_days, length_days * grouped[main_factor].mean()

    agg_df = pd.DataFrame()

    cycle_length, degree_days = window_stats(before_transfer & after_first_feeding)
    agg_df['fw_cycle_length'] = cycle_length
    agg_df[f'{env_type}-transfer_feeding-degree_days'] = degree_days

    _, degree_days = window_stats(before_transfer & after_vaccination)
    agg_df[f'{env_type}-transfer_vaccination-degree_days'] = degree_days

    _, degree_days = window_stats(before_vaccination & after_first_feeding)
    agg_df[f'{env_type}-vaccination_feeding-degree_days'] = degree_days

    for week in weeks_before_transfer:
        window_start = ph_df['shipout_date'] - pd.to_timedelta(week, unit='W')
        _, degree_days = window_stats(before_transfer & (event_date > window_start))
        agg_df[f'{env_type}-{week}_weeks_before_transfer-degree_days'] = degree_days

    return agg_df


def create_ph_aggregated_factors_threshold(ph_df, main_factor, lower_bound, upper_bound, agg_function, condition_type, env_type='ph'):
    """Creates features on final_locus_population_id_level

    ``agg_function`` (expected to be a counting function) is applied per
    ``final_locus_population_id`` to the ``main_factor`` values that are
    above ``upper_bound``, below ``lower_bound`` and between the two bounds
    (inclusive). Each result is also expressed relative to ``agg_function``
    applied to all values of the population.

    The result is indexed by every population present in ``ph_df``.
    Previously the index was taken from the first assigned column, so
    populations without any value above ``upper_bound`` were silently dropped
    instead of receiving the intended 0.

    Args:
        ph_df: frame with ``final_locus_population_id`` and ``main_factor``.
        main_factor: name of the measured column.
        lower_bound: lower threshold.
        upper_bound: upper threshold.
        agg_function: callable reducing a Series to a scalar.
        condition_type: label of the time window, used in the column names.
        env_type: prefix of the generated column names.

    Returns:
        DataFrame with three int64 columns (``higher_max``, ``lower_min``,
        ``btw_min_max``) and three relative columns (``relative_higher_max``,
        ``relative_lower_min``, ``relative_out_min_max``). The relative
        columns are NaN for populations whose ``main_factor`` contains a NaN
        (see ``apply_condition``).
    """
    prefix = f'{env_type}-{condition_type}'
    higher_max = f'{prefix}-higher_max-{upper_bound}'
    lower_min = f'{prefix}-lower_min-{lower_bound}'
    btw_min_max = f'{prefix}-btw_min_max-{lower_bound}-{upper_bound}'

    relative_higher_max = f'{prefix}-relative_higher_max-{upper_bound}'
    relative_lower_min = f'{prefix}-relative_lower_min-{lower_bound}'
    relative_out_min_max = f'{prefix}-relative_out_min_max-{lower_bound}-{upper_bound}'

    def aggregate(rows):
        # groupby.apply on an empty frame does not return a Series, so an
        # empty selection is answered directly.
        if rows.empty:
            return pd.Series(dtype='float64')
        return rows.groupby('final_locus_population_id').apply(
            lambda grp: apply_condition(grp, main_factor, agg_function)
        )

    values = ph_df[main_factor]
    measurements_count = aggregate(ph_df)

    # Seed the result with every population so that "no match" becomes 0.
    agg_df = pd.DataFrame(index=measurements_count.index)
    selections = {
        higher_max: values > upper_bound,
        lower_min: values < lower_bound,
        btw_min_max: values.between(lower_bound, upper_bound),
    }
    for column, mask in selections.items():
        agg_df[column] = (
            aggregate(ph_df[mask])
            .reindex(agg_df.index)
            .fillna(0)
            .astype('int64')
        )

    agg_df[relative_higher_max] = agg_df[higher_max] / measurements_count
    agg_df[relative_lower_min] = agg_df[lower_min] / measurements_count
    agg_df[relative_out_min_max] = 1 - (agg_df[btw_min_max] / measurements_count)

    return agg_df

def create_factors_df(agg_ph_df, factors, key_columns, weight_column, weighted_func):
    """Creates factors df on key_columns level using weight_column to calculate weighted average

    Every factor is reduced per ``key_columns`` group by calling
    ``weighted_func(group, weight_column, factor)``; the per-factor results
    are then inner-joined on ``key_columns`` into one frame.
    """
    print('Weighting factors')
    per_factor_frames = [
        agg_ph_df
        .groupby(key_columns)
        .apply(weighted_func, weight_column, factor)
        .reset_index()
        # The unnamed result of apply surfaces as column 0 after reset_index.
        .rename(columns={0: factor})
        for factor in tqdm(factors)
    ]

    factor_df = per_factor_frames[0]
    for next_frame in per_factor_frames[1:]:
        factor_df = factor_df.merge(next_frame, on=key_columns, how='inner')
    return factor_df

def plot_factors(factors_df, path, feature_columns=None):
    """Draw and save a pairplot of the selected features together with mortality.

    Args:
        factors_df: frame containing the feature columns and a ``mortality``
            column.
        path: directory in which ``mortality_vs_feature_pairplot.jpg`` is
            written (``str`` or ``Path``).
        feature_columns: names of the feature columns to plot. When omitted,
            the index of the notebook-level ``corr_df`` is used, which is
            what this function always relied on.
            NOTE(review): ``corr_df`` is not defined in the cells seen here;
            pass ``feature_columns`` explicitly where possible.
    """
    if feature_columns is None:
        feature_columns = corr_df.index

    # seaborn renamed `size` to `height` and expects a scalar per facet, so
    # the overall 20x15 inch size is set on the figure instead.
    pairplot = sns.pairplot(factors_df[np.append(feature_columns, 'mortality')])
    pairplot.figure.set_size_inches(20, 15)
    # A figure-level title; plt.title would only label the last subplot.
    pairplot.figure.suptitle('Mortality vs feature pairplot', y=1.02)

    for ax in pairplot.axes.flatten():
        # rotate x axis labels
        ax.set_xlabel(ax.get_xlabel(), rotation=45)
        # rotate y axis labels
        ax.set_ylabel(ax.get_ylabel(), rotation=0)
        # set labels alignment
        ax.xaxis.get_label().set_horizontalalignment('right')
        ax.yaxis.get_label().set_horizontalalignment('right')

    # Save before showing so the file contains the adjusted labels.
    pairplot.figure.savefig(Path(path) / 'mortality_vs_feature_pairplot.jpg')
    plt.show()

def plot_swarm(df, factor, target, bins, y_scale=1, title=''):
    """Swarm plot of ``target`` against ``factor`` cut into ``bins``.

    Each bin is annotated with the mean of ``target`` (black) and the mean of
    ``factor`` (dark blue, drawn at ``mean / y_scale``). The legend lists the
    sample count of every bin.

    The factor mean used to be read from the global loop variable
    ``main_factor``; it now uses the ``factor`` argument so the function no
    longer depends on notebook state.

    Args:
        df: frame containing ``factor``, ``target`` and a ``mortality`` column
            (``mortality`` is multiplied by 100 on a copy before plotting).
        factor: name of the column to bin on the x axis.
        target: name of the column plotted on the y axis.
        bins: passed to ``pd.cut``.
        y_scale: divisor applied to the y position of the factor-mean labels.
        title: extra text for the plot title.
    """
    sns.set_style("whitegrid")

    x_axis_factor = factor + '_binned'
    arr_for_chart = df.copy()
    arr_for_chart[x_axis_factor] = pd.cut(df[factor], bins=bins)
    arr_for_chart['mortality'] = arr_for_chart['mortality'] * 100

    f, axs = plt.subplots(1, 1, figsize=(18, 8))
    box_plot = sns.swarmplot(data=arr_for_chart, x=x_axis_factor, y=target, ax=axs)

    # observed=False keeps empty bins so positions line up with the x ticks.
    by_bin = arr_for_chart.groupby(x_axis_factor, observed=False)
    mean_target = by_bin[target].mean()
    mean_factor = by_bin[factor].mean()

    for xtick in box_plot.get_xticks():
        box_plot.text(xtick,
                      mean_target.iloc[xtick],
                      round(mean_target.iloc[xtick], 2),
                      horizontalalignment='center',
                      size='xx-large',
                      color='black',
                      weight='semibold')
        box_plot.text(xtick,
                      mean_factor.iloc[xtick] / y_scale,
                      round(mean_factor.iloc[xtick], 2),
                      horizontalalignment='center',
                      size='xx-large',
                      color='darkblue',
                      weight='semibold')

    axs.set_title(f'Swarmplot {title}: {target} vs. {factor}')
    axs.legend(
        arr_for_chart[x_axis_factor].value_counts().sort_index().apply(lambda x: 'Sample count: ' + str(x)),
        loc='upper right'
    )

    plt.show()

#     f.savefig(f'swarmplot_{target}_vs_{factor}.jpg')

## Data processing

In [3]:
# Read every CSV handled by load_data in one call.
dataframes = load_data(
    locus_weights_path='../data/evt_movement_ratio_with_dates.csv',
    temperature_path='../data/FW_temperature_cleared.csv',
    locus_group_matching_path='../data/locus_locus_group_matching.csv',
    fresh_water_dates_path='../data/FW_cycle_dates.csv',
    sw_fw_matching_path='../data/seawater_freshwater_matching.csv',
    sw_fw_matching_path_with_cnt='../data/sw_locus_fw_locus_population_with_counts.csv',
    final_locus_weighted_path='../data/lw_alldates_final_locus.csv',
    mortality_path='../data/smolt_dataset_transfers_until2023Feb28_narrow.csv',
)

# Unpack into notebook-level names ('locus_weigts' is the key load_data uses).
locus_weights = dataframes['locus_weigts']
locus_group_matching = dataframes['locus_group_matching']
final_locus_weighted = dataframes['final_locus_weighted']
fresh_water_dates = dataframes['fresh_water_dates']
temperature = dataframes['temperature']
mortality = dataframes['mortality']
sw_fw_matching = dataframes['sw_fw_matching']
sw_fw_matching_with_cnt = dataframes['sw_fw_matching_with_cnt']

# Sea-water growth targets and vaccination events are read separately.
tgc = pd.read_csv('../data/transfers_until2023Feb28_with_sw_growth_targets.csv')
vaccines = pd.read_csv('../data/vaccines_with_final_locus_population_id_transfers_until_Jun2023.csv')
vaccines['VAC_EVENT_DATE'] = pd.to_datetime(vaccines['VAC_EVENT_DATE'], format=TIME_FORMAT)
# Latest vaccination date per final locus population.
vaccines_agg = (
    vaccines
    .groupby('FINAL_LOCUS_POPULATION_ID')['VAC_EVENT_DATE']
    .max()
    .to_frame()
)

# Align the population key name with the other frames.
fresh_water_dates.rename(
    columns={'pretransfer_fw_locus_population_id': 'final_locus_population_id'},
    inplace=True,
)

sw_fw_matching['transport_date'] = pd.to_datetime(sw_fw_matching['transport_date'], format=TIME_FORMAT)
sw_fw_matching_with_cnt['transfer_date'] = pd.to_datetime(
    sw_fw_matching_with_cnt['transfer_date'], format=TIME_FORMAT
)
sw_fw_matching_with_cnt['transfer_year'] = sw_fw_matching_with_cnt['transfer_date'].dt.year

temperature['event_date'] = pd.to_datetime(temperature['event_date'], format=TIME_FORMAT)
temperature['event_year'] = temperature['event_date'].dt.year

In [4]:
# Daily light readings (light_1..light_3) per final locus population. The path
# is relative to the notebook's working directory, unlike the '../data' inputs.
photoperiod = pd.read_csv('photoperiod_final_lp_UPS.csv')

In [5]:
# Quick look at the raw layout; event_date is still a plain string here.
photoperiod.head()

Unnamed: 0,final_locus_population_id,event_date,light_1,light_2,light_3
0,36770651,2016-02-03,0.0,0.4893,0.0
1,36770651,2016-02-04,0.0,0.4893,0.0
2,36770651,2016-02-05,0.0,0.4893,0.0
3,36770651,2016-02-06,0.0,0.8636,0.0
4,36770651,2016-02-07,0.0,0.8636,0.0


In [6]:
# Attach the fresh-water cycle dates and the latest vaccination date to every
# photoperiod row.
photoperiod = (
    photoperiod
    # No `on=` is given, so pandas joins on every column name both frames share.
    .merge(fresh_water_dates, how='inner')
    # Left join: populations without a vaccination record keep a missing VAC_EVENT_DATE.
    .merge(
        vaccines_agg,
        left_on='final_locus_population_id',
        right_on='FINAL_LOCUS_POPULATION_ID',
        how='left',
    )
)

photoperiod['shipout_year'] = photoperiod['shipout_date'].dt.year

In [7]:
# Columns kept from the mortality dataset.
mortality_cols = [
    'locus_id',
    'fish_group_id',
    'transfer_year',
    'transfer_month',
    'transfer_month_year',
    'transfer_season',
    'transfer_season2',
    'total_count',
    'total_mortality_perc_90',
]

# Columns kept from the sea-water / fresh-water matching (with counts).
sw_cols = [
    'to_locus_id',
    'to_fish_group_id',
    'transfer_date',
    'from_locus_population_id',
    'from_count_stocking',
    'transfer_year',
]

# One row per (mortality record, contributing fresh-water population);
# mortality records without a match are kept by the left join.
mortality_final_locus = pd.merge(
    mortality[mortality_cols],
    sw_fw_matching_with_cnt[sw_cols],
    left_on=['fish_group_id', 'locus_id', 'transfer_year'],
    right_on=['to_fish_group_id', 'to_locus_id', 'transfer_year'],
    how='left',
)

# Unmatched rows get population id 0 so the column can be cast to an integer dtype.
mortality_final_locus['from_locus_population_id'] = (
    mortality_final_locus['from_locus_population_id'].fillna(0).astype('int32')
)

In [8]:

# Disabled threshold configuration, kept for reference. The 'threshold' branch
# of the aggregation loop reads `threshold_conditions` (together with
# `fw_mortality`, `min_threshold` and `max_threshold`), so all of these have to
# be defined before 'threshold' is re-enabled in `agg_functions`.
# six_weeks_condition = fw_mortality['event_date'] > (fw_mortality['shipout_date'] - pd.to_timedelta(6, unit='W'))
# eight_weeks_condition = fw_mortality['event_date'] > (fw_mortality['shipout_date'] - pd.to_timedelta(8, unit='W'))
# threshold_conditions = {
#     '6_weeks_before_transfer': before_transfer & six_weeks_condition,
#     '8_weeks_before_transfer': before_transfer & eight_weeks_condition,
#     'vaccination_transfer': before_transfer & after_vaccination,
#     'first_feeding_transfer': before_transfer & after_first_feeding,
# }

# weeks_before_transfer = [2, 4, 6, 8]
# Columns identifying one row of the final factor tables; factors are grouped,
# weighted and merged on these keys.
key_columns = ['locus_id','fish_group_id','transfer_year']

## Factors aggregation

In [9]:
# Every photoperiod column whose name contains 'light' is treated as a factor.
light_cols = [col for col in photoperiod.columns if 'light' in col]

In [10]:
# Aggregations applied to each light column, keyed by the label that ends up in
# the generated column names. Only 'sum' is active; the other entries are kept
# commented out so they can be switched back on.
agg_functions = {
#     'mean': np.mean,
#     'min': np.min,
#     'max': np.max,
#     'std': np.std,
#     'cv': lambda x: np.std(x) / np.mean(x),  # coefficient of variation
# #     'threshold': np.count_nonzero,
#     'median': np.median,
    'sum': np.sum
}

In [11]:
# Inputs that do not depend on the light column are prepared once instead of
# being recomputed on every loop iteration.
population_key_cols = ['from_locus_population_id', 'locus_id', 'fish_group_id',
                       'transfer_year', 'from_count_stocking']
stocking_by_population = mortality_final_locus[population_key_cols]
mortality_by_key = pd.DataFrame(
    mortality_final_locus.groupby(key_columns)['total_mortality_perc_90'].mean()
)
tgc_by_key = pd.DataFrame(tgc.groupby(key_columns)['TGC_SW'].mean())

for main_factor in tqdm(light_cols):
    agg_dfs = []

    # 1) Aggregate the light column per fresh-water population.
    for name, func in agg_functions.items():
        if name != 'threshold':
            agg_df = create_ph_aggregated_factors(photoperiod,
                                                  main_factor,
                                                  agg_function=func,
                                                  function_name=name,
                                                  env_type=main_factor.lower())
            agg_dfs.append(agg_df)
        else:
            # NOTE: only reached when 'threshold' is enabled in agg_functions.
            # It needs threshold_conditions, fw_mortality, min_threshold and
            # max_threshold, none of which are defined by the active cells.
            for condition_type, condition in threshold_conditions.items():
                agg_df = create_ph_aggregated_factors_threshold(
                    fw_mortality[condition],
                    main_factor,
                    min_threshold,
                    max_threshold,
                    agg_function=func,
                    condition_type=condition_type,
                    env_type=main_factor.lower()
                )
                agg_dfs.append(agg_df)

    # agg_dfs.append(create_degree_days_aggregated_factor(temperature, main_factor, env_type=main_factor.lower()))

    agg_df = pd.concat(agg_dfs, axis=1)

    # 2) Attach the sea-water keys and stocking counts. The right join keeps
    #    every mortality row, including those without photoperiod data.
    agg_df = agg_df.merge(
        stocking_by_population,
        left_on=['final_locus_population_id',],
        right_on=['from_locus_population_id',],
        how='right')

    factors = agg_df.columns.difference(population_key_cols)

    # 3) Weighted average (by stocked fish count) per key_columns group.
    factors_photoperiod = create_factors_df(agg_df,
                                            factors,
                                            key_columns,
                                            weight_column='from_count_stocking',
                                            weighted_func=weighted_avg)

    # 4) Add the targets: mean 90-day mortality and mean sea-water TGC.
    factors_photoperiod = factors_photoperiod.merge(
        mortality_by_key,
        on=key_columns,
        how='inner')

    factors_photoperiod = factors_photoperiod.merge(
        tgc_by_key,
        on=key_columns,
        how='inner'
    )

    factors_photoperiod.rename(columns={'total_mortality_perc_90': 'mortality'}, inplace=True)

    # Missing-value count per column, then persist one CSV per light column.
    display(factors_photoperiod.isna().sum())
    factors_photoperiod.to_csv(f'./factors_{main_factor.lower()}_UPS.csv', index=False)

  0%|                                                     | 0/3 [00:00<?, ?it/s]

Weighting factors



  0%|                                                     | 0/9 [00:00<?, ?it/s][A
 11%|█████                                        | 1/9 [00:08<01:05,  8.25s/it][A
 22%|██████████                                   | 2/9 [00:11<00:37,  5.43s/it][A
 33%|███████████████                              | 3/9 [00:15<00:29,  4.88s/it][A
 44%|████████████████████                         | 4/9 [00:20<00:22,  4.59s/it][A
 56%|█████████████████████████                    | 5/9 [00:23<00:16,  4.05s/it][A
 67%|██████████████████████████████               | 6/9 [00:26<00:11,  3.84s/it][A
 78%|███████████████████████████████████          | 7/9 [00:29<00:07,  3.56s/it][A
 89%|████████████████████████████████████████     | 8/9 [00:32<00:03,  3.34s/it][A
100%|█████████████████████████████████████████████| 9/9 [00:35<00:00,  3.94s/it][A


locus_id                                            0
fish_group_id                                       0
transfer_year                                       0
light_1-around_vaccination-sum                    279
light_1-around_vaccination-sum-from_mean          279
light_1-around_vaccination-sum-from_mean-abs      279
light_1-transfer_vaccination-sum                  338
light_1-transfer_vaccination-sum-from_mean        338
light_1-transfer_vaccination-sum-from_mean-abs    338
light_1-whole_period-sum                          136
light_1-whole_period-sum-from_mean                136
light_1-whole_period-sum-from_mean-abs            136
mortality                                           0
TGC_SW                                             66
dtype: int64

 33%|███████████████                              | 1/3 [00:48<01:37, 48.60s/it]

Weighting factors



  0%|                                                     | 0/9 [00:00<?, ?it/s][A
 11%|█████                                        | 1/9 [00:03<00:25,  3.17s/it][A
 22%|██████████                                   | 2/9 [00:06<00:22,  3.25s/it][A
 33%|███████████████                              | 3/9 [00:09<00:19,  3.17s/it][A
 44%|████████████████████                         | 4/9 [00:12<00:16,  3.20s/it][A
 56%|█████████████████████████                    | 5/9 [00:16<00:13,  3.27s/it][A
 67%|██████████████████████████████               | 6/9 [00:19<00:09,  3.18s/it][A
 78%|███████████████████████████████████          | 7/9 [00:22<00:06,  3.09s/it][A
 89%|████████████████████████████████████████     | 8/9 [00:25<00:03,  3.09s/it][A
100%|█████████████████████████████████████████████| 9/9 [00:28<00:00,  3.18s/it][A


locus_id                                            0
fish_group_id                                       0
transfer_year                                       0
light_2-around_vaccination-sum                    279
light_2-around_vaccination-sum-from_mean          279
light_2-around_vaccination-sum-from_mean-abs      279
light_2-transfer_vaccination-sum                  338
light_2-transfer_vaccination-sum-from_mean        338
light_2-transfer_vaccination-sum-from_mean-abs    338
light_2-whole_period-sum                          136
light_2-whole_period-sum-from_mean                136
light_2-whole_period-sum-from_mean-abs            136
mortality                                           0
TGC_SW                                             66
dtype: int64

 67%|██████████████████████████████               | 2/3 [01:24<00:40, 40.97s/it]

Weighting factors



  0%|                                                     | 0/9 [00:00<?, ?it/s][A
 11%|█████                                        | 1/9 [00:03<00:24,  3.08s/it][A
 22%|██████████                                   | 2/9 [00:06<00:21,  3.11s/it][A
 33%|███████████████                              | 3/9 [00:09<00:18,  3.08s/it][A
 44%|████████████████████                         | 4/9 [00:12<00:15,  3.03s/it][A
 56%|█████████████████████████                    | 5/9 [00:16<00:14,  3.62s/it][A
 67%|██████████████████████████████               | 6/9 [00:20<00:10,  3.50s/it][A
 78%|███████████████████████████████████          | 7/9 [00:23<00:06,  3.34s/it][A
 89%|████████████████████████████████████████     | 8/9 [00:26<00:03,  3.26s/it][A
100%|█████████████████████████████████████████████| 9/9 [00:29<00:00,  3.26s/it][A


locus_id                                            0
fish_group_id                                       0
transfer_year                                       0
light_3-around_vaccination-sum                    279
light_3-around_vaccination-sum-from_mean          279
light_3-around_vaccination-sum-from_mean-abs      279
light_3-transfer_vaccination-sum                  338
light_3-transfer_vaccination-sum-from_mean        338
light_3-transfer_vaccination-sum-from_mean-abs    338
light_3-whole_period-sum                          136
light_3-whole_period-sum-from_mean                136
light_3-whole_period-sum-from_mean-abs            136
mortality                                           0
TGC_SW                                             66
dtype: int64

100%|█████████████████████████████████████████████| 3/3 [02:00<00:00, 40.30s/it]


In [12]:
# Fresh-water cycle lengths per key; the first-feeding -> vaccination length is
# derived as (first feeding -> shipout) minus (vaccination -> shipout).
cycles = pd.read_csv('../data/factors_cycle.csv').assign(
    fw_cycle_length_first_feeding_vac=lambda frame: (
        frame['fw_cycle_length_first_feeding_shipout'] - frame['fw_cycle_length_vac_shipout']
    )
)

In [13]:
normalized_factors = []
non_normalized_factors = []

# (substring looked for in the column name, divisor). A number is a fixed
# window length in days; a string names the `cycles` column holding the
# per-row window length. The first matching entry wins.
#
# The previous if/elif chain tested 'vaccination-feeding' twice: the first
# test divided by the vaccination -> shipout length and made the intended
# division by the first-feeding -> vaccination length unreachable.
window_divisors = [
    # First-feeding window: 4 weeks with both end dates included.
    ('after_first_feeding', 4 * 7 + 1),
    # Days strictly inside +/- 1 week around vaccination (one row per day assumed).
    ('around_vaccination', 13),
    ('transfer_feeding', 'fw_cycle_length_first_feeding_shipout'),
    ('transfer_vaccination', 'fw_cycle_length_vac_shipout'),
    ('vaccination-feeding', 'fw_cycle_length_first_feeding_vac'),
    ('whole_period', 'fw_cycle_length_first_mvmt_shipout'),
]

for main_factor in light_cols:
    factors_photoperiod = pd.read_csv(f'./factors_{main_factor.lower()}_UPS.csv')
    non_normalized_factors.append(factors_photoperiod)

    # merge returns a new frame, so the non-normalized frame stays untouched.
    factors_photoperiod_normalized = factors_photoperiod.merge(
        cycles, how='inner', on=key_columns
    )

    for col in factors_photoperiod_normalized.columns:
        for window_tag, divisor in window_divisors:
            if window_tag in col:
                if isinstance(divisor, str):
                    divisor = factors_photoperiod_normalized[divisor]
                factors_photoperiod_normalized[col] = factors_photoperiod_normalized[col] / divisor
                break

    factors_photoperiod_normalized.to_csv(f'./factors_{main_factor.lower()}_normalized_UPS.csv', index=False)
    normalized_factors.append(factors_photoperiod_normalized)

In [14]:
# Put the per-light-column frames side by side. concat(axis=1) aligns rows on
# the row index, and the key/target columns are repeated once per frame; the
# duplicates are removed in the following cells.
factors_photoperiod_normalized_merged = pd.concat(normalized_factors, axis=1)
factors_photoperiod_non_normalized_merged = pd.concat(non_normalized_factors, axis=1)

In [15]:
# Rebuilt from the list so the cell can be re-run safely.
factors_photoperiod_normalized_merged = pd.concat(normalized_factors, axis=1)

# Cycle lengths and targets are not normalized factors; drop every copy of them.
factors_photoperiod_normalized_merged = factors_photoperiod_normalized_merged.drop(
    columns=[
        'fw_cycle_length_first_mvmt_shipout',
        'fw_cycle_length_first_feeding_shipout',
        'fw_cycle_length_vac_shipout',
        'fw_cycle_length_first_feeding_vac',
        'TGC_SW',
        'mortality',
    ]
)

# Keep the first occurrence of each repeated column (the key columns).
is_repeated = factors_photoperiod_normalized_merged.columns.duplicated()
factors_photoperiod_normalized_merged = factors_photoperiod_normalized_merged.loc[:, ~is_repeated]

# Tag every factor column so it cannot clash with its non-normalized twin.
factors_photoperiod_normalized_merged = factors_photoperiod_normalized_merged.rename(
    columns=lambda col: col if col in key_columns else f'{col}_normalized'
)

factors_photoperiod_normalized_merged

Unnamed: 0,locus_id,fish_group_id,transfer_year,light_1-around_vaccination-sum_normalized,light_1-around_vaccination-sum-from_mean_normalized,light_1-around_vaccination-sum-from_mean-abs_normalized,light_1-transfer_vaccination-sum_normalized,light_1-transfer_vaccination-sum-from_mean_normalized,light_1-transfer_vaccination-sum-from_mean-abs_normalized,light_1-whole_period-sum_normalized,...,light_2-whole_period-sum-from_mean-abs_normalized,light_3-around_vaccination-sum_normalized,light_3-around_vaccination-sum-from_mean_normalized,light_3-around_vaccination-sum-from_mean-abs_normalized,light_3-transfer_vaccination-sum_normalized,light_3-transfer_vaccination-sum-from_mean_normalized,light_3-transfer_vaccination-sum-from_mean-abs_normalized,light_3-whole_period-sum_normalized,light_3-whole_period-sum-from_mean_normalized,light_3-whole_period-sum-from_mean-abs_normalized
0,3046036,11,2017,,,,,,,0.000000,...,0.257180,,,,,,,0.000000,-0.058571,0.058571
1,3046036,1052,2019,0.000000,-0.692186,0.692186,0.000000,-0.705931,0.705931,0.000000,...,0.182523,0.0,-0.027826,0.027826,0.000000,-0.478029,0.478029,0.000000,-0.066045,0.066045
2,3046036,3918,2020,0.183762,-0.508424,0.508424,0.000000,-0.806778,0.806778,0.036782,...,0.033754,0.0,-0.027826,0.027826,0.734694,0.188375,0.188375,0.082949,0.024513,0.024513
3,3046036,6165,2023,1.000163,0.307978,0.307978,0.422427,-0.240531,0.240531,0.163227,...,0.083135,0.0,-0.027826,0.027826,0.534195,0.085266,0.085266,0.077622,0.020758,0.020758
4,3046043,310,2017,,,,,,,0.000000,...,0.306592,,,,,,,0.000000,-0.058707,0.058707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1257,7183889,6071,2022,1.000064,0.307878,0.307878,0.441558,-0.071846,0.071846,0.245002,...,0.096303,0.0,-0.027826,0.027826,0.545455,0.197797,0.197797,0.106395,0.047278,0.047278
1258,7183890,6072,2022,1.000004,0.307818,0.307818,0.508155,-0.095021,0.095021,0.225711,...,0.100170,0.0,-0.027826,0.027826,0.476531,0.068084,0.068084,0.081681,0.020422,0.020422
1259,7183891,6072,2022,0.999977,0.307791,0.307791,0.484375,-0.133315,0.133315,0.222032,...,0.099306,0.0,-0.027826,0.027826,0.500000,0.081724,0.081724,0.078026,0.016767,0.016767
1260,7183892,6069,2022,0.984519,0.292333,0.292333,0.536498,-0.018729,0.030687,0.204900,...,0.098371,0.0,-0.027826,0.027826,0.449438,0.073460,0.073460,0.081812,0.019739,0.019739


In [16]:
# Rebuilt from the list (as the normalized cell does) so this cell can be
# re-run: dropping in place from a frame built in an earlier cell raised
# KeyError on the second execution.
factors_photoperiod_non_normalized_merged = pd.concat(non_normalized_factors, axis=1)

# Keep the first occurrence of each repeated column (keys and targets).
factors_photoperiod_non_normalized_merged = factors_photoperiod_non_normalized_merged.loc[
    :, ~factors_photoperiod_non_normalized_merged.columns.duplicated()
]

# The targets are not factors.
factors_photoperiod_non_normalized_merged = factors_photoperiod_non_normalized_merged.drop(
    columns=['TGC_SW', 'mortality']
)

factors_photoperiod_non_normalized_merged

Unnamed: 0,locus_id,fish_group_id,transfer_year,light_1-around_vaccination-sum,light_1-around_vaccination-sum-from_mean,light_1-around_vaccination-sum-from_mean-abs,light_1-transfer_vaccination-sum,light_1-transfer_vaccination-sum-from_mean,light_1-transfer_vaccination-sum-from_mean-abs,light_1-whole_period-sum,...,light_2-whole_period-sum-from_mean-abs,light_3-around_vaccination-sum,light_3-around_vaccination-sum-from_mean,light_3-around_vaccination-sum-from_mean-abs,light_3-transfer_vaccination-sum,light_3-transfer_vaccination-sum-from_mean,light_3-transfer_vaccination-sum-from_mean-abs,light_3-whole_period-sum,light_3-whole_period-sum-from_mean,light_3-whole_period-sum-from_mean-abs
0,3046036,11,2017,,,,,,,0.000000,...,111.359123,,,,,,,0.000000,-25.361343,25.361343
1,3046036,1052,2019,0.000000,-8.998415,8.998415,0.000000,-39.532146,39.532146,0.000000,...,70.088960,0.0,-0.361739,0.361739,0.000000,-26.769642,26.769642,0.000000,-25.361343,25.361343
2,3046036,3918,2020,2.388900,-6.609515,6.609515,0.000000,-39.532146,39.532146,15.963200,...,14.649087,0.0,-0.361739,0.361739,36.000000,9.230358,9.230358,36.000000,10.638657,10.638657
3,3046036,6165,2023,13.002123,4.003708,4.003708,25.189310,-14.342836,14.342836,72.799433,...,37.078135,0.0,-0.361739,0.361739,31.854046,5.084404,5.084404,34.619243,9.257900,9.257900
4,3046043,310,2017,,,,,,,0.000000,...,132.447555,,,,,,,0.000000,-25.361343,25.361343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1257,7183889,6071,2022,13.000830,4.002415,4.002415,34.000000,-5.532146,5.532146,105.105746,...,41.314187,0.0,-0.361739,0.361739,42.000000,15.230358,15.230358,45.643432,20.282090,20.282090
1258,7183890,6072,2022,13.000046,4.001630,4.001630,33.304481,-6.227666,6.227666,93.444270,...,41.470388,0.0,-0.361739,0.361739,31.231840,4.462198,4.462198,33.816114,8.454771,8.454771
1259,7183891,6072,2022,12.999700,4.001285,4.001285,31.000000,-8.532146,8.532146,91.921300,...,41.112887,0.0,-0.361739,0.361739,32.000000,5.230358,5.230358,32.302700,6.941357,6.941357
1260,7183892,6069,2022,12.798749,3.800334,3.800334,38.198667,-1.333480,2.184911,83.715943,...,40.191482,0.0,-0.361739,0.361739,32.000000,5.230358,5.230358,33.426104,8.064761,8.064761


In [17]:
# Inner-join the non-normalized and the normalized factor tables on
# `key_columns`: only key combinations present in both tables are kept.
factors_photoperiod_UPS = pd.merge(
    left=factors_photoperiod_non_normalized_merged,
    right=factors_photoperiod_normalized_merged,
    on=key_columns,
    how='inner',
)

In [18]:
# Show the merged table (non-normalized plus normalized factor columns) as this cell's output.
factors_photoperiod_UPS

Unnamed: 0,locus_id,fish_group_id,transfer_year,light_1-around_vaccination-sum,light_1-around_vaccination-sum-from_mean,light_1-around_vaccination-sum-from_mean-abs,light_1-transfer_vaccination-sum,light_1-transfer_vaccination-sum-from_mean,light_1-transfer_vaccination-sum-from_mean-abs,light_1-whole_period-sum,...,light_2-whole_period-sum-from_mean-abs_normalized,light_3-around_vaccination-sum_normalized,light_3-around_vaccination-sum-from_mean_normalized,light_3-around_vaccination-sum-from_mean-abs_normalized,light_3-transfer_vaccination-sum_normalized,light_3-transfer_vaccination-sum-from_mean_normalized,light_3-transfer_vaccination-sum-from_mean-abs_normalized,light_3-whole_period-sum_normalized,light_3-whole_period-sum-from_mean_normalized,light_3-whole_period-sum-from_mean-abs_normalized
0,3046036,11,2017,,,,,,,0.000000,...,0.257180,,,,,,,0.000000,-0.058571,0.058571
1,3046036,1052,2019,0.000000,-8.998415,8.998415,0.000000,-39.532146,39.532146,0.000000,...,0.182523,0.0,-0.027826,0.027826,0.000000,-0.478029,0.478029,0.000000,-0.066045,0.066045
2,3046036,3918,2020,2.388900,-6.609515,6.609515,0.000000,-39.532146,39.532146,15.963200,...,0.033754,0.0,-0.027826,0.027826,0.734694,0.188375,0.188375,0.082949,0.024513,0.024513
3,3046036,6165,2023,13.002123,4.003708,4.003708,25.189310,-14.342836,14.342836,72.799433,...,0.083135,0.0,-0.027826,0.027826,0.534195,0.085266,0.085266,0.077622,0.020758,0.020758
4,3046043,310,2017,,,,,,,0.000000,...,0.306592,,,,,,,0.000000,-0.058707,0.058707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1257,7183889,6071,2022,13.000830,4.002415,4.002415,34.000000,-5.532146,5.532146,105.105746,...,0.096303,0.0,-0.027826,0.027826,0.545455,0.197797,0.197797,0.106395,0.047278,0.047278
1258,7183890,6072,2022,13.000046,4.001630,4.001630,33.304481,-6.227666,6.227666,93.444270,...,0.100170,0.0,-0.027826,0.027826,0.476531,0.068084,0.068084,0.081681,0.020422,0.020422
1259,7183891,6072,2022,12.999700,4.001285,4.001285,31.000000,-8.532146,8.532146,91.921300,...,0.099306,0.0,-0.027826,0.027826,0.500000,0.081724,0.081724,0.078026,0.016767,0.016767
1260,7183892,6069,2022,12.798749,3.800334,3.800334,38.198667,-1.333480,2.184911,83.715943,...,0.098371,0.0,-0.027826,0.027826,0.449438,0.073460,0.073460,0.081812,0.019739,0.019739


In [19]:
# List the column labels of the merged table to check which factor columns it carries.
factors_photoperiod_UPS.columns

Index(['locus_id', 'fish_group_id', 'transfer_year',
       'light_1-around_vaccination-sum',
       'light_1-around_vaccination-sum-from_mean',
       'light_1-around_vaccination-sum-from_mean-abs',
       'light_1-transfer_vaccination-sum',
       'light_1-transfer_vaccination-sum-from_mean',
       'light_1-transfer_vaccination-sum-from_mean-abs',
       'light_1-whole_period-sum', 'light_1-whole_period-sum-from_mean',
       'light_1-whole_period-sum-from_mean-abs',
       'light_2-around_vaccination-sum',
       'light_2-around_vaccination-sum-from_mean',
       'light_2-around_vaccination-sum-from_mean-abs',
       'light_2-transfer_vaccination-sum',
       'light_2-transfer_vaccination-sum-from_mean',
       'light_2-transfer_vaccination-sum-from_mean-abs',
       'light_2-whole_period-sum', 'light_2-whole_period-sum-from_mean',
       'light_2-whole_period-sum-from_mean-abs',
       'light_3-around_vaccination-sum',
       'light_3-around_vaccination-sum-from_mean',
       'lig

In [20]:
# Write the merged factor table to a CSV file in the current working directory;
# index=False leaves the row index out of the file.
factors_photoperiod_UPS.to_csv(
    path_or_buf='factors_photoperiod_UPS.csv',
    index=False,
)