#  ==================================
# IMPORTS
#  ==================================

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# apply seaborn's default theme to all subsequent matplotlib figures
sns.set_theme()

# jupyter notebook full-width display
# NOTE: `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated and `display` is no longer
# importable from there in recent IPython releases
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# no text wrapping inside rendered dataframe cells
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>"))

# pandas formatting
pd.set_option('display.float_format', '{:.3f}'.format)  # floats rendered with 3 decimals
pd.set_option('display.max_columns', None)              # never truncate columns
pd.set_option('display.max_rows', 200)                  # show up to 200 rows before truncating
pd.set_option('display.max_colwidth', 400)              # show up to 400 characters per cell

In [2]:
# load the previously pickled dataframes from the working directory
# (files are read in the order listed, one per dataframe)
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source
df_SD, df_FD, df_LF, df_Site, df_TrapSupervisors = map(pd.read_pickle, [
    'df_SD.pickle',
    'df_FD.pickle',
    'df_LF.pickle',
    'df_Site.pickle',
    'df_TrapSupervisors.pickle',
])

#  ==================================
# FISH DETAILS
#  ==================================

## Combine Age 2 and Age 3 Columns

In [3]:
# sanity check before combining: for each ageing-2 / ageing-3 column pair, count the rows
# in which BOTH columns are populated (all zeros means the two sets never overlap)
tuple(
    len(df_FD[df_FD[col_2].notnull() & df_FD[col_3].notnull()])
    for col_2, col_3 in [
        ('Ager_2', 'Ager_3'),
        ('AGE_2', 'AGE_3'),
        ('FSP_2', 'FSP_3'),
        ('Comments_2', 'Comments_3'),
    ]
)

(0, 0, 0, 0)

In [5]:
# number of rows holding any ageing-2 data (at least one of the four *_2 columns is populated)
int(df_FD[['Ager_2', 'AGE_2', 'FSP_2', 'Comments_2']].notnull().any(axis=1).sum())

1683

In [4]:
# number of rows holding any ageing-3 data (at least one of the four *_3 columns is populated)
int(df_FD[['Ager_3', 'AGE_3', 'FSP_3', 'Comments_3']].notnull().any(axis=1).sum())

2339

In [6]:
# combine aging 2 and aging 3 into one set of data:
# wherever a *_2 column is null, take the value from the matching *_3 column
# (existing *_2 values are kept; the *_3 columns themselves are left untouched)
for keep_col, fill_col in [
    ('Ager_2', 'Ager_3'),
    ('AGE_2', 'AGE_3'),
    ('FSP_2', 'FSP_3'),
    ('Comments_2', 'Comments_3'),
]:
    df_FD[keep_col] = df_FD[keep_col].fillna(df_FD[fill_col])

In [7]:
# number of rows holding any ageing-2 data, re-counted after the *_3 columns were folded in
int(df_FD[['Ager_2', 'AGE_2', 'FSP_2', 'Comments_2']].notnull().any(axis=1).sum())

4022

## FLAG_SITE: ambiguous site data

## FLAG_AM_PM_PERIOD: leave as is

## FLAG_SEX: B and A? -> null

In [45]:
# blank out SEX on every row whose FLAG_SEX is True (rows with a null/False flag are untouched)
df_FD.loc[df_FD['FLAG_SEX'].eq(True), 'SEX'] = np.nan

In [46]:
# list the distinct SEX values that remain after nulling the flagged rows
df_FD['SEX'].unique()

array(['F', 'M', 'U', nan], dtype=object)

## FLAG_MATURITY: 44=4, 0=null

In [50]:
# MATURITY corrections: a recorded 44 becomes 4, and a recorded 0 becomes null
df_FD.loc[df_FD['MATURITY'].eq(44), 'MATURITY'] = 4
df_FD.loc[df_FD['MATURITY'].eq(0), 'MATURITY'] = np.nan

## FLAG_FSP_1: 33=3

In [53]:
# FSP_1 correction: a recorded 33 becomes 3
df_FD.loc[df_FD['FSP_1'].eq(33), 'FSP_1'] = 3

## FLAG_FL_STD: 10x off

## FLAG_FL_WET_FROZEN: 5 typos

## FLAG_WEIGHT_OUTLIER: 

## FLAG_GONAD_OUTLIERS:

## FLAG_MULTIPLE_SAMPLE_POSSIBILITIES and FLAG_MISNUMBERED_FISH_DETAILS: 

## FLAG_LEN_WT_RATIO_OUTLIER: 

#  ==================================
# LENGTH FREQUENCIES
#  ==================================

## FLAG_SITE: ambiguous site

## FLAG_AM_PM_PERIOD: leave as is

#  ==================================
# SAMPLES
#  ==================================

## FLAG_DATETIME: null datetime

## FLAG_HOURS_FISHED: hours_fished = "maximum "

## FLAG_SITE: ambiguous site data

## FLAG_AM_PM_PERIOD_DISCREPANCIES: FD and LF inconsistent
#### SAMPLES, LENGTH FREQUENCIES, and FISH DETAILS all flagged

## FLAG_NO_MATCHING_SAMPLE: no SAMPLE matching LF and/or FD

## MAKE GHOST SAMPLES
### for unmatched fish details and length frequencies
### NOTE:
Some ambiguous length frequencies and fish details match with each other. However, we do not know which fish details and length frequencies actually belong to the same sample. Therefore, we should make sure that none of the ambiguous entries ever match automatically, and that they are instead matched manually (eventually) in dm_apps, once that feature is implemented.

Although there are no ambiguous samples that match with length frequencies or fish details, just for thoroughness, we should revise their ids to avoid this possibility as well.

Therefore, we will arbitrarily add 20 to the month in the ids of ambiguous fish details, and 40 to the month in the ids of ambiguous length frequencies. The shifted ids cannot produce a false-positive match (no real month is greater than 12), and they still uniquely identify their date and site.

In [9]:
## to disambiguate ids, add 20 to all months for FD with id>2024000000, add 40 to all months for LF with id>2024000000
AMBIGUOUS_ID_THRESHOLD = 2024000000  # ids above this value are the ones being disambiguated


def _shift_ambiguous_months(ids, month_offset):
    """Return a copy of `ids` with `month_offset` added to the month digits of ambiguous ids.

    Adding `month_offset * 10000` to an id changes its month digits (20 months == 200000),
    so the month is read back as `(id // 10000) % 100`.

    Only ids above AMBIGUOUS_ID_THRESHOLD whose month digits are still <= 12 are shifted.
    This makes the shift idempotent: re-running the cell, or re-loading pickles saved by
    this notebook, will not shift the same ids a second time.
    NOTE(review): assumes every un-shifted ambiguous id has month digits in 0-12 - confirm.

    Parameters
    ----------
    ids : pandas.Series of numeric ids
    month_offset : int, number of months to add (20 for Fish Details, 40 for Length Frequencies)

    Returns
    -------
    pandas.Series with the same index as `ids`
    """
    month_digits = (ids // 10000) % 100
    needs_shift = (ids > AMBIGUOUS_ID_THRESHOLD) & (month_digits <= 12)
    return ids.where(~needs_shift, ids + month_offset * 10000)


df_FD['id'] = _shift_ambiguous_months(df_FD['id'], 20)
df_LF['id'] = _shift_ambiguous_months(df_LF['id'], 40)

# verified samples are all sample ids before creating ghost samples
verified_samples = set(df_SD.id)

# add null samples where no match exists - Length Frequencies first, then Fish Details.
# Both sources are de-duplicated together so that an unmatched id present in BOTH tables
# yields exactly one ghost sample (the Length Frequencies row wins) instead of two rows
# sharing the same sample id.
ghost_columns = ['id', 'DATETIME', 'SITE1']
ghost_samples = pd.concat([
    df_LF.loc[~df_LF.id.isin(verified_samples), ghost_columns],
    df_FD.loc[~df_FD.id.isin(verified_samples), ghost_columns],
]).drop_duplicates('id')

df_SD = pd.concat([df_SD, ghost_samples]).reset_index(drop=True)

# every sample that was not in the original sample table is a ghost sample
is_ghost_sample = ~df_SD.id.isin(verified_samples)

# add a Flag to indicate ghost samples for import
df_SD.loc[is_ghost_sample, 'FLAG_GHOST_SAMPLE'] = True

# add remarks
df_SD.loc[is_ghost_sample, 'remarks'] = 'GHOST SAMPLE, created to match with unmatched Fish Details and/or Length Frequencies'

## REMERGE JOINED DATA
(previously merged without ghost samples, remerge with ghost data)

In [10]:
# drop previously merged columns (to be re-merged now that ghost samples exist)
df_SD = df_SD.drop(columns=['total_fish_preserved', 'total_fish_measured', 'AM_PM_PERIOD', 'wt_lbs'])

# JOIN with Fish Details table to get total_fish_preserved
# NOTE: this is an estimate, assuming all fish details are accounted for. This is the best information available
# The count is taken per id directly on df_FD (non-null FISH_NO values). Selecting the column
# before aggregating avoids aggregating every column, and skipping the intermediate
# df_SD x df_FD merge means duplicated sample ids cannot inflate the counts.
fish_preserved = (
    df_FD.groupby('id')['FISH_NO']
    .count()
    .rename('total_fish_preserved')
    .reset_index()
)
df_SD = pd.merge(df_SD, fish_preserved, on='id', how='left')

# JOIN with Length Frequencies table to get total_fish_measured
# NOTE: this is an estimate, assuming all length frequencies are accounted for. This is the best information available
# Only `freq` is summed: summing the whole merged frame raises a TypeError on datetime/object
# columns in pandas >= 2.0, where numeric_only defaults to False.
fish_measured = (
    df_LF.groupby('id')['freq']
    .sum()
    .rename('total_fish_measured')
    .reset_index()
)
df_SD = pd.merge(df_SD, fish_measured, on='id', how='left')

# JOIN with Fish Details and Length Frequencies to get AM_PM_PERIOD
# NOTE: discrepancies flagged between Length Frequencies and Fish Details
# The first non-null period per id is taken from each table; the Fish Details value is
# preferred and the Length Frequencies value is only used where Fish Details has none.
df_period = pd.merge(
    df_FD[df_FD.PERIOD.notnull()].groupby('id').first().reset_index()[['id', 'PERIOD']],
    df_LF[df_LF.period.notnull()].groupby('id').first().reset_index()[['id', 'period']],
    on='id',
    how='outer'
)
df_period['AM_PM_PERIOD'] = df_period['PERIOD'].fillna(df_period['period'])

df_SD = pd.merge(
    df_SD,
    df_period[['id', 'AM_PM_PERIOD']],
    on='id',
    how='left'
)

# JOIN with Length Frequency table to get sample weight
# The (id, wt_lbs) pairs are de-duplicated before merging so the join does not fan out to one
# row per length-frequency record; the final drop_duplicates() is kept so the result is
# unchanged.
# NOTE(review): an id with several DISTINCT wt_lbs values still yields several sample rows.
df_SD = pd.merge(
    df_SD,
    df_LF[['id', 'wt_lbs']].drop_duplicates(),
    on='id',
    how='left'  # all samples
).drop_duplicates().reset_index(drop=True)

#  ==================================
# LF Grouped
#  ==================================

## Recreate df_LF_grouped with new ghost ids for LF entries

In [17]:
# group by length bins, include only imported columns
# `freq` is selected explicitly before summing: `.sum('freq')` does not select a column, it
# passes 'freq' positionally as the `numeric_only` argument.
# Result: one row per (sample_id, length_bin_id) with the summed frequency in `count`.
df_LF_grouped = (
    df_LF
    .groupby(['id', 'length_bin_id'], as_index=False)['freq']
    .sum()
    .rename(columns={'freq': 'count', 'id': 'sample_id'})
    [['sample_id', 'length_bin_id', 'count']]
)

#  ==================================
# SAVE DATA
#  ==================================

In [12]:
# reorder columns for clarity - all flags at the end
# sorted() is stable, so sorting on "name contains FLAG" moves the flag columns behind the
# others while keeping the existing relative order inside each group
df_SD = df_SD[sorted(df_SD.columns, key=lambda col: 'FLAG' in str(col))]
df_FD = df_FD[sorted(df_FD.columns, key=lambda col: 'FLAG' in str(col))]
df_LF = df_LF[sorted(df_LF.columns, key=lambda col: 'FLAG' in str(col))]

In [13]:
# save pickle files of dataframes (switched off by default - set the flag to True to write)
save_pickles = False
if save_pickles:
    for frame, pickle_name in [
        (df_SD, 'df_SD.pickle'),
        (df_FD, 'df_FD.pickle'),
        (df_LF, 'df_LF.pickle'),
        (df_LF_grouped, 'df_LF_grouped.pickle'),
        (df_Site, 'df_Site.pickle'),
        (df_TrapSupervisors, 'df_TrapSupervisors.pickle'),
    ]:
        frame.to_pickle(pickle_name)

# save files to this workbook drive (switched off by default)
save_csv = False
if save_csv:
    for frame, csv_name in [
        (df_SD, 'gaspereau_sample_data.csv'),
        (df_FD, 'gaspereau_fish_details.csv'),
        (df_LF, 'gaspereau_length_frequencies.csv'),
        (df_Site, 'gaspereau_sites.csv'),
        (df_TrapSupervisors, 'gaspereau_trap_supervisors.csv'),
        (df_LF_grouped, 'gaspereau_LF_grouped.csv'),
    ]:
        frame.to_csv(csv_name, index=False)

# upload to temp folder for import into Kevin's local dm_apps
# NOTE(review): hard-coded absolute path to one user's machine - consider a configurable location
import_file_location = 'C:\\Users\\CARRK\\Documents\\Repositories\\dm_app_root\\dm_apps\\herring\\temp\\'

# NOTE(review): unlike the CSV export above, df_LF is not written here - presumably only the
# grouped length frequencies are imported; confirm
upload_csv_to_temp_folder = False
if upload_csv_to_temp_folder:
    for frame, csv_name in [
        (df_SD, 'gaspereau_sample_data.csv'),
        (df_FD, 'gaspereau_fish_details.csv'),
        (df_Site, 'gaspereau_sites.csv'),
        (df_TrapSupervisors, 'gaspereau_trap_supervisors.csv'),
        (df_LF_grouped, 'gaspereau_LF_grouped.csv'),
    ]:
        frame.to_csv(import_file_location + csv_name, index=False)