#  ==================================
# IMPORTS
#  ==================================

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

sns.set_theme()

# jupyter notebook full-width display
# `display` and `HTML` come from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# no text wrapping inside rendered dataframe cells
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>"))

# pandas formatting: 3-decimal floats, show every column, up to 200 rows,
# and up to 200 characters per cell
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', 200)

In [2]:
# load the intermediate dataframes from their pickle files (same order as the names on the left)
df_SD, df_FD, df_LF, df_Site, df_TrapSupervisors = [
    pd.read_pickle(pickle_path)
    for pickle_path in (
        r'.\intermediate_data_and_calcs\df_SD.pickle',
        r'.\intermediate_data_and_calcs\df_FD.pickle',
        r'.\intermediate_data_and_calcs\df_LF.pickle',
        r'.\intermediate_data_and_calcs\df_Site.pickle',
        r'.\intermediate_data_and_calcs\df_TrapSupervisors.pickle',
    )
]

### Helper Function to UPDATE ID to Match Updated DATETIME and SITE

In [3]:
def create_new_id(row):
    """Rebuild an id string from a row's current id, DATETIME and SITE1.

    Only run before disambiguating the month portion of ids for FD and LF.

    The result is the first four characters of ``row.id`` followed by the
    month, the day and ``SITE1``, each left-padded with '0' to at least two
    characters (longer values are kept whole).
    """
    year_part = str(row.id)[0:4]
    month_part = str(row.DATETIME.month).rjust(2, '0')
    day_part = str(row.DATETIME.day).rjust(2, '0')
    site_part = str(row.SITE1).rjust(2, '0')
    return ''.join((year_part, month_part, day_part, site_part))

#  ==================================
# FISH DETAILS
#  ==================================

## Combine Age 2 and Age 3 Columns

In [4]:
# check if age2 and age3 occur at the same time:
# one count per field pair of the rows where both versions are filled in
tuple(
    df_FD[df_FD[col_2].notnull() & df_FD[col_3].notnull()].shape[0]
    for col_2, col_3 in (
        ('Ager_2', 'Ager_3'),
        ('AGE_2', 'AGE_3'),
        ('FSP_2', 'FSP_3'),
        ('Comments_2', 'Comments_3'),
    )
)

(0, 0, 0, 0)

In [5]:
# how many rows have any Ager 2 information (ager, age, FSP or comments)
int(df_FD[['Ager_2', 'AGE_2', 'FSP_2', 'Comments_2']].notnull().any(axis=1).sum())

1683

In [6]:
# how many rows have any Ager 3 information (ager, age, FSP or comments)
int(df_FD[['Ager_3', 'AGE_3', 'FSP_3', 'Comments_3']].notnull().any(axis=1).sum())

2339

In [7]:
# combine aging 2 and aging 3 into one set of data:
# wherever the "_2" column is empty, take the value from the matching "_3" column
for ager_2_col, ager_3_col in (
    ('Ager_2', 'Ager_3'),
    ('AGE_2', 'AGE_3'),
    ('FSP_2', 'FSP_3'),
    ('Comments_2', 'Comments_3'),
):
    df_FD[ager_2_col] = df_FD[ager_2_col].fillna(df_FD[ager_3_col])

In [8]:
# how many rows have any Ager 2 information now that Ager 3 has been folded in
int(df_FD[['Ager_2', 'AGE_2', 'FSP_2', 'Comments_2']].notnull().any(axis=1).sum())

4022

## FLAG_SITE: ambiguous site data

In [9]:
# TODO: if site is changed away from SITE1, update id to match

# df_FD.loc[df_FD.FLAG_SITE==True]

## FLAG_AM_PM_PERIOD: leave as is

## FLAG_SEX: B and A? -> null

In [10]:
# rows flagged FLAG_SEX get a null SEX
df_FD.loc[df_FD['FLAG_SEX'].eq(True), 'SEX'] = np.nan

In [11]:
# distinct SEX values remaining after nulling the flagged rows
df_FD['SEX'].unique()

array(['F', 'M', 'U', nan], dtype=object)

## FLAG_MATURITY: 44=4, 0=null

In [12]:
# MATURITY corrections: 44 -> 4, 0 -> null
df_FD.loc[df_FD['MATURITY'].eq(44), 'MATURITY'] = 4
df_FD.loc[df_FD['MATURITY'].eq(0), 'MATURITY'] = np.nan

## FLAG_FSP_1: 33=3

In [13]:
# FSP_1 correction: 33 -> 3
df_FD.loc[df_FD['FSP_1'].eq(33), 'FSP_1'] = 3

## FLAG_FL_STD: 10x off

In [14]:
# CONFIRM
# flagged FL_STD values are multiplied by 10 (not idempotent: run this cell only once)
is_fl_std_flagged = df_FD['FLAG_FL_STD'].eq(True)
df_FD.loc[is_fl_std_flagged, 'FL_STD'] = df_FD.loc[is_fl_std_flagged, 'FL_STD'] * 10
df_FD.loc[is_fl_std_flagged, 'FL_STD']

34923   261.000
34924   218.000
34925   260.000
34926   283.000
34927   219.000
          ...  
35504   227.000
35505   236.000
35506   208.000
35507   228.000
35508   202.000
Name: FL_STD, Length: 264, dtype: float64

## FLAG_FL_WET_FROZEN: 5 typos

In [15]:
# STD doesn't need to be converted here, it will be standardised in dm_apps from the frozen length and frozen condition
is_wet_frozen_flagged = df_FD['FLAG_FL_WET_FROZEN'].eq(True)
# corrected lengths, assigned in row order to the flagged rows (one value per flagged row)
corrected_frozen_lengths = [259, 251, 228, 200, 204]

df_FD.loc[is_wet_frozen_flagged, ['FL_WET', 'FL_STD']] = np.nan
df_FD.loc[is_wet_frozen_flagged, 'FL_FROZEN'] = corrected_frozen_lengths
df_FD.loc[is_wet_frozen_flagged, 'fish_length'] = corrected_frozen_lengths
df_FD.loc[is_wet_frozen_flagged, ['CONDITION', 'FL_WET', 'FL_FROZEN', 'FL_STD', 'fish_length']]

Unnamed: 0,CONDITION,FL_WET,FL_FROZEN,FL_STD,fish_length
35249,Frozen,,259,,259
35392,Frozen,,251,,251
35478,Frozen,,228,,228
35479,Frozen,,200,,200
35507,Frozen,,204,,204


## FLAG_WEIGHT_OUTLIER: 

In [16]:
# weights of the rows flagged as weight outliers
df_FD.loc[df_FD['FLAG_WEIGHT_OUTLIER'].eq(True), 'WEIGHT']

724      519.000
735      539.000
956      501.000
1777     512.000
5477     503.000
35212   1934.000
Name: WEIGHT, dtype: float64

In [17]:
# CONFIRM
# every WEIGHT above 1000 is replaced with 193.4, then the flagged weights are shown again
is_over_1000 = df_FD['WEIGHT'].gt(1000)
df_FD.loc[is_over_1000, 'WEIGHT'] = 193.4
df_FD.loc[df_FD['FLAG_WEIGHT_OUTLIER'].eq(True), 'WEIGHT']

724     519.000
735     539.000
956     501.000
1777    512.000
5477    503.000
35212   193.400
Name: WEIGHT, dtype: float64

## FLAG_GONAD_OUTLIER:

In [18]:
# CONFIRM
# flagged gonad weights are set to null
is_gonad_outlier = df_FD['FLAG_GONAD_OUTLIER'].eq(True)
df_FD.loc[is_gonad_outlier, 'GONAD_WEIGHT'] = np.nan
df_FD.loc[is_gonad_outlier, 'GONAD_WEIGHT']

5790    NaN
13647   NaN
23059   NaN
23295   NaN
24542   NaN
26462   NaN
33026   NaN
Name: GONAD_WEIGHT, dtype: float64

## FLAG_MULTIPLE_SAMPLE_POSSIBILITIES and FLAG_MISNUMBERED_FISH_DETAILS: 

In [19]:
# df_FD.loc[df_FD.FLAG_MULTIPLE_SAMPLE_POSSIBILITIES==True]

In [20]:
# df_FD.loc[df_FD.FLAG_MISNUMBERED_FISH_DETAILS==True]

## FLAG_LEN_WT_RATIO_OUTLIER: 

In [21]:
# df_FD.loc[df_FD.FLAG_LEN_WT_RATIO_OUTLIER==True, ['FL_STD', 'WEIGHT']]

#  ==================================
# LENGTH FREQUENCIES
#  ==================================

## FLAG_SITE: ambiguous site
left as is, imported linked to ghost sample, sites noted for dealing with later ('remarks' attribute)

## FLAG_AM_PM_PERIOD
leave as is

#  ==================================
# SAMPLES
#  ==================================

## FLAG_DATETIME: null datetime
Site 48, 2019 – did not fish
Site 51, 2017 – did not fish
Okay, for the samples flagged here, I suspect the ones entered from 2006 to present all represent years where that trap was not fished at all during the season. These can all be removed.

The information from site 37 in 1988 should be kept as it does appear that the trap was fishing that year. 


In [22]:
# These samples will not get imported: they don't have dates - they will be in the reject samples csv output
df_SD.loc[df_SD['FLAG_DATETIME'].eq(True)]

Unnamed: 0,DIST,RIVER,NAME,code,GEAR,SITE_NO,no_nets,YEAR,MM,DD,Week,catch_lbs,catch_kg,hours_fished,zone,last_name,comments,bycatch_sbass,bycatch_shad,bycatch_other,DATETIME,SITE1,SITE2,remarks,id,total_fish_preserved,total_fish_measured,AM_PM_PERIOD,wt_lbs,FLAG_DATETIME,FLAG_HOURS_FISHED,FLAG_SITE,FLAG_AM_PM_PERIOD_DISCREPANCIES
15182,2,SWMARG,Darlene Cameron,,81,11,1.0,2006,,,,0.0,0.0,,lower,,,,,,NaT,11,,,2006000011,,,,,True,,,
15183,2,SWMARG,John A Chisholm,,81,17,1.0,2006,,,,0.0,0.0,,lower,,,,,,NaT,17,,,2006000017,,,,,True,,,
15184,2,SWMARG,Daniel Stewart,,81,33,1.0,2006,,,,0.0,0.0,,upper,,,,,,NaT,33,,,2006000033,,,,,True,,,
15185,2,SWMARG,Michael D Gillis,,81,48,1.0,2006,,,,0.0,0.0,,,,,,,,NaT,48,,,2006000048,,,,,True,,,
15186,2,SWMARG,Kevin MacKinnon,,81,64,1.0,2006,,,,0.0,0.0,,,MacKinnon,,,,,NaT,64,,,2006000064,,,,,True,,,
15187,2,SWMARG,James MacFarlane,,81,65,1.0,2006,,,,0.0,0.0,,upper,MacFarlane,,,,,NaT,65,,,2006000065,,,,,True,,,
15188,2,SWMARG,Charles McDaniel,,81,1,1.0,2010,,,,0.0,0.0,,lower,McDaniel,,,,,NaT,1,,,2010000001,,,,,True,,,
15189,2,SWMARG,Michael D Gillis,,81,48,1.0,2010,,,,0.0,0.0,,upper,Gillis,,,,,NaT,48,,,2010000048,,,,,True,,,
15190,2,SWMARG,Vincent MacKinnon,,81,62,1.0,2010,,,,0.0,0.0,,upper,MacKinnon,,,,,NaT,62,,,2010000062,,,,,True,,,
15191,2,SWMARG,Joan Ingram,,81,34,1.0,2011,,,,0.0,0.0,0.0,upper,,,,,,NaT,34,,,2011000034,,,,,True,,,


## FLAG_HOURS_FISHED: hours_fished = "maximum "

Is it possible to import this data with MAX listed under hours fished? 

A: Not without losing functionality. It's not the cleanest option, but if we import as 99, it should be obvious that it is not an actual numeric value, and we could potentially even format it as 'MAX'. It should also be easy to search and update in the future.

In [23]:
# CONFIRM
# flagged hours_fished entries are stored as the sentinel 99
is_hours_flagged = df_SD['FLAG_HOURS_FISHED'].eq(True)
df_SD.loc[is_hours_flagged, 'hours_fished'] = 99
df_SD.loc[is_hours_flagged, 'hours_fished']

13667    99
13674    99
13682    99
13691    99
13702    99
13713    99
13724    99
13736    99
13749    99
13759    99
13771    99
13782    99
13794    99
13807    99
13820    99
13832    99
13844    99
13857    99
13870    99
13881    99
13893    99
13905    99
13918    99
13931    99
13943    99
13955    99
13966    99
13976    99
Name: hours_fished, dtype: object

## FLAG_SITE: ambiguous site data
Please leave as is for now. Will try to correct when hard copy is found.

A: These will be imported linked only to site 1A, as there is no functionality currently to link multiple sites to a single sample. However, it will have a flag for ambiguity and site notes showing both sites 1A and 8.

These 23 samples appear to be the only samples requiring many-to-many site-to-sample database functionality, so many-to-many functionality has not been implemented. If this assumption is incorrect, this functionality can be added. This task will not be trivial if we decide to implement it.

In [24]:
# TODO: if site is updated, update id to match

# these are currently importing as 1A
is_site_flagged = df_SD['FLAG_SITE'].eq(True)
df_SD.loc[is_site_flagged, 'SITE_NO']

6019    1A,8
6028    1A,8
6037    1A,8
6046    1A,8
6055    1A,8
6064    1A,8
6073    1A,8
6081    1A,8
6089    1A,8
6097    1A,8
6106    1A,8
6115    1A,8
6124    1A,8
6132    1A,8
6141    1A,8
6150    1A,8
6158    1A,8
6167    1A,8
6183    1A,8
6192    1A,8
6201    1A,8
6209    1A,8
6217    1A,8
Name: SITE_NO, dtype: object

## FLAG_AM_PM_PERIOD_DISCREPANCIES: FD and LF inconsistent
#### SAMPLES, LENGTH FREQUENCIES, and FISH DETAILS all flagged
* The samples will be where the AM PM data will be stored, so they are the only updates needed, outside of ghost samples, which can be updated as information becomes available.

#### Please NULL the AM/PM field for those.
this will solve this issue

In [25]:
# samples flagged for AM/PM period discrepancies
df_SD[df_SD['FLAG_AM_PM_PERIOD_DISCREPANCIES'].eq(True)]

Unnamed: 0,DIST,RIVER,NAME,code,GEAR,SITE_NO,no_nets,YEAR,MM,DD,Week,catch_lbs,catch_kg,hours_fished,zone,last_name,comments,bycatch_sbass,bycatch_shad,bycatch_other,DATETIME,SITE1,SITE2,remarks,id,total_fish_preserved,total_fish_measured,AM_PM_PERIOD,wt_lbs,FLAG_DATETIME,FLAG_HOURS_FISHED,FLAG_SITE,FLAG_AM_PM_PERIOD_DISCREPANCIES
5108,2,SWMARG,Daniel Stewart,,81,33,,1993,5,13,3,1100.0,499.0,,upper,STEWART,,,,,1993-05-13,33,,,1993051333,34.0,210.0,AM,115.0,,,,True
8224,2,SWMARG,Pierre Chiasson,,81,26,1.0,2001,6,15,7,500.0,226.8,,lower,Chiasson,,,,,2001-06-15,26,,,2001061526,51.0,283.0,AM,136.0,,,,True
10432,2,SWMARG,Pierre Chiasson,,81,26,1.0,2007,6,2,5,9600.0,4354.5,8.0,lower,Chiasson,,,,,2007-06-02,26,,,2007060226,32.0,290.0,AM,130.0,,,,True
13498,2,SWMARG,Donelda M Gillis,,81,47,1.0,2014,6,10,7,450.0,204.1,7.0,upper,Gillis,,,,,2014-06-10,47,,,2014061047,38.0,246.0,AM,89.0,,,,True


In [26]:
# null the PERIOD of fish details flagged for AM/PM discrepancies, then preview the result
fd_period_flagged = df_FD['FLAG_AM_PM_PERIOD_DISCREPANCIES'].eq(True)
df_FD.loc[fd_period_flagged, 'PERIOD'] = np.nan
df_FD.loc[fd_period_flagged, 'PERIOD'].head()

10194    NaN
10195    NaN
10196    NaN
10197    NaN
10198    NaN
Name: PERIOD, dtype: object

In [27]:
# null the period of length frequencies flagged for AM/PM discrepancies, then preview the result
lf_period_flagged = df_LF['FLAG_AM_PM_PERIOD_DISCREPANCIES'].eq(True)
df_LF.loc[lf_period_flagged, 'period'] = np.nan
df_LF.loc[lf_period_flagged, 'period'].head()

2012    NaN
2013    NaN
2014    NaN
2015    NaN
2016    NaN
Name: period, dtype: object

## FLAG_NO_MATCHING_SAMPLE: no SAMPLE matching LF and/or FD
* these will end up being matched with ghost samples (below)

In [28]:
# df_FD.loc[df_FD.FLAG_NO_MATCHING_SAMPLE==True]

In [29]:
# df_LF.loc[df_LF.FLAG_NO_MATCHING_SAMPLE==True]

## MAKE GHOST SAMPLES
### for unmatched fish details and length frequencies
### NOTE:
Some ambiguous length frequencies and fish details match with each other. However, we do not know which sample and length frequency are from the same sample. Therefore, we should make sure none of the ambiguous entries ever match automatically, and that they are matched manually (eventually) in dm_apps, once that feature is implemented.

Although there are no ambiguous samples that match with length frequencies or fish details, just for thoroughness, we should revise the id to avoid this possibility as well. 

Therefore, we will arbitrarily add 20 to months in ids of ambiguous fish details, and 40 to months in ids of ambiguous length frequencies. These numbers have no possibility to match a false positive, and they still uniquely identify their date and site.

In [30]:
# to disambiguate ids, add 20 to all months for FD with id>2024000000, add 40 to all months for LF with id>2024000000
# (the month is read from characters 4-5 of the id, so +200000 / +400000 shifts it by 20 / 40)

# these were matched using period, and corroborated using site notes and visual inspection
# 2998050105 excluded because of AM PM discrepancy
matched_period = [
    3000060726, 3000060926, 3001051605, 3009052026, 3009052226, 3010051126, 3010051926, 3010052726, 
    4000060726, 4000060926, 4001051605, 4009052026, 4009052226, 4010051126, 4010051926, 4010052726
]

# NOTE: these increments are not idempotent - run this cell once per fresh kernel
df_FD.loc[(df_FD.id>2024000000) & (~df_FD.id.isin(matched_period)), 'id'] += 200000
df_LF.loc[(df_LF.id>2024000000) & (~df_LF.id.isin(matched_period)), 'id'] += 400000

# dubious matches - CONFIRM
disambiguate_dubious_id_matches = False  ### CONFIRM ###
# in general these matches look good, but they have matches like 1 ~ 1,8 ~ 1,8 (not exact matches, but the first entry always matches)
# either we can import them matched and include notes, or disambiguate and match manually later
df_ambiguous_match_notes = pd.read_pickle(r'.\intermediate_data_and_calcs\df_ambiguous_match_notes.pickle')
if disambiguate_dubious_id_matches:
    df_FD.loc[df_FD.id.isin(df_ambiguous_match_notes.id), 'id'] += 200000
    df_LF.loc[df_LF.id.isin(df_ambiguous_match_notes.id), 'id'] += 400000

# verified samples are all sample ids before creating ghost samples
verified_samples = set(df_SD.id)

# temporarily add remarks to df_LF in order to note site
df_LF['remarks'] = 'Site: ' + df_LF['site'].fillna('None')

# JOIN FD and LF without an SD match, and combine remarks, date, and site
# (_x columns come from df_LF, _y columns come from df_FD)
temp = pd.merge(
    df_LF[~df_LF.id.isin(df_SD.id)].drop_duplicates('id')[['id', 'DATETIME', 'SITE1', 'remarks']], 
    df_FD[~df_FD.id.isin(df_SD.id)].drop_duplicates('id')[['id', 'DATETIME', 'SITE1', 'remarks']],
    on='id',
    how='outer'
).reset_index(drop=True)
# FIX: the second operand of each mask must test remarks_y (it previously tested remarks_x twice,
# which left a dangling '; ' whenever only the LF remark existed)
temp.loc[temp.remarks_x.isnull() | temp.remarks_y.isnull(), 'remarks'] = temp['remarks_x'].fillna('') + temp['remarks_y'].fillna('')
temp.loc[temp.remarks_x.notnull() & temp.remarks_y.notnull(), 'remarks'] = temp['remarks_x'].fillna('') + '; ' + temp['remarks_y'].fillna('') # semicolon between if both exist
temp['DATETIME'] = temp['DATETIME_x'].fillna(temp['DATETIME_y'])
temp['SITE1'] = temp['SITE1_x'].fillna(temp['SITE1_y'])
temp = temp[['id', 'DATETIME', 'SITE1', 'remarks']]

# concatenate to the bottom of the df_SD dataframe
df_SD = pd.concat([
    df_SD, 
    temp
]).reset_index(drop=True)

# remove remarks from df_LF, not needed for df_LF
# FIX: DataFrame.drop returns a new frame, so the result has to be assigned back
df_LF = df_LF.drop('remarks', axis=1)

# add a Flag to indicate ghost samples for import
df_SD.loc[~df_SD.id.isin(verified_samples), 'FLAG_GHOST_SAMPLE'] = True

# add remarks about ghost samples
df_SD.loc[~df_SD.id.isin(verified_samples), 'remarks'] += '; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies'

# either way, add notes to samples with dubious matches
# FIX: iterate once per id (not once per row) so an id shared by several df_SD rows
# cannot have its note appended more than once
for current_id in df_SD.loc[df_SD.id.isin(df_ambiguous_match_notes.id), 'id'].unique():
    is_current = df_SD.id==current_id
    site_note = df_ambiguous_match_notes.loc[df_ambiguous_match_notes.id==current_id, 'SITE_AMBIGUITIES'].values[0]
    if df_SD.loc[is_current, 'remarks'].isnull().any():
        df_SD.loc[is_current, 'remarks'] = site_note
    else:
        df_SD.loc[is_current, 'remarks'] += '; ' + site_note
    df_SD.loc[is_current, 'FLAG_AMBIGUOUS_MATCH'] = True

In [31]:
# remarks NA
period_discrepancy_ids = [1993051333, 2001061526, 2007060226, 2014061047]
df_SD.loc[df_SD['id'].isin(period_discrepancy_ids)]

Unnamed: 0,DIST,RIVER,NAME,code,GEAR,SITE_NO,no_nets,YEAR,MM,DD,Week,catch_lbs,catch_kg,hours_fished,zone,last_name,comments,bycatch_sbass,bycatch_shad,bycatch_other,DATETIME,SITE1,SITE2,remarks,id,total_fish_preserved,total_fish_measured,AM_PM_PERIOD,wt_lbs,FLAG_DATETIME,FLAG_HOURS_FISHED,FLAG_SITE,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_GHOST_SAMPLE,FLAG_AMBIGUOUS_MATCH
5108,2.0,SWMARG,Daniel Stewart,,81.0,33,,1993,5,13,3,1100.0,499.0,,upper,STEWART,,,,,1993-05-13,33,,,1993051333,34.0,210.0,AM,115.0,,,,True,,
8224,2.0,SWMARG,Pierre Chiasson,,81.0,26,1.0,2001,6,15,7,500.0,226.8,,lower,Chiasson,,,,,2001-06-15,26,,,2001061526,51.0,283.0,AM,136.0,,,,True,,
10432,2.0,SWMARG,Pierre Chiasson,,81.0,26,1.0,2007,6,2,5,9600.0,4354.5,8.0,lower,Chiasson,,,,,2007-06-02,26,,,2007060226,32.0,290.0,AM,130.0,,,,True,,
13498,2.0,SWMARG,Donelda M Gillis,,81.0,47,1.0,2014,6,10,7,450.0,204.1,7.0,upper,Gillis,,,,,2014-06-10,47,,,2014061047,38.0,246.0,AM,89.0,,,,True,,


In [32]:
# which of the four period-discrepancy sample ids also appear in the ambiguous-match notes
period_discrepancy_ids = [1993051333, 2001061526, 2007060226, 2014061047]
[note_id for note_id in df_ambiguous_match_notes.id if note_id in period_discrepancy_ids]
# why are they ambiguous but don't have site ambiguity flags?

[]

## REMERGE JOINED DATA
(previously merged without ghost samples, remerge with ghost data)

In [33]:
# drop previously merged columns (to be re-merged)
df_SD = df_SD.drop(['total_fish_preserved', 'total_fish_measured', 'AM_PM_PERIOD', 'wt_lbs'], axis=1)

# NOTE: each aggregation below selects its column BEFORE aggregating. Aggregating every
# merged column first is wasteful and can raise on non-numeric columns in newer pandas.

# JOIN with Fish Details table to get total_fish_preserved
# NOTE: this is an estimate, assuming all fish details are accounted for. This is the best information available
fish_preserved = (
    pd.merge(df_SD[['id']], df_FD[['id', 'FISH_NO']], on='id')
    .groupby('id')['FISH_NO']
    .count()
    .reset_index()
)
df_SD = pd.merge(
    df_SD, 
    fish_preserved,
    on='id',
    how='left'
).rename({'FISH_NO':'total_fish_preserved'}, axis=1)

# JOIN with Length Frequencies table to get total_fish_measured
# NOTE: this is an estimate, assuming all length frequencies are accounted for. This is the best information available
fish_measured = (
    pd.merge(df_SD[['id']], df_LF[['id', 'freq']], on='id')
    .groupby('id')['freq']
    .sum()
    .reset_index()
)
df_SD = pd.merge(
    df_SD, 
    fish_measured,
    on='id',
    how='left'
).rename({'freq':'total_fish_measured'}, axis=1)

# JOIN with Fish Details and Length Frequencies to get AM_PM_PERIOD 
# NOTE: discrepancies flagged between Length Frequencies and Fish Details
# (first non-null period per id from each table; the Fish Details value wins when both exist)
df_period = pd.merge(
    df_FD[df_FD.PERIOD.notnull()].groupby('id')['PERIOD'].first().reset_index(),
    df_LF[df_LF.period.notnull()].groupby('id')['period'].first().reset_index(), 
    on='id',
    how='outer'
)
df_period['AM_PM_PERIOD'] = df_period['PERIOD'].fillna(df_period['period'])

df_SD = pd.merge(
    df_SD, 
    df_period[['id', 'AM_PM_PERIOD']],
    on='id',
    how='left'
)

# JOIN with Length Frequency table to get sample weight
# NOTE(review): an id with several distinct wt_lbs values in df_LF would yield several sample rows here - confirm
df_SD = pd.merge(
    df_SD, 
    df_LF[['id', 'wt_lbs']],
    on='id', 
    how='left'  # all samples 
).drop_duplicates().reset_index(drop=True)

# Fix Datetime Format

In [34]:
# some times were merged into DATETIME from the other dataframes via ghost samples
# reduce to the calendar date only (AM / PM is kept separately in AM_PM_PERIOD)
sample_dates = df_SD['DATETIME'].dt.date
df_SD['DATETIME'] = pd.to_datetime(sample_dates)

#  ==================================
# LF Grouped
#  ==================================

## Recreate df_LF_grouped with new ambiguous ids for LF entries (+40 to month if ambiguous)

In [35]:
# group by length bins, include only imported columns
# FIX: GroupBy.sum() takes `numeric_only` as its first positional argument, not a column name,
# so the column to total is now selected explicitly before summing
df_LF_grouped = (
    df_LF
    .groupby(['id', 'length_bin_id'], as_index=False)['freq']
    .sum()
    .rename({'freq': 'count', 'id': 'sample_id'}, axis=1)
    [['sample_id', 'length_bin_id', 'count']]
)

In [36]:
# ambiguous entries, month bit (+40 to disambiguate)
ambiguous_sample_ids = df_LF_grouped.loc[df_LF_grouped['sample_id'] > 2024000000, 'sample_id'].unique()
min(int(str(sample_id)[4:6]) for sample_id in ambiguous_sample_ids)

5

In [37]:
# regular entries, month bit
regular_sample_ids = df_LF_grouped.loc[df_LF_grouped['sample_id'] < 2024000000, 'sample_id'].unique()
max(int(str(sample_id)[4:6]) for sample_id in regular_sample_ids)

6

#  ==================================
# MORE COMMENTS
#  ==================================

CONVERT E.MACFARLANE, Eric Mac, Eric MacFarlane, J McFarlane to John Eric MacFarlane.

JA Coady keep as is

Multiple sites, keep as is. 

I see 118 samples from 1989 that are associated with ‘blank’. Can you convert ‘blank’ to Unknown and upload.

#### based on the comment: 93 and 92 are both 'John Eric MacFarlane' - should be added to sites table

In [38]:
# append these and update the import script
extra_sites = pd.DataFrame({'site': ['John Eric MacFarlane', 'JA Coady']})
df_Site = pd.concat([df_Site, extra_sites], axis=0).reset_index(drop=True)

# Flag all Samples with ambiguous data/FD/LF etc

In [39]:
# every id that is ambiguous for any reason: disambiguated ids (>2024000000) in any table,
# fish details with multiple sample possibilities, AM/PM discrepancies, or ambiguous sites
all_ambiguous_id = set().union(
    df_SD[df_SD.id > 2024000000].id.unique(),
    df_LF[df_LF.id > 2024000000].id.unique(),
    df_FD[df_FD.id > 2024000000].id.unique(),
    df_FD[df_FD.FLAG_MULTIPLE_SAMPLE_POSSIBILITIES == True].id.unique(),
    df_SD[df_SD.FLAG_AM_PM_PERIOD_DISCREPANCIES == True].id.unique(),
    df_LF[df_LF.FLAG_AM_PM_PERIOD_DISCREPANCIES == True].id.unique(),
    df_FD[df_FD.FLAG_AM_PM_PERIOD_DISCREPANCIES == True].id.unique(),
    df_SD[df_SD.FLAG_SITE == True].id.unique(),
)

# df_SD[df_SD.FLAG_SITE==True]  # already noted properly
# df_SD[df_SD.id.isin(list(df_FD[df_FD.FLAG_MULTIPLE_SAMPLE_POSSIBILITIES==True].id.unique()))]  # already noted properly
# df_SD[df_SD.id.isin(list(df_LF[df_LF.id>2024000000].id.unique()) + list(df_FD[df_FD.id>2024000000].id.unique()))]  # already noted properly

# sample ids above the 2024000000 threshold, in ascending order
flag_multiple = sorted(df_SD[df_SD.id > 2024000000].id.unique())

# ids with AM/PM discrepancies from all three tables, in ascending order (repeats are kept)
flag_period = sorted([
    *df_SD[df_SD.FLAG_AM_PM_PERIOD_DISCREPANCIES == True].id.unique(),
    *df_LF[df_LF.FLAG_AM_PM_PERIOD_DISCREPANCIES == True].id.unique(),
    *df_FD[df_FD.FLAG_AM_PM_PERIOD_DISCREPANCIES == True].id.unique(),
])

# flag all ambiguous samples
df_SD.loc[df_SD['id'].isin(all_ambiguous_id), 'FLAG_AMBIGUOUS_SAMPLE'] = True

In [40]:
# all null, just set the remarks for this data
period_note = 'AMBIGUITY: AM PM Period discrepancies between matched Fish Details and Length Frequencies (period set to null)'
df_SD.loc[df_SD['id'].isin(flag_period), 'remarks'] = period_note
# df_SD.loc[df_SD.id.isin(flag_period), 'remarks']

In [41]:
# set remarks where null, append remarks where not null
# FIX: iterate once per flagged id (not once per row) so an id shared by several df_SD rows
# cannot have the note appended more than once
multiple_note = 'AMBIGUITY: Multiple potential matching Samples (with same date and site location)'
for current_id in df_SD.loc[df_SD.id.isin(flag_multiple), 'id'].unique():
    is_current = df_SD.id==current_id
    if df_SD.loc[is_current, 'remarks'].isnull().any():
        df_SD.loc[is_current, 'remarks'] = multiple_note
    else:
        df_SD.loc[is_current, 'remarks'] += '; ' + multiple_note

# df_SD[df_SD.id.isin(flag_multiple)]

In [42]:
# now all ambiguous samples contain note regarding the ambiguity
ambiguous_remarks = df_SD.loc[df_SD['FLAG_AMBIGUOUS_SAMPLE'].eq(True), 'remarks']
ambiguous_remarks.str.contains('AMBIGUIT').all()

True

#  ==================================
# SAVE DATA
#  ==================================

In [43]:
# reorder columns for clarity - all flags at the end
def _flags_last(frame):
    """Return `frame` with non-FLAG columns first and FLAG columns last (relative order kept)."""
    regular_cols = [col for col in frame.columns if 'FLAG' not in str(col)]
    flag_cols = [col for col in frame.columns if 'FLAG' in str(col)]
    return frame[regular_cols + flag_cols]

df_SD = _flags_last(df_SD)
df_FD = _flags_last(df_FD)
df_LF = _flags_last(df_LF)

In [56]:
# save pickle files of dataframes (written to the current working directory)
save_pickles = True
if save_pickles:
    for frame, pickle_name in (
        (df_SD, 'df_SD_cleaned.pickle'),
        (df_FD, 'df_FD_cleaned.pickle'),
        (df_LF, 'df_LF_cleaned.pickle'),
        (df_LF_grouped, 'df_LF_grouped_cleaned.pickle'),
        (df_Site, 'df_Site_cleaned.pickle'),
        (df_TrapSupervisors, 'df_TrapSupervisors_cleaned.pickle'),
    ):
        frame.to_pickle(pickle_name)

# save files to this workbook drive
save_csv = False
if save_csv:
    for frame, csv_name in (
        (df_SD, 'gaspereau_sample_data_cleaned.csv'),
        (df_FD, 'gaspereau_fish_details_cleaned.csv'),
        (df_LF, 'gaspereau_length_frequencies_cleaned.csv'),
        (df_Site, 'gaspereau_sites_cleaned.csv'),
        (df_TrapSupervisors, 'gaspereau_trap_supervisors_cleaned.csv'),
        (df_LF_grouped, 'gaspereau_LF_grouped_cleaned.csv'),
    ):
        frame.to_csv(csv_name, index=False)

# upload to temp folder for import into Kevin's local dm_apps
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere
import_file_location = 'C:\\Users\\CARRK\\Documents\\Repositories\\dm_app_root\\dm_apps\\herring\\temp\\'

upload_csv_to_temp_folder = True
if upload_csv_to_temp_folder:
    # the ungrouped df_LF is not uploaded here; only its grouped form is
    for frame, csv_name in (
        (df_SD, 'gaspereau_sample_data.csv'),
        (df_FD, 'gaspereau_fish_details.csv'),
        (df_Site, 'gaspereau_sites.csv'),
        (df_TrapSupervisors, 'gaspereau_trap_supervisors.csv'),
        (df_LF_grouped, 'gaspereau_LF_grouped.csv'),
    ):
        frame.to_csv(import_file_location + csv_name, index=False)

# After Import Checks

In [78]:
# ids expected to be rejected on import: misnumbered fish details and samples without a date
rejected_FD = list(df_FD.loc[df_FD['FLAG_MISNUMBERED_FISH_DETAILS'].eq(True), 'id'].unique())
rejected_SD = list(df_SD.loc[df_SD['FLAG_DATETIME'].eq(True), 'id'].unique())
rejected_id = [*rejected_FD, *rejected_SD]

# pd.DataFrame([rejected_FD, rejected_SD], index=['FD', 'SD']).T.astype('Int64')

In [46]:
# -> 15461 + 7 deleted ghost samples
# how many total samples were imported?
# (samples whose FLAG_DATETIME is null, i.e. not flagged for a missing date)
len(df_SD[df_SD['FLAG_DATETIME'].isnull()])

15468

In [47]:
# what about LF, FD, both?
not_date_flagged = df_SD['FLAG_DATETIME'].isnull()
sample_in_LF = df_SD['id'].isin(df_LF.id.unique())
sample_in_FD = df_SD['id'].isin(df_FD.id.unique())
(
    df_SD[not_date_flagged & sample_in_LF].shape[0],
    df_SD[not_date_flagged & sample_in_FD].shape[0],
    df_SD[not_date_flagged & sample_in_LF & sample_in_FD].shape[0],
)
# -> 916, 1149 + 7 deleted ghost samples, 820
# yes, meh, yes

(916, 1156, 820)

In [48]:
# why are we missing 7 FD? were they rejected?
# -> YES, exactly 7 were rejected (invalid kwargs)

In [49]:
# without any FD or LF?
lacks_LF = ~df_SD['id'].isin(df_LF.id.unique())
lacks_FD = ~df_SD['id'].isin(df_FD.id.unique())
df_SD[lacks_LF & lacks_FD & df_SD['FLAG_DATETIME'].isnull()].shape[0]

# -> 14216 CORRECT

14216

In [50]:
# -> CORRECT
# how many ghost sites? with LF? FD? both?
is_ghost = df_SD['FLAG_GHOST_SAMPLE'].eq(True)
ghost_in_LF = df_SD['id'].isin(df_LF.id.unique())
ghost_in_FD = df_SD['id'].isin(df_FD.id.unique())
not_rejected = ~df_SD['id'].isin(rejected_id)
(
    df_SD[is_ghost].shape[0],
    df_SD[is_ghost & ghost_in_LF].shape[0],
    df_SD[is_ghost & ghost_in_FD & not_rejected].shape[0],
    df_SD[is_ghost & ghost_in_LF & ghost_in_FD & not_rejected].shape[0],
)
# -> 279 + 7 rejected, 184, 241, 146

(286, 184, 241, 146)

In [61]:
# how many ambiguous samples? 
# flag datetime would get rejected, ghost samples overwrite ambiguity status
is_ambiguous = df_SD['FLAG_AMBIGUOUS_SAMPLE'].eq(True)
not_ghost = df_SD['FLAG_GHOST_SAMPLE'].ne(True)
has_datetime = df_SD['FLAG_DATETIME'].ne(True)
df_SD[is_ambiguous & not_ghost & has_datetime].shape[0]

# -> 33

33

In [63]:
# how many have fish details or length frequencies?
is_ambiguous = df_SD['FLAG_AMBIGUOUS_SAMPLE'].eq(True)
not_ghost = df_SD['FLAG_GHOST_SAMPLE'].ne(True)
has_datetime = df_SD['FLAG_DATETIME'].ne(True)
has_FD_or_LF = df_SD['id'].isin(df_FD.id) | df_SD['id'].isin(df_LF.id)
df_SD[is_ambiguous & not_ghost & has_datetime & has_FD_or_LF].shape[0]

# -> 4

4