#  ==================================
# IMPORTS
#  ==================================

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

sns.set_theme()

# jupyter notebook full-width display
# NOTE: `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# no text wrapping inside rendered dataframe cells
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>"))

# pandas formatting: 3 decimal places, show every column, up to 200 rows, wide text columns
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', 400)

In [2]:
# load the previously pickled dataframes (samples, fish details, length frequencies, sites, trap supervisors)
# NOTE(review): read_pickle executes pickled code - only load files produced by this project
df_SD, df_FD, df_LF, df_Site, df_TrapSupervisors = map(
    pd.read_pickle,
    [
        'df_SD.pickle',
        'df_FD.pickle',
        'df_LF.pickle',
        'df_Site.pickle',
        'df_TrapSupervisors.pickle',
    ],
)

### Helper Function to UPDATE ID to Match Updated DATETIME and SITE

In [3]:
def create_new_id(row):
    """Rebuild a record id string from the row's year prefix, date and site.

    only run before disambiguating month portion of ids for FD and LF

    The id is the first four characters of ``row.id`` followed by the zero-padded
    month and day of ``row.DATETIME`` and the zero-padded ``row.SITE1``.

    Parameters
    ----------
    row : object with ``id``, ``DATETIME`` and ``SITE1`` attributes
        (e.g. a row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The rebuilt id, e.g. ``'1993052060'``.
    """
    site = row.SITE1
    if pd.isna(site):
        # a missing site would otherwise be rendered as 'nan' / 'None' / '<NA>';
        # use 0 so the site part becomes '00'
        # NOTE(review): '00' matches the ids seen on site-less samples - confirm
        site = 0
    elif isinstance(site, float) and site.is_integer():
        # a float site (17.0) would otherwise be rendered as '17.0'
        site = int(site)

    return (
        str(row.id)[0:4]
        + str(row.DATETIME.month).rjust(2, '0')
        + str(row.DATETIME.day).rjust(2, '0')
        + str(site).rjust(2, '0')
    )

#  ==================================
# FISH DETAILS
#  ==================================

## Combine Age 2 and Age 3 Columns

In [4]:
# for each ageing field, count rows where both the "_2" and "_3" values are filled in
age_pass_pairs = [
    ('Ager_2', 'Ager_3'),
    ('AGE_2', 'AGE_3'),
    ('FSP_2', 'FSP_3'),
    ('Comments_2', 'Comments_3'),
]
tuple(
    len(df_FD[df_FD[second].notnull() & df_FD[third].notnull()])
    for second, third in age_pass_pairs
)

(0, 0, 0, 0)

In [5]:
# number of rows with at least one non-null "_2" ageing field
len(df_FD[df_FD[['Ager_2', 'AGE_2', 'FSP_2', 'Comments_2']].notnull().any(axis=1)])

1683

In [6]:
# number of rows with at least one non-null "_3" ageing field
len(df_FD[df_FD[['Ager_3', 'AGE_3', 'FSP_3', 'Comments_3']].notnull().any(axis=1)])

2339

In [7]:
# fold the "_3" ageing fields into the "_2" fields wherever the "_2" value is missing
for target_col, fallback_col in [
    ('Ager_2', 'Ager_3'),
    ('AGE_2', 'AGE_3'),
    ('FSP_2', 'FSP_3'),
    ('Comments_2', 'Comments_3'),
]:
    df_FD[target_col] = df_FD[target_col].fillna(df_FD[fallback_col])

In [8]:
# number of rows with at least one non-null "_2" ageing field (after combining)
len(df_FD[df_FD[['Ager_2', 'AGE_2', 'FSP_2', 'Comments_2']].notnull().any(axis=1)])

4022

## FLAG_SITE: ambiguous site data

In [9]:
# TODO: if site is changed away from SITE1, update id to match

# df_FD.loc[df_FD.FLAG_SITE==True]

## FLAG_AM_PM_PERIOD: leave as is

## FLAG_SEX: B and A? -> null

In [10]:
# blank out SEX on every row flagged by FLAG_SEX
df_FD.loc[df_FD['FLAG_SEX'].eq(True), 'SEX'] = np.nan

In [11]:
# remaining distinct SEX values
df_FD['SEX'].unique()

array(['F', 'M', 'U', nan], dtype=object)

## FLAG_MATURITY: 44=4, 0=null

In [12]:
# 44 becomes 4
df_FD.loc[df_FD['MATURITY'].eq(44), 'MATURITY'] = 4
# 0 becomes null
df_FD.loc[df_FD['MATURITY'].eq(0), 'MATURITY'] = np.nan

## FLAG_FSP_1: 33=3

In [13]:
# 33 becomes 3
df_FD.loc[df_FD['FSP_1'].eq(33), 'FSP_1'] = 3

## FLAG_FL_STD: 10x off

In [14]:
# CONFIRM
# scale flagged FL_STD values by 10
# NOTE: not idempotent - re-running this cell multiplies the same rows by 10 again
fl_std_flagged = df_FD['FLAG_FL_STD'].eq(True)
df_FD.loc[fl_std_flagged, 'FL_STD'] = df_FD.loc[fl_std_flagged, 'FL_STD'] * 10
df_FD.loc[fl_std_flagged, 'FL_STD']

index
34891   261.000
34892   253.000
34893   248.000
34894   258.000
34895   261.000
          ...  
35244   240.000
35245   250.000
35246   200.000
35247   204.000
35248   241.000
Name: FL_STD, Length: 264, dtype: float64

## FLAG_FL_WET_FROZEN: 5 typos

In [15]:
# FL_STD is not converted here; it will be standardised in dm_apps from the frozen length and frozen condition
wet_frozen_flagged = df_FD['FLAG_FL_WET_FROZEN'].eq(True)
corrected_lengths = [259, 251, 228, 200, 204]  # one value per flagged row, in row order

df_FD.loc[wet_frozen_flagged, ['FL_WET', 'FL_STD']] = np.nan
df_FD.loc[wet_frozen_flagged, 'FL_FROZEN'] = corrected_lengths
df_FD.loc[wet_frozen_flagged, 'fish_length'] = corrected_lengths

# show the corrected rows
df_FD.loc[wet_frozen_flagged, ['CONDITION', 'FL_WET', 'FL_FROZEN', 'FL_STD', 'fish_length']]

Unnamed: 0_level_0,CONDITION,FL_WET,FL_FROZEN,FL_STD,fish_length
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
35048,Frozen,,259,,259
35161,Frozen,,251,,251
35241,Frozen,,228,,228
35246,Frozen,,200,,200
35247,Frozen,,204,,204


## FLAG_WEIGHT_OUTLIER: 

In [16]:
# weights on rows flagged as weight outliers
df_FD.loc[df_FD['FLAG_WEIGHT_OUTLIER'].eq(True), 'WEIGHT']

index
718      519.000
914      539.000
1348     501.000
1800     512.000
5324     503.000
34988   1934.000
Name: WEIGHT, dtype: float64

In [17]:
# CONFIRM
# blank out weights above 1000 (selected by threshold, not by FLAG_WEIGHT_OUTLIER)
df_FD.loc[df_FD['WEIGHT'].gt(1000), 'WEIGHT'] = np.nan
# should now be empty
df_FD.loc[df_FD['WEIGHT'].gt(1000), 'WEIGHT']

Series([], Name: WEIGHT, dtype: float64)

## FLAG_GONAD_OUTLIER:

In [18]:
# CONFIRM
# blank out GONAD_WEIGHT on rows flagged as gonad outliers, then show them
gonad_flagged = df_FD['FLAG_GONAD_OUTLIER'].eq(True)
df_FD.loc[gonad_flagged, 'GONAD_WEIGHT'] = np.nan
df_FD.loc[gonad_flagged, 'GONAD_WEIGHT']

index
5590    NaN
14050   NaN
23051   NaN
23498   NaN
25251   NaN
26652   NaN
32829   NaN
Name: GONAD_WEIGHT, dtype: float64

## FLAG_MULTIPLE_SAMPLE_POSSIBILITIES and FLAG_MISNUMBERED_FISH_DETAILS: 

In [19]:
# df_FD.loc[df_FD.FLAG_MULTIPLE_SAMPLE_POSSIBILITIES==True]

In [20]:
# df_FD.loc[df_FD.FLAG_MISNUMBERED_FISH_DETAILS==True]

## FLAG_LEN_WT_RATIO_OUTLIER: 

In [21]:
# df_FD.loc[df_FD.FLAG_LEN_WT_RATIO_OUTLIER==True, ['FL_STD', 'WEIGHT']]

#  ==================================
# LENGTH FREQUENCIES
#  ==================================

## FLAG_SITE: ambiguous site

In [22]:
# TODO: if site is changed away from SITE1, update id to match

# df_LF.loc[df_LF.FLAG_SITE==True]

## FLAG_AM_PM_PERIOD: leave as is

#  ==================================
# SAMPLES
#  ==================================

## FLAG_DATETIME: null datetime

In [23]:
# TODO: if date is changed, update id to match

# df_SD.loc[df_SD.FLAG_DATETIME==True]

## FLAG_HOURS_FISHED: hours_fished = "maximum "

In [24]:
# CONFIRM
# set hours_fished to 18 on rows flagged by FLAG_HOURS_FISHED, then show them
hours_flagged = df_SD['FLAG_HOURS_FISHED'].eq(True)
df_SD.loc[hours_flagged, 'hours_fished'] = 18
df_SD.loc[hours_flagged, 'hours_fished']

13695    18
13705    18
13715    18
13725    18
13736    18
13740    18
13748    18
13757    18
13777    18
13780    18
13793    18
13810    18
13823    18
13827    18
13840    18
13856    18
13874    18
13886    18
13893    18
13908    18
13921    18
13925    18
13944    18
13958    18
13961    18
13973    18
13992    18
14002    18
Name: hours_fished, dtype: object

## FLAG_SITE: ambiguous site data

In [25]:
# TODO: if site is updated, update id to match

# SITE_NO of samples flagged by FLAG_SITE (these are currently importing as 1A)
df_SD.loc[df_SD['FLAG_SITE'].eq(True), 'SITE_NO']

5982    1A,8
5999    1A,8
6006    1A,8
6025    1A,8
6036    1A,8
6049    1A,8
6064    1A,8
6092    1A,8
6106    1A,8
6112    1A,8
6118    1A,8
6125    1A,8
6130    1A,8
6138    1A,8
6153    1A,8
6162    1A,8
6193    1A,8
6198    1A,8
6208    1A,8
6215    1A,8
6230    1A,8
6251    1A,8
6256    1A,8
Name: SITE_NO, dtype: object

## FLAG_AM_PM_PERIOD_DISCREPANCIES: FD and LF inconsistent
#### SAMPLES, LENGTH FREQUENCIES, and FISH DETAILS all flagged
* The samples will be where the AM PM data will be stored, so they are the only updates needed, outside of ghost samples, which can be updated as information becomes available.

In [26]:
# samples flagged with an AM/PM period discrepancy
df_SD[df_SD['FLAG_AM_PM_PERIOD_DISCREPANCIES'].eq(True)]

Unnamed: 0,DIST,RIVER,NAME,code,GEAR,SITE_NO,no_nets,YEAR,MM,DD,Week,catch_lbs,catch_kg,hours_fished,zone,last_name,comments,bycatch_sbass,bycatch_shad,bycatch_other,DATETIME,SITE1,SITE2,remarks,id,total_fish_preserved,total_fish_measured,AM_PM_PERIOD,wt_lbs,FLAG_DATETIME,FLAG_HOURS_FISHED,FLAG_SITE,FLAG_AM_PM_PERIOD_DISCREPANCIES
5130,2,SWMARG,Daniel Stewart,,81,33,,1993,5,13,3,1100.0,499.0,,upper,STEWART,,,,,1993-05-13,33,,,1993051333,34.0,210.0,AM,115.0,,,,True
8200,2,SWMARG,Pierre Chiasson,,81,26,1.0,2001,6,15,7,500.0,226.8,,lower,Chiasson,,,,,2001-06-15,26,,,2001061526,51.0,283.0,AM,136.0,,,,True
10211,2,SWMARG,Pierre Chiasson,,81,26,1.0,2007,6,2,5,9600.0,4354.5,8.0,lower,Chiasson,,,,,2007-06-02,26,,,2007060226,32.0,290.0,AM,130.0,,,,True
13373,2,SWMARG,Donelda M Gillis,,81,47,1.0,2014,6,10,7,450.0,204.1,7.0,upper,Gillis,,,,,2014-06-10,47,,,2014061047,38.0,246.0,AM,89.0,,,,True


In [27]:
# df_FD.loc[df_FD.FLAG_AM_PM_PERIOD_DISCREPANCIES==True]

In [28]:
# df_LF.loc[df_LF.FLAG_AM_PM_PERIOD_DISCREPANCIES==True]

## FLAG_NO_MATCHING_SAMPLE: no SAMPLE matching LF and/or FD
* these will end up being matched with ghost samples (below)

In [29]:
# df_FD.loc[df_FD.FLAG_NO_MATCHING_SAMPLE==True]

In [30]:
# df_LF.loc[df_LF.FLAG_NO_MATCHING_SAMPLE==True]

## MAKE GHOST SAMPLES
### for unmatched fish details and length frequencies
### NOTE:
Some ambiguous length frequencies and fish details match with each other. However, we do not know which sample and length frequency are from the same sample. Therefore, we should make sure that none of the ambiguous entries are ever matched automatically; they should be matched manually (eventually) in dm_apps, once that feature is implemented.

Although there are no ambiguous samples that match with length frequencies or fish details, just for thoroughness, we should revise the ids to avoid this possibility as well.

Therefore, we will arbitrarily add 20 to months in ids of ambiguous fish details, and 40 to months in ids of ambiguous length frequencies. These numbers have no possibility to match a false positive, and they still uniquely identify their date and site.

In [31]:
# ambiguous records are those with id > 2024000000
# shift the month digits of their ids: +20 for Fish Details (+200000), +40 for Length Frequencies (+400000)

# CONFIRM (period is not highly consistent, probably match, but left as False for now)
match_FD_LF_confirmed_using_period = False

fd_to_shift = df_FD.id > 2024000000
lf_to_shift = df_LF.id > 2024000000

if match_FD_LF_confirmed_using_period:
    # the only ambiguous fish details / length frequencies that appear to match once AM/PM period is considered;
    # these keep their ids so that they can still match each other
    matched_period = [3010051926, 4010051926, 3010052726, 4010052726]  # matched using period
    fd_to_shift = fd_to_shift & ~df_FD.id.isin(matched_period)
    lf_to_shift = lf_to_shift & ~df_LF.id.isin(matched_period)

df_FD.loc[fd_to_shift, 'id'] = df_FD.loc[fd_to_shift, 'id'] + 200000
df_LF.loc[lf_to_shift, 'id'] = df_LF.loc[lf_to_shift, 'id'] + 400000

# every sample id present before ghost samples are added counts as verified
verified_samples = set(df_SD.id)

# columns copied onto a ghost sample
ghost_columns = ['id', 'DATETIME', 'SITE1', 'remarks']

# Fish Details: one ghost sample per id that has no sample
ghosts_from_FD = df_FD[~df_FD.id.isin(df_SD.id)].drop_duplicates('id')[ghost_columns]
df_SD = pd.concat([df_SD, ghosts_from_FD], ignore_index=True)

# give df_LF a remarks column holding the raw site text, so it is carried onto the ghost sample
df_LF['remarks'] = 'Site: ' + df_LF['site'].fillna('None')

# Length Frequencies: one ghost sample per id still unmatched after the Fish Details ghosts were added
ghosts_from_LF = df_LF[~df_LF.id.isin(df_SD.id)].drop_duplicates('id')[ghost_columns]
df_SD = pd.concat([df_SD, ghosts_from_LF], ignore_index=True)

# remove remarks from df_LF, not needed for df_LF
# DataFrame.drop returns a new frame; the result must be assigned back or the column stays in df_LF
df_LF = df_LF.drop(columns='remarks')

# ghost samples are the rows whose id was not among the verified (pre-existing) sample ids
ghost_rows = ~df_SD.id.isin(verified_samples)

# add a Flag to indicate ghost samples for import
df_SD.loc[ghost_rows, 'FLAG_GHOST_SAMPLE'] = True

# add remarks about ghost samples
# NaN + str is NaN, so a ghost sample with no existing remarks would otherwise never get the note;
# those rows receive the note on its own, all others get it appended after '; '
ghost_note = 'GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies'
df_SD.loc[ghost_rows, 'remarks'] = (
    df_SD.loc[ghost_rows, 'remarks'] + '; ' + ghost_note
).fillna(ghost_note)

In [32]:
# ghost samples among the last 10 rows of df_SD
last_rows = df_SD.tail(10)
last_rows.loc[~last_rows.id.isin(verified_samples), ['id', 'SITE1', 'remarks']]

Unnamed: 0,id,SITE1,remarks
15506,2016053100,,Site: None; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15507,2016061400,,Site: None; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15508,2016062100,,Site: None; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15509,1994060417,17.0,Site: 17; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15510,1994051438,38.0,"Site: 38,41; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies"
15511,1994051149,49.0,"Site: 49,33; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies"
15512,2018052900,,Site: None; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15513,2018060800,,Site: None; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15514,3018461364,64.0,Site: 64; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15515,2018062700,,Site: None; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies


In [33]:
# random look at 10 samples whose remarks mention GHOST SAMPLE (null remarks count as no match)
mentions_ghost = df_SD.remarks.str.contains('GHOST SAMPLE', na=False)
df_SD[mentions_ghost].sample(10)[['id', 'SITE1', 'remarks']]

Unnamed: 0,id,SITE1,remarks
15271,1993052417,17,SITE_notes: 17; AGE_notes_1: 3; FSP_notes_1: 3; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15322,1995061041,41,SITE_notes: 41; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15430,2021052025,25,SITE_notes: 25; Ager_2: LF; AGE_notes_2: 3; FSP_notes_2: 3; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15405,2018051515,15,SITE_notes: 15; Ager_1: JM; Ager_3: Jmac; Comments_1: NICE; AGE_notes_1: 4; AGE_notes_3: 3; FSP_notes_3: 3; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15442,2021060392,92,SITE_notes: Eric Mac; Ager_2: LF; AGE_notes_2: 3; FSP_notes_2: 3; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15293,1994052733,33,"SITE_notes: 33,41,60; AGE_notes_1: 4; FSP_notes_1: 4; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies"
15353,3000260926,26,SITE_notes: 26; FSP_notes_1: 0; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15502,4010451926,26,Site: 26; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15486,3000460926,26,Site: 26; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15454,2021061741,41,SITE_notes: 41; Ager_2: LF; AGE_notes_2: 5; FSP_notes_2: 3; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies


## REMERGE JOINED DATA
(previously merged without ghost samples, remerge with ghost data)

In [34]:
# drop previously merged columns (to be re-merged)
df_SD = df_SD.drop(['total_fish_preserved', 'total_fish_measured', 'AM_PM_PERIOD', 'wt_lbs'], axis=1)

# JOIN with Fish Details table to get total_fish_preserved
# NOTE: this is an estimate, assuming all fish details are accounted for. This is the best information available
# (FISH_NO is selected before counting so only that column is aggregated)
df_SD = pd.merge(
    df_SD,
    pd.merge(df_SD, df_FD, on='id').groupby('id')['FISH_NO'].count().reset_index(),
    on='id',
    how='left'
).rename({'FISH_NO':'total_fish_preserved'}, axis=1)

# JOIN with Length Frequencies table to get total_fish_measured
# NOTE: this is an estimate, assuming all length frequencies are accounted for. This is the best information available
# (freq is selected before summing: summing every merged column fails on datetime/text columns
#  in pandas >= 2.0, where non-numeric columns are no longer silently dropped)
df_SD = pd.merge(
    df_SD,
    pd.merge(df_SD, df_LF, on='id').groupby('id')['freq'].sum().reset_index(),
    on='id',
    how='left'
).rename({'freq':'total_fish_measured'}, axis=1)

# JOIN with Fish Details and Length Frequencies to get AM_PM_PERIOD
# NOTE: discrepancies flagged between Length Frequencies and Fish Details
# the first non-null period per id is taken from each table; the Fish Details value wins when both exist
df_period = pd.merge(
    df_FD[df_FD.PERIOD.notnull()].groupby('id').first().reset_index()[['id', 'PERIOD']],
    df_LF[df_LF.period.notnull()].groupby('id').first().reset_index()[['id', 'period']],
    on='id',
    how='outer'
)
df_period['AM_PM_PERIOD'] = df_period['PERIOD'].fillna(df_period['period'])

df_SD = pd.merge(
    df_SD,
    df_period[['id', 'AM_PM_PERIOD']],
    on='id',
    how='left'
)

# JOIN with Length Frequency table to get sample weight
# df_LF has many rows per id, so the (id, wt_lbs) pairs are de-duplicated before merging
# rather than merging every row and de-duplicating afterwards; the result is the same
df_SD = pd.merge(
    df_SD,
    df_LF[['id', 'wt_lbs']].drop_duplicates(),
    on='id',
    how='left'  # all samples
).drop_duplicates().reset_index(drop=True)

# Fix Datetime Format

In [35]:
# ghost samples can bring time-of-day values into DATETIME from the other dataframes
# keep only the date here; AM / PM is held in the period column
df_SD['DATETIME'] = df_SD['DATETIME'].dt.date.pipe(pd.to_datetime)

#  ==================================
# LF Grouped
#  ==================================

## Recreate df_LF_grouped with new ambiguous ids for LF entries (+40 to month if ambiguous)

In [36]:
# group by sample id and length bin, include only imported columns
# NOTE: the previous `.sum('freq')` passed 'freq' as the `numeric_only` argument, not as a column
# selection; selecting the column explicitly gives the same totals without relying on that accident
df_LF_grouped = (
    df_LF
    .groupby(['id', 'length_bin_id'])['freq']
    .sum()
    .reset_index()
    .rename({'freq': 'count', 'id': 'sample_id'}, axis=1)
    [['sample_id', 'length_bin_id', 'count']]
)

In [37]:
# smallest month digits (characters 5-6 of the id) among ambiguous entries, which carry the +40 shift
min(
    int(str(sample_id)[4:6])
    for sample_id in df_LF_grouped.loc[df_LF_grouped.sample_id > 2024000000, 'sample_id'].unique()
)

45

In [38]:
# largest month digits (characters 5-6 of the id) among regular entries
max(
    int(str(sample_id)[4:6])
    for sample_id in df_LF_grouped.loc[df_LF_grouped.sample_id < 2024000000, 'sample_id'].unique()
)

6

#  ==================================
# COMMENTS
#  ==================================

CONVERT E.MACFARLANE, Eric Mac, Eric MacFarlane, J McFarlane to John Eric MacFarlane.

JA Coady keep as is

Multiple sites, keep as is. 

I see 118 samples from 1989 that are associated with ‘blank’. Can you convert ‘blank’ to Unknown and upload.


In [39]:
# ids of fish details with no SITE recorded
list(df_FD.loc[df_FD['SITE'].isna(), 'id'].unique())
# Yes, these are imported as ghost samples

[1989060100, 1989060200, 1989060700]

In [40]:
# ids of fish details whose SITE is neither purely numeric (nulls count as 0) nor '1A' / '1B'
fd_site_is_plain = df_FD['SITE'].fillna(0).astype(str).str.isnumeric() | df_FD['SITE'].isin(['1A', '1B'])
FD_sites_id = df_FD[~fd_site_is_plain].id.unique()

In [41]:
# the troublesome sites: first non-null date / site values for each affected id
df_FD[df_FD.id.isin(FD_sites_id)].groupby('id')[['DATETIME', 'SITE', 'SITE1']].first()

Unnamed: 0_level_0,DATETIME,SITE,SITE1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1993052060,1993-05-20,"60, 52",60
1993052301,1993-05-23,"1, 8",1
1993052437,1993-05-24,373835,37
1993052660,1993-05-26,6052,60
1993052701,1993-05-27,18,1
1993060305,1993-06-03,58,5
1994051012,1994-05-10,1217,12
1994051115,1994-05-11,1517,15
1994051133,1994-05-11,334849,33
1994051411,1994-05-14,1112,11


In [67]:
# same check for Length Frequencies: site neither purely numeric (nulls count as 0) nor '1A' / '1B'
lf_site_is_plain = df_LF['site'].fillna(0).astype(str).str.isnumeric() | df_LF['site'].isin(['1A', '1B'])
LF_sites_id = df_LF[~lf_site_is_plain].id.unique()
# first non-null date / site values for each affected id
df_LF[df_LF.id.isin(LF_sites_id)].groupby('id')[['DATETIME', 'site', 'SITE1']].first()

Unnamed: 0_level_0,DATETIME,site,SITE1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1991060129,1991-06-01,2930,29
1993051117,1993-05-11,1733,17
1993052301,1993-05-23,18,1
1993052435,1993-05-24,353738,35
1993052701,1993-05-27,18,1
1993060305,1993-06-03,58,5
1993060352,1993-06-03,5260,52
1994051012,1994-05-10,1217,12
1994051115,1994-05-11,1517,15
1994051149,1994-05-11,4933,49


In [68]:
# note: 93 and 92 are both 'John Eric MacFarlane' - same for FD and LF
# append the named sites to the site table (the import script needs the matching update)
# NOTE: not idempotent - re-running this cell appends the same two rows again
named_sites = pd.DataFrame({'site': ['John Eric MacFarlane', 'JA Coady']})
df_Site = pd.concat([df_Site, named_sites], ignore_index=True)

### let's make a list of all slightly dubious matches (inexact sites)

In [70]:
# ids of samples whose SITE_NO is neither purely numeric (nulls count as 0) nor '1A' / '1B'
sd_site_is_plain = df_SD['SITE_NO'].fillna(0).astype(str).str.isnumeric() | df_SD['SITE_NO'].isin(['1A', '1B'])
SD_sites_id = df_SD[~sd_site_is_plain].id.unique()

In [79]:
# sorted union of the inexact-site ids from all three tables
dubious_sites_id = sorted(set().union(LF_sites_id, FD_sites_id, SD_sites_id))
len(dubious_sites_id)

105

In [80]:
# NOTES
# - the only ambiguous site for samples is 1A,8
# - half of these are ghost samples
df_SD.loc[df_SD['id'].isin(dubious_sites_id)]

Unnamed: 0,DIST,RIVER,NAME,code,GEAR,SITE_NO,no_nets,YEAR,MM,DD,Week,catch_lbs,catch_kg,hours_fished,zone,last_name,comments,bycatch_sbass,bycatch_shad,bycatch_other,DATETIME,SITE1,SITE2,remarks,id,total_fish_preserved,total_fish_measured,AM_PM_PERIOD,wt_lbs,FLAG_DATETIME,FLAG_HOURS_FISHED,FLAG_SITE,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_GHOST_SAMPLE
5250,2.0,SWMARG,Anthony Cameron,,81.0,5,,1993.0,6.0,3.0,6.0,1200.0,544.3,,lower,Cameron,,,,,1993-06-03,5,,,1993060305,28.0,231.0,AM,117.0,,,,,
5256,2.0,SWMARG,Catherine MacFarlane,,81.0,52,,1993.0,6.0,3.0,6.0,600.0,272.2,7.5,upper,MacFarlane,,,,,1993-06-03,52,,,1993060352,,220.0,PM,119.0,,,,,
5263,2.0,SWMARG,Norman MacNeil,,81.0,60,,1993.0,5.0,20.0,4.0,30.0,13.6,,upper,MacNeil,,,,,1993-05-20,60,,,1993052060,30.0,259.0,AM,132.0,,,,,
5326,2.0,SWMARG,Marilyn Gillis,,81.0,1,,1993.0,5.0,23.0,5.0,3000.0,1360.8,,lower,Gillis,,,,,1993-05-23,1,,,1993052301,28.0,213.0,PM,109.0,,,,,
5347,2.0,SWMARG,Florence Gillis,,81.0,37,,1993.0,5.0,24.0,5.0,450.0,204.1,7.5,upper,Gillis,,,,,1993-05-24,37,,,1993052437,30.0,,AM,,,,,,
5405,2.0,SWMARG,Norman MacNeil,,81.0,60,,1993.0,5.0,26.0,5.0,100.0,45.4,,upper,MacNeil,,,,,1993-05-26,60,,,1993052660,27.0,252.0,PM,127.0,,,,,
5406,2.0,SWMARG,Marilyn Gillis,,81.0,1,,1993.0,5.0,27.0,5.0,8000.0,3628.7,,lower,Gillis,,,,,1993-05-27,1,,,1993052701,25.0,239.0,PM,116.0,,,,,
5602,2.0,SWMARG,Martin E Cameron,,81.0,12,1.0,1994.0,5.0,10.0,2.0,600.0,272.2,14.0,lower,Cameron,,,,,1994-05-10,12,,,1994051012,25.0,172.0,PM,96.0,,,,,
5651,2.0,SWMARG,Darlene Cameron,,81.0,11,1.0,1994.0,5.0,14.0,2.0,300.0,136.1,12.0,lower,Cameron,,,,,1994-05-14,11,,,1994051411,29.0,,AM,,,,,,
5671,2.0,SWMARG,Anthony Cameron,,81.0,5,1.0,1994.0,5.0,16.0,3.0,900.0,408.2,16.0,lower,Cameron,,,,,1994-05-16,5,,,1994051605,30.0,,AM,,,,,,


In [None]:
# check LF and FD
# maybe merge them all to see if the inconsistencies are dubious or obvious







#  ==================================
# SAVE DATA
#  ==================================

In [42]:
# reorder columns for clarity - all flags at the end
# sorted() is stable, so columns keep their relative order inside the non-FLAG and FLAG groups
df_SD = df_SD[sorted(df_SD.columns, key=lambda col: 'FLAG' in str(col))]
df_FD = df_FD[sorted(df_FD.columns, key=lambda col: 'FLAG' in str(col))]
df_LF = df_LF[sorted(df_LF.columns, key=lambda col: 'FLAG' in str(col))]

In [43]:
# save pickle files of dataframes (switched off by default)
save_pickles = False
if save_pickles:
    for pickle_name, frame in [
        ('df_SD_cleaned.pickle', df_SD),
        ('df_FD_cleaned.pickle', df_FD),
        ('df_LF_cleaned.pickle', df_LF),
        ('df_LF_grouped_cleaned.pickle', df_LF_grouped),
        ('df_Site_cleaned.pickle', df_Site),
        ('df_TrapSupervisors_cleaned.pickle', df_TrapSupervisors),
    ]:
        frame.to_pickle(pickle_name)

# save csv files to this workbook drive (switched off by default)
save_csv = False
if save_csv:
    for csv_name, frame in [
        ('gaspereau_sample_data_cleaned.csv', df_SD),
        ('gaspereau_fish_details_cleaned.csv', df_FD),
        ('gaspereau_length_frequencies_cleaned.csv', df_LF),
        ('gaspereau_sites_cleaned.csv', df_Site),
        ('gaspereau_trap_supervisors_cleaned.csv', df_TrapSupervisors),
        ('gaspereau_LF_grouped_cleaned.csv', df_LF_grouped),
    ]:
        frame.to_csv(csv_name, index=False)

# upload to temp folder for import into Kevin's local dm_apps
# NOTE(review): absolute, machine-specific path - adjust before running on another machine
import_file_location = 'C:\\Users\\CARRK\\Documents\\Repositories\\dm_app_root\\dm_apps\\herring\\temp\\'

# switched off by default; only the grouped length frequencies are uploaded, not df_LF itself
upload_csv_to_temp_folder = False
if upload_csv_to_temp_folder:
    for csv_name, frame in [
        ('gaspereau_sample_data.csv', df_SD),
        ('gaspereau_fish_details.csv', df_FD),
        ('gaspereau_sites.csv', df_Site),
        ('gaspereau_trap_supervisors.csv', df_TrapSupervisors),
        ('gaspereau_LF_grouped.csv', df_LF_grouped),
    ]:
        frame.to_csv(import_file_location + csv_name, index=False)

# Check Import Issues

In [44]:
# sample ids to check against df_SD
checkthese = [
    1993052010, 1993053010, 2016053100, 2016061400, 2016062100,
    2017053092, 2017060692, 2018052900, 2018060800, 2018062700,
    2019060794, 2019060793, 2019061493, 2019062594, 1989060100,
    1989060200, 1989060700, 2011060414, 2016053195, 2016061495,
    2016062195, 2018052992, 2018062792, 2018060892, 2021052092,
    2021060192, 2021060392, 2021061592, 2021062392,
]

# these were rejected then recreated: (rows of df_SD with one of these ids, number of ids listed)
(len(df_SD[df_SD['id'].isin(checkthese)]), len(checkthese))

(29, 29)

In [45]:
# why were they rejected? look at the distinct SITE1 values of those samples
df_SD.loc[df_SD['id'].isin(checkthese), 'SITE1'].unique()

array([<NA>, 14, 95, 92, 93, 94, '10'], dtype=object)

In [46]:
# none of those sites appear in the site table
df_Site['site'].unique()

array([1, '1A', '1B', 2, 4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 18, 19, 20, 21,
       23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 38, 39, 40,
       41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59,
       60, 61, 62, 63, 64, 65, 66, 67, 68], dtype=object)

In [47]:
# SOLUTION: if a site is unmatched, make it None, don't reject it
# didn't happen, but similar for trap_supervisor not matching FK table

# After Import Checks

In [48]:
# ids flagged as misnumbered fish details, ids of samples flagged for datetime, and both lists joined
rejected_FD = list(df_FD.loc[df_FD['FLAG_MISNUMBERED_FISH_DETAILS'].eq(True), 'id'].unique())
rejected_SD = list(df_SD.loc[df_SD['FLAG_DATETIME'].eq(True), 'id'].unique())
rejected_id = [*rejected_FD, *rejected_SD]

In [49]:
# -> CORRECT
# how many total samples were imported? (rows with no FLAG_DATETIME set)
len(df_SD[df_SD['FLAG_DATETIME'].isna()])

15485

In [50]:
# imported samples that have LF, FD, or both
imported_rows = df_SD.FLAG_DATETIME.isnull()
with_LF = df_SD.id.isin(df_LF.id.unique())
with_FD = df_SD.id.isin(df_FD.id.unique())
(
    df_SD[imported_rows & with_LF].shape[0],
    df_SD[imported_rows & with_FD].shape[0],
    df_SD[imported_rows & with_LF & with_FD].shape[0],
)
# -> 916, 1149, 803
# yes, meh, yes

(916, 1156, 803)

In [51]:
# -> CORRECT
# why are we missing 7 FD? were they rejected?
# -> YES, exactly 7 were rejected (invalid kwargs)

In [52]:
# imported samples with neither LF nor FD attached
has_LF_or_FD = df_SD.id.isin(df_LF.id.unique()) | df_SD.id.isin(df_FD.id.unique())
df_SD[~has_LF_or_FD & df_SD.FLAG_DATETIME.isnull()].shape[0]
# -> 14223
# close, but how are there more in dm_apps?

14216

In [53]:
# maybe the rejected FD were still imported
# all of these samples were imported as ghost samples
rejected_FD = [
    2993252737, 2993252952, 2993260933, 2998250105,
    3010252925, 3010250541, 3010261141,
]
lacks_LF = ~df_SD.id.isin(df_LF.id.unique())
lacks_FD_or_rejected = ~df_SD.id.isin(df_FD.id.unique()) | df_SD.id.isin(rejected_FD)
df_SD[lacks_LF & lacks_FD_or_rejected & df_SD.FLAG_DATETIME.isnull()].shape[0]
# -> CORRECT (yuck code, but it matches)

14223

In [54]:
# -> CORRECT
# how many ambiguous samples?  6
# do these match with FD or LF?  No. (correct)
# imported, non-ghost samples with id > 2024000000; DATETIME is repeated as the first column for readability
ambiguous_real = (
    df_SD.FLAG_GHOST_SAMPLE.ne(True)
    & (df_SD.id > 2024000000)
    & df_SD.FLAG_DATETIME.isnull()
)
df_SD[ambiguous_real][['DATETIME'] + list(df_SD.columns)]

Unnamed: 0,DATETIME,DIST,RIVER,NAME,code,GEAR,SITE_NO,no_nets,YEAR,MM,DD,Week,catch_lbs,catch_kg,hours_fished,zone,last_name,comments,bycatch_sbass,bycatch_shad,bycatch_other,DATETIME.1,SITE1,SITE2,remarks,id,total_fish_preserved,total_fish_measured,AM_PM_PERIOD,wt_lbs,FLAG_DATETIME,FLAG_HOURS_FISHED,FLAG_SITE,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_GHOST_SAMPLE
2452,1988-05-23,2.0,SWMARG,Michael D Gillis,,81.0,48,,1988,5,23,,7015.0,3182.0,13.0,upper,Gillis,,,,,1988-05-23,48,,,2988052348,,,,,,,,,
2535,1988-05-23,2.0,SWMARG,Michael D Gillis,,81.0,48,,1988,5,23,,4008.0,1818.0,13.0,upper,Gillis,,,,,1988-05-23,48,,,3988052348,,,,,,,,,
6514,1997-06-12,2.0,SWMARG,Michael D Gillis,,81.0,58,1.0,1997,6,12,7.0,900.0,408.2,,upper,Gillis,,,,,1997-06-12,58,,,2997061258,,,,,,,,,
6515,1997-06-12,2.0,SWMARG,Michael D Gillis,,81.0,58,1.0,1997,6,12,7.0,200.0,90.7,,upper,Gillis,,,,,1997-06-12,58,,,3997061258,,,,,,,,,
9176,2004-06-10,2.0,SWMARG,Charles McDaniel,,81.0,1,1.0,2004,6,10,7.0,1150.0,521.6,15.0,lower,McDaniel,,,,,2004-06-10,1,,,3004061001,,,,,,,,,
9177,2004-06-10,2.0,SWMARG,Charles McDaniel,,81.0,1,1.0,2004,6,10,7.0,1125.0,510.3,6.5,lower,McDaniel,,,,,2004-06-10,1,,,4004061001,,,,,,,,,


In [55]:
# -> CORRECT
# how many ghost samples? with LF? with FD? with both? (rejected ids excluded from the FD counts)
ghost_flagged = df_SD.FLAG_GHOST_SAMPLE == True
ghost_has_LF = df_SD.id.isin(df_LF.id.unique())
ghost_has_FD = df_SD.id.isin(df_FD.id.unique())
not_rejected = ~df_SD.id.isin(rejected_id)
(
    df_SD[ghost_flagged].shape[0],
    df_SD[ghost_flagged & ghost_has_LF].shape[0],
    df_SD[ghost_flagged & ghost_has_FD & not_rejected].shape[0],
    df_SD[ghost_flagged & ghost_has_LF & ghost_has_FD & not_rejected].shape[0],
)
# -> 303, 185, 241, 130

(303, 185, 241, 130)