In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Notebook-wide presentation settings: seaborn theme, page/table CSS and
# pandas display options.  Nothing here touches the data itself.
sns.set_theme()

# `display` is imported from its public location: importing it from
# IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML

# jupyter notebook full-width display
display(HTML("<style>.container { width:100% !important; }</style>"))

# no text wrapping
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>"))

# pandas formatting
pd.set_option('display.float_format', '{:.3f}'.format)  # floats rendered with 3 decimals
pd.set_option('display.max_columns', None)               # None = never hide columns
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', 400)

# OBSOLETE

# ======================
# FORMATTING FOR IMPORT
# ======================

* ALLCAPS: denotes columns that aren't perfect matches
* lowercase: denotes columns that should match perfectly with herring tables in dm_apps

### LENGTH FREQUENCY 
### FINAL TABLE FOR IMPORT INTO DM_APPS

In [98]:
# Length-frequency import table: total 'freq' per (sample id, length bin),
# exposed as the columns count / length_bin_id / sample_id.
#
# 'freq' is selected *before* aggregating.  GroupBy.sum() has no column
# argument - its first positional parameter is `numeric_only` - so the earlier
# `.sum('freq')` summed every numeric column of df_LF and only afterwards
# discarded all of them except 'freq'.
gaspereau_lengthfrequency = (
    df_LF
    .groupby(['id', 'length_bin_id'])['freq']
    .sum()
    .reset_index()
    .loc[:, ['freq', 'length_bin_id', 'id']]
    .rename(columns={'freq': 'count', 'id': 'sample_id'})
)

### FISH DETAILS
### FINAL TABLE FOR IMPORT INTO DM_APPS

In [101]:
# Column renames for the fish-detail import table.
# Notebook convention: lowercase targets match a dm_apps field, ALL CAPS
# targets are fields without a match.
rename_FD = {
    'FISH_NO': 'fish_number',
    'FL_STD': 'fish_length',
    'WEIGHT': 'fish_weight',
    'MATURITY': 'maturity_id',
    'GONAD_WEIGHT': 'gonad_weight',
    'Envelop.Comments': 'ENVELOP_COMMENTS',
    'Ager_1': 'AGER_1',
    'Comments_1': 'COMMENTS_1',
    'Ager_2': 'AGER_2',
    'Comments_2': 'COMMENTS_2',
    'Ager_3': 'AGER_3',
    'Comments_3': 'COMMENTS_3',
}

# Keep only the columns that go into the import file (in this order), rename
# them, then order the rows by sample and fish.
gaspereau_fishdetails = (
    df_FD
    .loc[:, [
        'sample_id', 'DATETIME', 'SITE', 'PERIOD', 'CONDITION', 'FISH_NO',
        'FL_WET', 'FL_FROZEN', 'FL_STD', 'WEIGHT', 'SPECIES', 'SEX', 'MATURITY', 'GONAD_WEIGHT',
        'Ager_1', 'AGE_1', 'FSP_1', 'Comments_1',
        'Ager_2', 'AGE_2', 'FSP_2', 'Comments_2',
        'Ager_3', 'AGE_3', 'FSP_3', 'Comments_3',
        'Envelop.Comments', 'remarks',
    ]]
    .rename(rename_FD, axis='columns')
    # sort in numerical order - sample_id (date), then fish_number
    # NOTE(review): this is only a numeric ordering if FISH_NO holds numbers,
    # not strings - confirm the dtype of df_FD.FISH_NO.
    .sort_values(by=['sample_id', 'fish_number'])
    .reset_index(drop=True)
)

### SAMPLE DATA
### FINAL TABLE FOR IMPORT INTO DM_APPS

In [107]:
# Column renames for the sample import table.
# ALL CAPS targets are fields without a dm_apps match; lowercase targets match.
# Columns of df_SD that are not listed here keep their current names.
rename_SD = {
    # rename id to be sample_id to be consistent with fish detail and length frequencies
    'id': 'sample_id',
    'DATETIME': 'sample_date',
    'DIST': 'district_id',
    'NAME': 'SAMPLER',
    'code': 'CODE',
    'GEAR': 'gear_id',
    'SITE1': 'SITE',
    'no_nets': 'NO_NETS',
    'catch_lbs': 'catch_weight_lbs',
    'hours_fished': 'HOURS_FISHED',
    'zone': 'ZONE',
    'last_name': 'LAST_NAME',
    'comments': 'COMMENTS',
    'bycatch_sbass': 'BYCATCH_SBASS',
    'bycatch_shad': 'BYCATCH_SHAD',
    'bycatch_other': 'BYCATCH_OTHER',
    # NOTE(review): 'wt_lbs' is not among the df_SD columns shown by
    # df_SD.head() in this notebook; rename() skips missing labels silently,
    # so this entry may be a no-op - confirm.
    'wt_lbs': 'sample_weight_lbs',
}

gaspereau_sample = df_SD.rename(rename_SD, axis='columns')

# SAVE TABLES

In [108]:
# TODO: revise to output questionable and verified data

# Safety switch: stays False so that re-running the notebook never overwrites
# existing exports.  Change to True to overwrite/resave the files below.
write_new_files = False

# Folder receiving the csv files for the django import.  The default is the
# original, machine-specific location; set the GASPEREAU_IMPORT_DIR environment
# variable to export elsewhere without editing the notebook.
import_file_location = os.environ.get(
    'GASPEREAU_IMPORT_DIR',
    'C:\\Users\\CARRK\\Documents\\Repositories\\dm_app_root\\dm_apps\\herring\\temp\\'
)

if write_new_files:
    # Collected inside the guard so the tables are only looked up when an
    # export is actually requested.  A missing table now stops the export
    # before any file is written rather than part-way through.
    # NOTE(review): gaspereau_TrapSupervisors is not created in the cells shown
    # in this part of the notebook - confirm it exists before exporting.
    export_tables = {
        'gaspereau_lengthfrequency': gaspereau_lengthfrequency,
        'gaspereau_fishdetails': gaspereau_fishdetails,
        'gaspereau_sample': gaspereau_sample,
        'gaspereau_TrapSupervisors': gaspereau_TrapSupervisors,
    }

    # save pickle files (relative names, i.e. the current working directory)
    for table_name, table in export_tables.items():
        table.to_pickle(table_name + '.pickle')

    # save csv for import into django (the DataFrame index is written as the
    # first, unnamed column - pandas' default)
    for table_name, table in export_tables.items():
        table.to_csv(os.path.join(import_file_location, table_name + '.csv'))

In [115]:
# Preview the first five sample rows.
# NOTE(review): the rendered output includes sampler names (NAME / last_name) -
# confirm that is acceptable before sharing the notebook with outputs.
df_SD.head(n=5)

Unnamed: 0,DIST,RIVER,NAME,code,GEAR,SITE_NO,no_nets,YEAR,MM,DD,Week,catch_lbs,catch_kg,hours_fished,zone,last_name,comments,bycatch_sbass,bycatch_shad,bycatch_other,FLAG_HOURS_FISHED,FLAG_SITE,SITE1,SITE2,DATETIME,FLAG_DATETIME,remarks,id,total_fish_preserved,total_fish_measured
0,2,SWMARG,Darlene Cameron,,81,11,,1983,5,17,,18038.0,8181.9,14,lower,Cameron,,,,,,,11,,1983-05-17,,,1983051711,,
1,2,SWMARG,Martin E Cameron,,81,12,,1983,5,16,,18038.0,8181.9,15,lower,Cameron,,,,,,,12,,1983-05-16,,,1983051612,100.0,
2,2,SWMARG,Martin E Cameron,,81,12,,1983,5,11,,17036.0,7727.4,14,lower,Cameron,,,,,,,12,,1983-05-11,,,1983051112,,
3,2,SWMARG,Martin E Cameron,,81,12,,1983,5,18,,15032.0,6818.4,15,lower,Cameron,,,,,,,12,,1983-05-18,,,1983051812,,
4,2,SWMARG,Martin E Cameron,,81,12,,1983,5,17,,13027.0,5908.9,15,lower,Cameron,,,,,,,12,,1983-05-17,,,1983051712,50.0,


In [116]:
# Preview the first five fish-detail rows.
# NOTE(review): the saved output already lists id_2 / id_3, which are only
# created in a cell further down - the outputs were not produced by a
# top-to-bottom run.
df_FD.head(n=5)

Unnamed: 0,DATETIME,YEAR,MM,DD,WEEK,SITE,PERIOD,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3,FLAG_SITE,SITE_notes,SITE1,SITE2,SITE3,FLAG_AM_PM_PERIOD,FLAG_SEX,FLAG_MATURITY,FLAG_FSP_1,FLAG_FL_STD,FLAG_FL_WET_FROZEN,FLAG_WEIGHT_OUTLIER,FLAG_GONAD_OUTLIER,FLAG_LEN_WT_RATIO_OUTLIER,remarks,sample_id,id_2,id_3
0,1983-05-12,1983,5,12,,12,,Fresh,1,288,,288.0,363.0,A,F,2,,,5,3,,,,,,,,,,,5,3,,,,,,12,12,,,,,,,,,,,,SITE_notes: 12; AGE_notes_1: 5; FSP_notes_1: 3,1983051212,,
1,1983-05-12,1983,5,12,,12,,Fresh,10,251,,251.0,227.0,A,F,2,,,3,3,,,,,,,,,,,3,3,,,,,,12,12,,,,,,,,,,,,SITE_notes: 12; AGE_notes_1: 3; FSP_notes_1: 3,1983051212,,
2,1983-05-12,1983,5,12,,12,,Fresh,11,247,,247.0,214.0,A,M,2,,,3,3,,,,,,,,,,,3,3,,,,,,12,12,,,,,,,,,,,,SITE_notes: 12; AGE_notes_1: 3; FSP_notes_1: 3,1983051212,,
3,1983-05-12,1983,5,12,,12,,Fresh,12,287,,287.0,374.0,A,F,2,,,5,4,,,,,,,,,,,5,4,,,,,,12,12,,,,,,,,,,,,SITE_notes: 12; AGE_notes_1: 5; FSP_notes_1: 4,1983051212,,
4,1983-05-12,1983,5,12,,12,,Fresh,13,264,,264.0,243.0,A,M,2,,,4,4,,,,,,,,,,,4,4,,,,,,12,12,,,,,,,,,,,,SITE_notes: 12; AGE_notes_1: 4; FSP_notes_1: 4,1983051212,,


In [117]:
# Preview the first five length-frequency rows.
df_LF.head(n=5)

Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,FLAG_SITE,SITE1,SITE2,SITE3,FLAG_PERIOD,id,new_sample,new_id,length_bin_id
0,1990,5,7,,,2,12,LOWER,PM,,,250,1,250,1990-05-07,,12,,,,1990050712,False,,25.0
1,1990,5,7,,,2,12,LOWER,PM,,,253,1,250,1990-05-07,,12,,,,1990050712,False,False,25.0
2,1990,5,7,,,2,12,LOWER,PM,,,255,2,255,1990-05-07,,12,,,,1990050712,False,False,25.5
3,1990,5,7,,,2,12,LOWER,PM,,,258,2,255,1990-05-07,,12,,,,1990050712,False,False,25.5
4,1990,5,7,,,2,12,LOWER,PM,,,260,7,260,1990-05-07,,12,,,,1990050712,False,False,26.0


# TODO (later if required): better matching?
* TODO: 
    * confirm that this is a useful approach (or whether the data can be fixed at the source instead)
    * if it is, repeat it for the length-frequency (LF) data as well

In [109]:
# Build alternate sample ids (id_2, id_3) for fish rows that carry a second /
# third site, by adding (alternate site - SITE) to the row's sample_id.
# Presumably sample_id ends in the site number (e.g. 1983051212 for site 12 on
# 1983-05-12), so this swaps in the alternate site - TODO confirm.
# Note: this writes new columns into df_FD in place.
has_site2 = df_FD['SITE2'].notna()

# Seed both columns with the primary sample_id.
df_FD.loc[has_site2, 'id_2'] = df_FD.loc[has_site2, 'sample_id']
# NOTE(review): the target rows use the SITE2 mask but the source rows use the
# SITE3 mask; index alignment leaves NaN where SITE3 is missing, and a row with
# SITE3 but no SITE2 would be skipped - confirm this is intended.
df_FD.loc[has_site2, 'id_3'] = df_FD.loc[df_FD['SITE3'].notna(), 'sample_id']

# Second site: drop a trailing ')' before converting to int.
site2_number = df_FD.loc[df_FD['id_2'].notna(), 'SITE2'].str.strip(')').astype(int)
primary_site = df_FD.loc[has_site2, 'SITE'].astype(int)
df_FD.loc[has_site2, 'id_2'] = df_FD.loc[has_site2, 'id_2'] + site2_number - primary_site

# Third site: only rows that received an id_3 above contribute; the rest stay NaN.
site3_number = df_FD.loc[df_FD['id_3'].notna(), 'SITE3'].astype(int)
df_FD.loc[has_site2, 'id_3'] = df_FD.loc[has_site2, 'id_3'] + site3_number - primary_site

# df_FD[df_FD.SITE2.notna()][['SITE', 'SITE1', 'SITE2', 'SITE3', 'sample_id', 'id_2', 'id_3']]

In [110]:
# Distinct sample ids on each side of the match.
SD_ids = list(df_SD['id'].unique())
FD_ids = list(df_FD['sample_id'].unique())
# Distinct alternate ids: all id_2 values first, then all id_3 values, with
# missing entries left out.
FD_extra_ids = [
    *df_FD['id_2'].dropna().unique(),
    *df_FD['id_3'].dropna().unique(),
]

### how many unmatched samples are fixed with this method

In [111]:
# matched with previous method: sample ids that also occur as a fish-detail sample_id
sum(1 for sd_id in SD_ids if sd_id in FD_ids)

908

In [112]:
# matched with improved method: sample ids that occur among the alternate ids (id_2 / id_3)
sum(1 for sd_id in SD_ids if sd_id in FD_extra_ids)

28

In [113]:
# now check if those SD have multiple sites
newly_found_SD = sorted(sd_id for sd_id in SD_ids if sd_id in FD_extra_ids)

# Distinct SITE2 values of the newly matched samples.
# only NA in this set -> no ambiguity in site data from samples
# (none of these are ambiguous data)
df_SD.loc[df_SD['id'].isin(newly_found_SD), 'SITE2'].unique()

array([<NA>], dtype=object)