In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

sns.set_theme()

# jupyter notebook full-width display
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# no text wrapping
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>"))

# pandas formatting: 3-decimal floats, every column shown, up to 300 rows, 200-char cells
for option_name, option_value in (
    ('display.float_format', '{:.3f}'.format),
    ('display.max_columns', None),
    ('display.max_rows', 300),
    ('display.max_colwidth', 200),
):
    pd.set_option(option_name, option_value)

In [2]:
# Raw database exports. Forward slashes are used so the relative paths resolve
# on POSIX systems as well as on Windows (a backslash is only a separator on Windows).
df_hist = pd.read_csv('./csv/trapnet_biologicaldetailing_202302231329.csv', low_memory=False)
df_spec = pd.read_csv('./csv/trapnet_specimen_202302231329.csv', low_memory=False)

# historical data without trivial fields (fields with only one entry)
# NOTE: nunique() ignores NaN by default, so a column holding a single value plus NaN is dropped too.
unmatchable_hist = ['created_at', 'updated_at']
df_hist = df_hist.drop(columns=unmatchable_hist)
# the mask is computed on the already-trimmed frame so it lines up exactly with the remaining columns
df_hist = df_hist.loc[:, df_hist.nunique() > 1]
hist_sample_id_list = sorted(df_hist.sample_id.unique())

# all species in historical data are 79 (atlantic salmon)
# keep only salmon specimens that belong to a sample which also has historical rows
unmatchable_spec = ['created_at', 'updated_at', 'created_by_id', 'updated_by_id']
df_spec = df_spec[(df_spec.species_id == 79) & (df_spec.sample_id.isin(hist_sample_id_list))].reset_index(drop=True)
df_spec = df_spec.drop(columns=unmatchable_spec)
df_spec = df_spec.loc[:, df_spec.nunique() > 1]

In [3]:
# cast to int where appropriate (nullable Int64, so missing values survive the cast)
int_columns = ['fork_length', 'sex_id']
for frame in (df_hist, df_spec):
    frame[int_columns] = frame[int_columns].astype('Int64')

In [4]:
# archived fish data
# Forward slashes keep the relative path valid on POSIX systems as well as on Windows.
df_archive = pd.read_csv('./csv/fish_data.csv', low_memory=False)

# DATETIME is parsed from the first space-delimited token of SITE_EVENT_DATE (day/month/year)
event_date = df_archive['SITE_EVENT_DATE'].str.split(' ', expand=True)[0]
df_archive['DATETIME'] = pd.to_datetime(event_date, format='%d/%m/%Y')

# old_id mirrors the database key format: 'GD_' followed by the archive GD_ID
df_archive['old_id'] = 'GD_' + df_archive['GD_ID'].astype(str)

# put old_id and DATETIME first and drop the raw SITE_EVENT_DATE column
leading_columns = ['old_id', 'DATETIME']
excluded_columns = set(leading_columns) | {'SITE_EVENT_DATE'}
df_archive = df_archive[leading_columns + [col for col in df_archive.columns if col not in excluded_columns]]

In [5]:
# matching id

def add_matching_id(frame):
    """Add `matching_id` and `distinct` columns to `frame` in place and return it.

    `matching_id` is the string concatenation of sample_id, fork_length,
    weight and sex_id (missing weight / sex_id are treated as 0), with every
    '.' replaced by '_'. `distinct` is True for rows whose matching_id occurs
    exactly once in `frame`.

    NOTE: the parts are joined without a separator, so different field
    combinations could in principle produce the same id. The format is kept
    unchanged because later cells compare against literal ids.
    """
    combined = (
        frame['sample_id'].astype(str)
        + frame['fork_length'].astype(str)
        + frame['weight'].fillna(0).astype(str)
        + frame['sex_id'].fillna(0).astype(str)
    )
    frame['matching_id'] = combined.str.replace('.', '_', regex=False)
    # keep=False flags every occurrence of a repeated id, not just the later ones
    frame['distinct'] = ~frame['matching_id'].duplicated(keep=False)
    return frame


df_hist = add_matching_id(df_hist)
df_spec = add_matching_id(df_spec)

spec_ids = list(df_spec.matching_id.unique())
hist_ids = list(df_hist.matching_id.unique())

# membership is tested against a set: scanning the hist list for every spec id is quadratic
hist_id_lookup = set(hist_ids)
matches = [x for x in spec_ids if x in hist_id_lookup]

# differences between archive and biologicaldetailing
#### conclusions: old_id <-> GD_ID correctly and adequately match samples between the database and the archive

In [6]:
# inner join of the historical rows with the archive on old_id
df_merged = pd.merge(df_hist, df_archive, on='old_id')

# key columns first, everything else in case-insensitive alphabetical order
first_columns = ['sample_id', 'old_id', 'DATETIME']
other_columns = sorted((col for col in df_merged.columns if col not in first_columns), key=str.lower)
df_merged = df_merged[first_columns + other_columns]

In [7]:
# need to join with sample to get dates for comparison
# SAME NUMBER OF ROWS!!!
# The raw export is re-read so the comparison uses the unfiltered file;
# forward slashes keep the path valid on POSIX systems as well as on Windows.
raw_hist_shape = pd.read_csv('./csv/trapnet_biologicaldetailing_202302231329.csv', low_memory=False).shape
df_merged.shape, raw_hist_shape

((27524, 45), (27524, 23))

In [8]:
df_merged.SURVEY_TYPE.value_counts()

1    27351
3      173
Name: SURVEY_TYPE, dtype: int64

In [9]:
# all atlantic salmon
df_merged.SPECIES_ITIS_CODE.value_counts()

161996    27524
Name: SPECIES_ITIS_CODE, dtype: int64

In [10]:
# about 10x more salmon in the archive file
df_archive[df_archive.SPECIES_ITIS_CODE=='161996'].SPECIES_ITIS_CODE.value_counts()

161996    256154
Name: SPECIES_ITIS_CODE, dtype: int64

In [11]:
# database column (lowercase) -> archive column (uppercase) that it is compared against below
comparisons = dict(
    age_type='AGE_TYPE',
    fork_length='FORK_LENGTH',
    life_stage_id='MATURITY',
    sex_id='SEX',
    total_length='TOTAL_LENGTH',
    weight='WEIGHT',
    river_age='RIVER_AGE',
)

In [12]:
# choose one (database column, archive column) pair by position and show both side by side
i = 6
k, v = list(comparisons.items())[i]
df_merged[first_columns + [k, v]]  # how to check if these are always matched? do it individually

Unnamed: 0,sample_id,old_id,DATETIME,river_age,RIVER_AGE
0,4390,GD_1,1968-08-09,0.000,0.000
1,4390,GD_2,1968-08-09,0.000,0.000
2,4390,GD_3,1968-08-09,0.000,0.000
3,4390,GD_4,1968-08-09,0.000,0.000
4,4390,GD_5,1968-08-09,0.000,0.000
...,...,...,...,...,...
27519,8001,GD_207005,2000-07-12,0.000,0.000
27520,8001,GD_207006,2000-07-12,0.000,0.000
27521,8001,GD_207011,2000-07-12,1.000,1.000
27522,8001,GD_207012,2000-07-12,1.000,1.000


In [13]:
# Age Type is always the same
# numeric age_type codes are translated to the archive labels; missing values on both sides become 0
age_type_labels = df_merged.age_type.map({1: 'SCALE', 2: 'LGTHFREQ'}).fillna(0)
df_merged[df_merged.AGE_TYPE.fillna(0).ne(age_type_labels)]

Unnamed: 0,sample_id,old_id,DATETIME,age_type,AGE_TYPE,BIOLOGICAL_REMARKS,BIOLOGICAL_SAMPLE,CATCH_FREQUENCY,CATCHMENT_INDEX,CATCHMENT_NAME,distinct,FILE_TYPE,FISH_SIZE,FISH_STATUS,fork_length,FORK_LENGTH,FORK_LENGTH_INTERVAL_WIDTH,GD_ID,id,life_stage_id,matching_id,MATURITY,notes,ORIGIN,RECORD_IDENTIFIER,river_age,RIVER_AGE,SCALE_SAMPLE,SCALE_SAMPLE_ID,SEX,sex_id,SITE,SITE_EVENT_CODE,SPECIES_ITIS_CODE,SPECIES_LIFE_STAGE,status_id,SURVEY,SURVEY_TYPE,SWEEP_NUMBER,total_length,TOTAL_LENGTH,weight,WEIGHT,WEIGHT_PRECISION,WEIGHT_RESOLUTION


In [14]:
# fork length is always the same
# Both sides are compared as floats. Casting the archive value to int would
# truncate it, so a fractional archive length (e.g. 103.7 vs 103) would be
# wrongly reported as a match.
hist_fork_length = df_merged.fork_length.fillna(0).astype('float64')
archive_fork_length = df_merged.FORK_LENGTH.fillna(0).astype('float64')
df_merged[hist_fork_length.ne(archive_fork_length)]

Unnamed: 0,sample_id,old_id,DATETIME,age_type,AGE_TYPE,BIOLOGICAL_REMARKS,BIOLOGICAL_SAMPLE,CATCH_FREQUENCY,CATCHMENT_INDEX,CATCHMENT_NAME,distinct,FILE_TYPE,FISH_SIZE,FISH_STATUS,fork_length,FORK_LENGTH,FORK_LENGTH_INTERVAL_WIDTH,GD_ID,id,life_stage_id,matching_id,MATURITY,notes,ORIGIN,RECORD_IDENTIFIER,river_age,RIVER_AGE,SCALE_SAMPLE,SCALE_SAMPLE_ID,SEX,sex_id,SITE,SITE_EVENT_CODE,SPECIES_ITIS_CODE,SPECIES_LIFE_STAGE,status_id,SURVEY,SURVEY_TYPE,SWEEP_NUMBER,total_length,TOTAL_LENGTH,weight,WEIGHT,WEIGHT_PRECISION,WEIGHT_RESOLUTION


In [15]:
# nope. MATURITY holds no values in the merged rows, so there is nothing to compare life_stage_id against
df_merged.MATURITY.unique()

array([nan])

In [16]:
# Sex is always the same
# sex_id codes are translated to the archive letters; a missing value on either side counts as 'U'
sex_labels = df_merged.sex_id.fillna(0).map({0: 'U', 1: 'M', 2: 'F'})
df_merged[df_merged.SEX.fillna('U').ne(sex_labels)]

Unnamed: 0,sample_id,old_id,DATETIME,age_type,AGE_TYPE,BIOLOGICAL_REMARKS,BIOLOGICAL_SAMPLE,CATCH_FREQUENCY,CATCHMENT_INDEX,CATCHMENT_NAME,distinct,FILE_TYPE,FISH_SIZE,FISH_STATUS,fork_length,FORK_LENGTH,FORK_LENGTH_INTERVAL_WIDTH,GD_ID,id,life_stage_id,matching_id,MATURITY,notes,ORIGIN,RECORD_IDENTIFIER,river_age,RIVER_AGE,SCALE_SAMPLE,SCALE_SAMPLE_ID,SEX,sex_id,SITE,SITE_EVENT_CODE,SPECIES_ITIS_CODE,SPECIES_LIFE_STAGE,status_id,SURVEY,SURVEY_TYPE,SWEEP_NUMBER,total_length,TOTAL_LENGTH,weight,WEIGHT,WEIGHT_PRECISION,WEIGHT_RESOLUTION


In [17]:
# total length is always the same (missing values on both sides are treated as 0)
total_length_differs = df_merged.total_length.fillna(0).ne(df_merged.TOTAL_LENGTH.fillna(0))
df_merged[total_length_differs]

Unnamed: 0,sample_id,old_id,DATETIME,age_type,AGE_TYPE,BIOLOGICAL_REMARKS,BIOLOGICAL_SAMPLE,CATCH_FREQUENCY,CATCHMENT_INDEX,CATCHMENT_NAME,distinct,FILE_TYPE,FISH_SIZE,FISH_STATUS,fork_length,FORK_LENGTH,FORK_LENGTH_INTERVAL_WIDTH,GD_ID,id,life_stage_id,matching_id,MATURITY,notes,ORIGIN,RECORD_IDENTIFIER,river_age,RIVER_AGE,SCALE_SAMPLE,SCALE_SAMPLE_ID,SEX,sex_id,SITE,SITE_EVENT_CODE,SPECIES_ITIS_CODE,SPECIES_LIFE_STAGE,status_id,SURVEY,SURVEY_TYPE,SWEEP_NUMBER,total_length,TOTAL_LENGTH,weight,WEIGHT,WEIGHT_PRECISION,WEIGHT_RESOLUTION


In [18]:
# weight is always the same (missing values on both sides are treated as 0)
weight_differs = df_merged.weight.fillna(0).ne(df_merged.WEIGHT.fillna(0))
df_merged[weight_differs]

Unnamed: 0,sample_id,old_id,DATETIME,age_type,AGE_TYPE,BIOLOGICAL_REMARKS,BIOLOGICAL_SAMPLE,CATCH_FREQUENCY,CATCHMENT_INDEX,CATCHMENT_NAME,distinct,FILE_TYPE,FISH_SIZE,FISH_STATUS,fork_length,FORK_LENGTH,FORK_LENGTH_INTERVAL_WIDTH,GD_ID,id,life_stage_id,matching_id,MATURITY,notes,ORIGIN,RECORD_IDENTIFIER,river_age,RIVER_AGE,SCALE_SAMPLE,SCALE_SAMPLE_ID,SEX,sex_id,SITE,SITE_EVENT_CODE,SPECIES_ITIS_CODE,SPECIES_LIFE_STAGE,status_id,SURVEY,SURVEY_TYPE,SWEEP_NUMBER,total_length,TOTAL_LENGTH,weight,WEIGHT,WEIGHT_PRECISION,WEIGHT_RESOLUTION


In [19]:
# river age is always the same (missing values on both sides are treated as 0)
river_age_differs = df_merged.river_age.fillna(0).ne(df_merged.RIVER_AGE.fillna(0))
df_merged[river_age_differs]

Unnamed: 0,sample_id,old_id,DATETIME,age_type,AGE_TYPE,BIOLOGICAL_REMARKS,BIOLOGICAL_SAMPLE,CATCH_FREQUENCY,CATCHMENT_INDEX,CATCHMENT_NAME,distinct,FILE_TYPE,FISH_SIZE,FISH_STATUS,fork_length,FORK_LENGTH,FORK_LENGTH_INTERVAL_WIDTH,GD_ID,id,life_stage_id,matching_id,MATURITY,notes,ORIGIN,RECORD_IDENTIFIER,river_age,RIVER_AGE,SCALE_SAMPLE,SCALE_SAMPLE_ID,SEX,sex_id,SITE,SITE_EVENT_CODE,SPECIES_ITIS_CODE,SPECIES_LIFE_STAGE,status_id,SURVEY,SURVEY_TYPE,SWEEP_NUMBER,total_length,TOTAL_LENGTH,weight,WEIGHT,WEIGHT_PRECISION,WEIGHT_RESOLUTION


# are there any matches between GD_ID and old_id from specimen table?

In [59]:
def shared_old_ids(left, right):
    """Return the unique old_id values of the inner merge of `left` and `right` on old_id."""
    return list(pd.merge(left, right, on='old_id').old_id.unique())


id_arch_spec = shared_old_ids(df_spec, df_archive)  # archived data in spec is not in hist
id_hist_spec = shared_old_ids(df_spec, df_hist)  # no overlap between old_id in hist and spec
id_arch_hist = shared_old_ids(df_hist, df_archive)  # archived data in hist is not in spec

len(id_arch_spec), len(id_hist_spec), len(id_arch_hist)

(18992, 0, 27524)

In [64]:
# The id lists come from .unique(), so set differences give the same counts
# as scanning the lists, without the quadratic list-in-list lookups.
arch_spec_id_set = set(id_arch_spec)
len(arch_spec_id_set - set(id_hist_spec)), len(arch_spec_id_set - set(id_arch_hist))

(18992, 18992)

In [65]:
# The id lists come from .unique(), so set differences give the same counts
# as scanning the lists, without the quadratic list-in-list lookups.
arch_hist_id_set = set(id_arch_hist)
len(arch_hist_id_set - set(id_arch_spec)), len(arch_hist_id_set - set(id_hist_spec))

(27524, 27524)

In [67]:
# Both directions count the same intersection; the ids are unique, so a set
# intersection replaces the two quadratic list scans.
n_shared_arch_ids = len(set(id_arch_spec) & set(id_arch_hist))
n_shared_arch_ids, n_shared_arch_ids

(0, 0)

In [76]:
# archive rows joined to the specimens whose matching_id also occurs in the historical data
spec_with_match = df_spec[df_spec.matching_id.isin(matches)]
temp = pd.merge(df_archive, spec_with_match, on='old_id')

# number of distinct old_id among joined rows whose matching_id is present in df_hist
in_hist = temp.matching_id.isin(df_hist.matching_id.unique())
temp[in_hist].old_id.nunique()

5868

In [81]:
# ok, how many potential matches don't have old_id? none. all spec have old_id, and it is a perfect predictor of df_hist entry
sum(df_spec.old_id.isnull())

0

In [84]:
# ok, ok, now what about if we check for combos? ie, does arch include hist PLUS spec?

df_spec.head()
# need to join with samples to get date info

Unnamed: 0,id,fork_length,weight,river_age,notes,sample_id,sex_id,status_id,age_type,sweep_id,life_stage_id,old_id,smart_river_age,smart_river_age_type,matching_id,distinct
0,1708668,38,,0.0,,4467,,10.0,2.0,595.0,11.0,GD_2780,0.0,2.0,4467380_00,False
1,1708669,73,,1.0,,4467,,10.0,2.0,595.0,1.0,GD_2781,1.0,2.0,4467730_00,False
2,1708670,83,,1.0,,4467,,10.0,2.0,595.0,1.0,GD_2782,1.0,2.0,4467830_00,False
3,1708671,83,,1.0,,4467,,10.0,2.0,595.0,1.0,GD_2782,1.0,2.0,4467830_00,False
4,1708672,88,,1.0,,4467,,10.0,2.0,595.0,1.0,GD_2783,1.0,2.0,4467880_00,False


In [118]:
# merge and group by sample
# pd.merge(df_hist, df_spec, on='sample_id')
# Stack the historical and specimen rows; each keeps its own id column (hist_id / spec_id).
id_columns = ['id', 'old_id', 'sample_id', 'matching_id']
hist_fish = df_hist[id_columns].rename(columns={'id': 'hist_id'})
# only samples that contain historical fishes are included in df_spec
spec_fish = df_spec[id_columns].rename(columns={'id': 'spec_id'})

all_fish = pd.concat([hist_fish, spec_fish], axis=0)
all_fish = all_fish[['old_id', 'spec_id', 'hist_id', 'sample_id', 'matching_id']]
all_fish.shape

(102190, 5)

In [119]:
# Attach the archive RECORD_IDENTIFIER to every fish via old_id (left join keeps all fish).
# Any RECORD_IDENTIFIER left over from a previous run of this cell is dropped first;
# otherwise re-running would merge again and produce RECORD_IDENTIFIER_x / _y columns.
all_fish = pd.merge(
    all_fish.drop(columns='RECORD_IDENTIFIER', errors='ignore'),
    df_archive[['old_id', 'RECORD_IDENTIFIER']],
    on='old_id',
    how='left',
)
all_fish.shape

(102190, 6)

In [121]:
# EVERY SINGLE FISH has a record identifier!
sum(all_fish.RECORD_IDENTIFIER.isnull())

0

# I think that means that all historical data that could be matched was matched and then deleted, so the two lists are disjoint.
# This sounds wrong, so I need to confirm it using a different method.
# TODO: 
* merge archived data with sample number
* count number of fish in each sample
* is this number equal to df_spec PLUS df_hist?
    * if so, that would confirm these calculations
# also todo: clean up these calculations

In [22]:
# You shall not pass!
# NOTE(review): presumably a deliberate stop so that a "Run All" does not continue
# into the scratch cells below; the bare ValueError carries no message.
raise ValueError

ValueError: 

In [40]:
# archive joined to every specimen on old_id; count rows whose matching_id also exists in hist vs. all rows
temp = pd.merge(df_archive, df_spec, on='old_id')
has_hist_match = temp.matching_id.isin(matches)
len(temp[has_hist_match]), len(temp)

(32373, 74666)

In [24]:
# this looks like 32k verifiably incorrect matches... I think I am missing something...
df_merged.matching_id

0        4390340_51
1        4390340_62
2        4390360_51
3        4390360_61
4        4390361_31
            ...    
27519    8001430_00
27520    8001430_00
27521    8001790_00
27522    8001880_00
27523    8001910_00
Name: matching_id, Length: 27524, dtype: object

In [26]:
# why is there 0 overlap in old_id? maybe everything that was matched was deleted?
pd.merge(df_hist, df_spec, on='old_id')

Unnamed: 0,id_x,fork_length_x,total_length,weight_x,age_type_x,river_age_x,notes_x,old_id,life_stage_id_x,sample_id_x,sex_id_x,status_id_x,matching_id_x,distinct_x,id_y,fork_length_y,weight_y,river_age_y,notes_y,sample_id_y,sex_id_y,status_id_y,age_type_y,sweep_id,life_stage_id_y,smart_river_age,smart_river_age_type,matching_id_y,distinct_y


In [34]:
df_archive[df_archive.old_id=='GD_560']

Unnamed: 0,old_id,DATETIME,SURVEY,SITE,CATCHMENT_NAME,CATCHMENT_INDEX,SURVEY_TYPE,SITE_EVENT_CODE,SWEEP_NUMBER,RECORD_IDENTIFIER,SPECIES_ITIS_CODE,SPECIES_LIFE_STAGE,ORIGIN,FISH_STATUS,FORK_LENGTH,FORK_LENGTH_INTERVAL_WIDTH,TOTAL_LENGTH,WEIGHT,WEIGHT_RESOLUTION,WEIGHT_PRECISION,FILE_TYPE,CATCH_FREQUENCY,FISH_SIZE,RIVER_AGE,AGE_TYPE,SEX,MATURITY,BIOLOGICAL_REMARKS,BIOLOGICAL_SAMPLE,SCALE_SAMPLE,SCALE_SAMPLE_ID,GD_ID
559,GD_560,1970-08-19,1970,42,MIRAMICHI,2,1,1,1.0,8624996,161996,1731.0,W,RS,103.0,5.0,,,,,1.0,4,PARR,2.0,LGTHFREQ,,,,1.0,,,560


In [33]:
df_spec[df_spec.old_id=='GD_560']

Unnamed: 0,id,fork_length,weight,river_age,notes,sample_id,sex_id,status_id,age_type,sweep_id,life_stage_id,old_id,smart_river_age,smart_river_age_type,matching_id,distinct
931,2221627,103,,2.0,,4412,,10.0,2.0,340.0,1.0,GD_560,2.0,2.0,44121030_00,False
932,2221628,103,,2.0,,4412,,10.0,2.0,340.0,1.0,GD_560,2.0,2.0,44121030_00,False
933,2221629,103,,2.0,,4412,,10.0,2.0,340.0,1.0,GD_560,2.0,2.0,44121030_00,False
934,2221630,103,,2.0,,4412,,10.0,2.0,340.0,1.0,GD_560,2.0,2.0,44121030_00,False


In [35]:
df_hist[df_hist.matching_id=='44121030_00']

Unnamed: 0,id,fork_length,total_length,weight,age_type,river_age,notes,old_id,life_stage_id,sample_id,sex_id,status_id,matching_id,distinct
226,69578,103,,,1.0,2.0,,GD_548,1,4412,,10,44121030_00,True


In [36]:
df_archive[df_archive.old_id=='GD_548']

Unnamed: 0,old_id,DATETIME,SURVEY,SITE,CATCHMENT_NAME,CATCHMENT_INDEX,SURVEY_TYPE,SITE_EVENT_CODE,SWEEP_NUMBER,RECORD_IDENTIFIER,SPECIES_ITIS_CODE,SPECIES_LIFE_STAGE,ORIGIN,FISH_STATUS,FORK_LENGTH,FORK_LENGTH_INTERVAL_WIDTH,TOTAL_LENGTH,WEIGHT,WEIGHT_RESOLUTION,WEIGHT_PRECISION,FILE_TYPE,CATCH_FREQUENCY,FISH_SIZE,RIVER_AGE,AGE_TYPE,SEX,MATURITY,BIOLOGICAL_REMARKS,BIOLOGICAL_SAMPLE,SCALE_SAMPLE,SCALE_SAMPLE_ID,GD_ID
547,GD_548,1970-08-19,1970,42,MIRAMICHI,2,1,1,0.0,8624981,161996,1731.0,W,RS,103.0,1.0,,,,,2.0,1,PARR,2.0,SCALE,,,,1.0,,,548


In [38]:
# df_spec - raw
# Forward slashes keep the relative path valid on POSIX systems as well as on Windows.
pd.read_csv('./csv/trapnet_specimen_202302231329.csv', low_memory=False)

Unnamed: 0,id,created_at,updated_at,fork_length,total_length,weight,river_age,scale_id_number,notes,created_by_id,origin_id,sample_id,sex_id,species_id,status_id,updated_by_id,tag_number,age_type,ocean_age,sweep_id,life_stage_id,reproductive_status_id,adipose_condition,old_id,fork_length_bin_interval,maturity_id,smart_river_age,smart_river_age_type
0,253146,2021-11-02 18:35:41.753618,2022-12-12 21:52:53.84318,97.000,,11.330,,EFRP5-1 2021,,40.000,2.000,1794,3.000,79,10.000,40.000,,,,1.000,1.000,,1.000,,1.000,,,
1,253147,2021-11-02 18:44:24.433456,2022-12-12 21:52:53.893623,86.000,,7.660,,EFRP5-2 2021,,40.000,2.000,1794,1.000,79,10.000,40.000,,,,1.000,1.000,,1.000,,1.000,,,
2,253148,2021-11-02 18:44:24.445103,2022-12-12 21:52:53.943221,113.000,,13.320,,EFRP5-3 2021,,40.000,2.000,1794,3.000,79,10.000,40.000,,,,1.000,1.000,,1.000,,1.000,,,
3,253149,2021-11-02 18:44:24.4852,2022-12-12 21:52:53.993065,126.000,,23.370,,EFRP5-4 2021,,40.000,2.000,1794,1.000,79,10.000,40.000,,,,1.000,1.000,,1.000,,1.000,,,
4,253150,2021-11-02 18:44:24.500002,2022-12-12 21:52:54.060298,85.000,,7.830,,EFRP5-5 2021,,40.000,2.000,1794,1.000,79,10.000,40.000,,,,1.000,1.000,,1.000,,1.000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1006673,2436015,2023-02-03 14:57:19.599493,2023-02-03 14:57:19.599524,,,,,,,2435.000,,4140,,79,5.000,,,,,,1.000,,,,1.000,,,
1006674,2436016,2023-02-03 14:57:19.693873,2023-02-03 14:57:19.693902,,,,,,,2435.000,,4140,,79,5.000,,,,,,1.000,,,,1.000,,,
1006675,2436017,2023-02-03 14:57:19.760322,2023-02-03 14:57:19.760369,,,,,,,2435.000,,4140,,79,5.000,,,,,,1.000,,,,1.000,,,
1006676,2436018,2023-02-03 14:57:19.771258,2023-02-03 14:57:19.771303,,,,,,,2435.000,,4140,,79,5.000,,,,,,1.000,,,,1.000,,,


# could we get better matches using historical data instead of dm_apps data?

# samples with potential matches

In [None]:
len(df_hist.sample_id.unique()), len(df_spec.sample_id.unique())

# old_id will not work

In [None]:
# old_id never matches
pd.merge(df_hist, df_spec, on='old_id')

In [None]:
# old_id is useless for hist <-> spec matching, so remove it from both frames.
# errors='ignore' keeps the cell re-runnable: a second run would otherwise raise
# KeyError because the column is already gone.
df_hist = df_hist.drop(columns='old_id', errors='ignore')
df_spec = df_spec.drop(columns='old_id', errors='ignore')

# how to identify distinct fish for matching

In [None]:
# distinct matches for 60% of hist fish
# group size (non-null id count) per combination of all candidate key columns, NaN keys included
hist_keys_full = ['sample_id', 'fork_length', 'weight', 'sex_id', 'life_stage_id', 'river_age', 'status_id']
df_hist.groupby(hist_keys_full, dropna=False)['id'].count().describe(percentiles=[.65, .70, .90])

In [None]:
# same group-size summary using sample, fork length, weight and sex only
hist_keys_with_sex = ['sample_id', 'fork_length', 'weight', 'sex_id']
df_hist.groupby(hist_keys_with_sex, dropna=False)['id'].count().describe(percentiles=[.65, .70, .90])

In [None]:
# same group-size summary using sample, fork length and weight only
hist_keys_minimal = ['sample_id', 'fork_length', 'weight']
df_hist.groupby(hist_keys_minimal, dropna=False)['id'].count().describe(percentiles=[.65, .70, .90])

In [None]:
# specimen group sizes (non-null id count) over all candidate key columns, NaN keys included
groupby = ['sample_id', 'fork_length', 'weight', 'sex_id', 'life_stage_id', 'river_age', 'status_id']
spec_group_sizes = df_spec.groupby(groupby, dropna=False)['id'].count()
spec_group_sizes.describe(percentiles=[.30, .35])

In [None]:
# specimen group sizes using sample, fork length, weight and sex only
groupby = ['sample_id', 'fork_length', 'weight', 'sex_id']
spec_group_sizes = df_spec.groupby(groupby, dropna=False)['id'].count()
spec_group_sizes.describe(percentiles=[.30, .35])

In [None]:
# specimen group sizes using sample, fork length and weight only
groupby = ['sample_id', 'fork_length', 'weight']
spec_group_sizes = df_spec.groupby(groupby, dropna=False)['id'].count()
spec_group_sizes.describe(percentiles=[.30, .35])

### percentages of distinct fish
* about 60% of historical fish groupings can be uniquely identified by length, weight, and sample id
* about 30% of specimen fish groupings can be uniquely identified by length, weight, and sample id
* sex_id differentiates at least one indistinct example, so just add it
* maybe exact matches don't matter - same attributes, same sample, maybe just matching in order would work

##### how many matches are distinct?

In [None]:
df_spec[df_spec.matching_id.isin(matches)].distinct.value_counts()

In [None]:
df_hist[df_hist.matching_id.isin(matches)].distinct.value_counts()

In [None]:
# indistinct matches - from specimen
df_spec[df_spec.matching_id.isin(matches)].shape[0], df_spec.shape[0]

In [None]:
# indistinct matches - from historical
df_hist[df_hist.matching_id.isin(matches)].shape[0], df_hist.shape[0]

In [None]:
# missing from
len([x for x in spec_ids if x not in hist_ids]), len([x for x in hist_ids if x not in spec_ids])
# hist, spec

In [None]:
# matched
# Both directions count the same intersection of unique ids, so one set
# intersection replaces the two quadratic list scans.
n_matched_ids = len(set(spec_ids) & set(hist_ids))
n_matched_ids, n_matched_ids

In [None]:
# total number of matching ids
len(hist_ids), len(spec_ids)

In [None]:
# distinct exact matches
# `distinct` is already boolean, so it is combined with `&` directly; this avoids
# relying on how `&` and `==` bind relative to each other.
hist_distinct_match = df_hist.matching_id.isin(matches) & df_hist.distinct
distinct_matched_hist = list(df_hist[hist_distinct_match].matching_id)

spec_distinct_match = df_spec.matching_id.isin(distinct_matched_hist) & df_spec.distinct
df_spec[spec_distinct_match].shape[0]

# let's build a throwaway html to list all of the potential matches

    -- list of sample id with historical and specimen data
    SELECT trapnet_sample.id
        FROM trapnet_sample
            JOIN trapnet_biologicaldetailing ON trapnet_biologicaldetailing.sample_id = trapnet_sample.id
            JOIN trapnet_specimen ON trapnet_specimen.sample_id = trapnet_sample.id
    GROUP BY trapnet_sample.id

In [None]:
# df_matching_id = pd.read_csv(r'.\csv\SQL_matching_id.csv')
# link_template = '<a href="http://127.0.0.1:8000/en/trapnet/samples/{}/view/">{}</a>'

# with open('all_matches.html', 'w') as f:
#     for i, row in df_matching_id.iterrows():
#         pk = row['id']
#         f.write(link_template.format(pk, pk) + '\n')