In [106]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Apply seaborn's default theme to all matplotlib figures in this notebook.
sns.set_theme()

# jupyter notebook full-width display
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# no text wrapping inside rendered dataframe cells
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>"))

# pandas formatting: 3-decimal floats, show every column, up to 300 rows,
# and up to 200 characters per cell before truncation
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 200)

In [41]:
# Load both raw exports. Forward slashes keep the relative paths valid on
# Windows and on POSIX systems (backslash separators only resolve on Windows).
df_hist = pd.read_csv('./csv/trapnet_biologicaldetailing_202302231329.csv', low_memory=False)
df_spec = pd.read_csv('./csv/trapnet_specimen_202302231329.csv', low_memory=False)

# historical data without unmatchable fields and without trivial fields
# (fields with only one entry). The mask is computed on the already-reduced
# frame so its labels line up exactly with the columns being selected.
# note: nunique() ignores NaN, so a column with one value plus blanks is dropped too
unmatchable_hist = ['created_at', 'updated_at']
df_hist = df_hist.drop(columns=unmatchable_hist)
df_hist = df_hist.loc[:, df_hist.nunique() > 1]
hist_sample_id_list = sorted(df_hist.sample_id.unique())

# all species in historical data are 79 (atlantic salmon), so keep only
# specimen rows for that species that belong to a sample present in df_hist
unmatchable_spec = ['created_at', 'updated_at', 'created_by_id', 'updated_by_id']
df_spec = df_spec[(df_spec.species_id == 79) & (df_spec.sample_id.isin(hist_sample_id_list))].reset_index(drop=True)
df_spec = df_spec.drop(columns=unmatchable_spec)
df_spec = df_spec.loc[:, df_spec.nunique() > 1]

# samples with potential matches

In [46]:
# Number of distinct sample ids in the historical data vs. in the filtered
# specimen data. Uses `hist_sample_id_list` (the name the load cell defines);
# the previous `hist_sample_id` is never assigned and fails on a fresh kernel.
len(hist_sample_id_list), len(df_spec.sample_id.unique())

(793, 775)

# old_id will not work

In [53]:
# Inner join of the two frames on old_id; an empty result means no old_id
# value is shared between historical and specimen rows.
df_hist.merge(df_spec, on='old_id')

Unnamed: 0,id_x,fork_length_x,total_length,weight_x,age_type_x,river_age_x,notes_x,old_id,life_stage_id_x,sample_id_x,sex_id_x,status_id_x,id_y,fork_length_y,weight_y,river_age_y,notes_y,sample_id_y,sex_id_y,status_id_y,age_type_y,sweep_id,life_stage_id_y,smart_river_age,smart_river_age_type


In [124]:
# Remove old_id from both frames. errors='ignore' makes the cell safe to
# re-run: once the column is gone a second execution is a no-op instead of
# raising KeyError.
df_hist = df_hist.drop(columns='old_id', errors='ignore')
df_spec = df_spec.drop(columns='old_id', errors='ignore')

In [125]:
# Preview the first five rows of the historical frame.
df_hist.head(n=5)

Unnamed: 0,id,fork_length,total_length,weight,age_type,river_age,notes,life_stage_id,sample_id,sex_id,status_id
0,69352,34.0,,0.5,1.0,0.0,,11,4390,1.0,4
1,69353,34.0,,0.6,1.0,0.0,,11,4390,2.0,4
2,69354,36.0,,0.5,1.0,0.0,,11,4390,1.0,4
3,69355,36.0,,0.6,1.0,0.0,,11,4390,1.0,4
4,69356,36.0,,1.3,1.0,0.0,,11,4390,1.0,4


In [126]:
# Preview the first five rows of the specimen frame.
df_spec.head(n=5)

Unnamed: 0,id,fork_length,weight,river_age,notes,sample_id,sex_id,status_id,age_type,sweep_id,life_stage_id,smart_river_age,smart_river_age_type
0,1708668,38.0,,0.0,,4467,,10.0,2.0,595.0,11.0,0.0,2.0
1,1708669,73.0,,1.0,,4467,,10.0,2.0,595.0,1.0,1.0,2.0
2,1708670,83.0,,1.0,,4467,,10.0,2.0,595.0,1.0,1.0,2.0
3,1708671,83.0,,1.0,,4467,,10.0,2.0,595.0,1.0,1.0,2.0
4,1708672,88.0,,1.0,,4467,,10.0,2.0,595.0,1.0,1.0,2.0


# how to identify distinct fish for matching

In [176]:
# Size of each group of historical fish sharing all seven attributes
# (NaN keys kept as their own group). The percentiles describe groups,
# i.e. distinct attribute combinations - a value of 1 means the
# combination identifies a single fish.
(
    df_hist
    .groupby(
        ['sample_id', 'fork_length', 'weight', 'sex_id', 'life_stage_id', 'river_age', 'status_id'],
        dropna=False,
    )['id']
    .count()
    .describe(percentiles=[.65, .70, .90])
)

count   14660.000
mean        1.877
std         2.399
min         1.000
50%         1.000
65%         1.000
70%         2.000
90%         4.000
max        50.000
Name: id, dtype: float64

In [177]:
# Same group-size summary for historical fish, keyed on sample, length,
# weight and sex only.
(
    df_hist
    .groupby(['sample_id', 'fork_length', 'weight', 'sex_id'], dropna=False)['id']
    .count()
    .describe(percentiles=[.65, .70, .90])
)

count   14522.000
mean        1.895
std         2.413
min         1.000
50%         1.000
65%         1.000
70%         2.000
90%         4.000
max        50.000
Name: id, dtype: float64

In [178]:
# Same group-size summary for historical fish, keyed on sample, length
# and weight only.
(
    df_hist
    .groupby(['sample_id', 'fork_length', 'weight'], dropna=False)['id']
    .count()
    .describe(percentiles=[.65, .70, .90])
)

count   14433.000
mean        1.907
std         2.419
min         1.000
50%         1.000
65%         1.000
70%         2.000
90%         4.000
max        50.000
Name: id, dtype: float64

In [183]:
# Group-size summary for specimen fish keyed on all seven shared attributes
# (NaN keys kept as their own group); percentiles are over groups.
groupby = [
    'sample_id', 'fork_length', 'weight', 'sex_id',
    'life_stage_id', 'river_age', 'status_id',
]
df_spec.groupby(groupby, dropna=False)['id'].count().describe(percentiles=[.30, .35])

count   8770.000
mean       8.514
std       19.040
min        1.000
30%        1.000
35%        2.000
50%        3.000
max      425.000
Name: id, dtype: float64

In [185]:
# Group-size summary for specimen fish keyed on sample, length, weight
# and sex only.
groupby = [
    'sample_id',
    'fork_length',
    'weight',
    'sex_id',
]
df_spec.groupby(groupby, dropna=False)['id'].count().describe(percentiles=[.30, .35])

count   8768.000
mean       8.516
std       19.042
min        1.000
30%        1.000
35%        2.000
50%        3.000
max      425.000
Name: id, dtype: float64

In [187]:
# Group-size summary for specimen fish keyed on sample, length and
# weight only.
groupby = [
    'sample_id',
    'fork_length',
    'weight',
]
df_spec.groupby(groupby, dropna=False)['id'].count().describe(percentiles=[.30, .35])

count   8767.000
mean       8.517
std       19.043
min        1.000
30%        1.000
35%        2.000
40%        2.000
50%        3.000
max      425.000
Name: id, dtype: float64

### percentages of distinct fish
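* about 65% of the distinct (sample_id, fork_length, weight) combinations in the historical data belong to exactly one fish; because the mean group size is about 1.9, that is only roughly 35% of historical fish
* about 30% of the distinct (sample_id, fork_length, weight) combinations in the specimen data belong to exactly one fish; because the mean group size is about 8.5, that is only roughly 4% of specimen fish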
* no notable improvements in number of distinctly identified fish when adding 'sex_id', 'life_stage_id', 'river_age', 'status_id'
* exact one-to-one matches may not matter: fish with the same attributes in the same sample are interchangeable, so simply pairing them in order might work