In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

sns.set_theme()

# jupyter notebook full-width display
# (display/HTML are imported from the public IPython.display module;
#  importing them from IPython.core.display is deprecated)
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# no text wrapping
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>"))

# pandas formatting: three-decimal floats, every column shown, up to 300 rows,
# and cell text up to 200 characters wide
PANDAS_DISPLAY_OPTIONS = {
    'display.float_format': '{:.3f}'.format,
    'display.max_columns': None,
    'display.max_rows': 300,
    'display.max_colwidth': 200,
}
for option_name, option_value in PANDAS_DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [2]:
# Raw table exports. Forward slashes keep the relative paths valid on both
# Windows and POSIX systems (backslash separators only resolve on Windows).
df_hist = pd.read_csv('./csv/trapnet_biologicaldetailing_202302231329.csv', low_memory=False)
df_spec = pd.read_csv('./csv/trapnet_specimen_202302231329.csv', low_memory=False)

# historical data without trivial fields (fields with only one entry)
unmatchable_hist = ['created_at', 'updated_at']
df_hist = df_hist.drop(columns=unmatchable_hist)
# the mask is computed on the already-trimmed frame so it lines up exactly
# with the columns being selected
df_hist = df_hist.loc[:, df_hist.nunique() > 1]
hist_sample_id_list = sorted(df_hist.sample_id.unique())

# all species in historical data are 79 (atlantic salmon)
unmatchable_spec = ['created_at', 'updated_at', 'created_by_id', 'updated_by_id']
# keep only salmon specimens that belong to a sample present in the historical data
is_salmon = df_spec.species_id == 79
in_hist_samples = df_spec.sample_id.isin(hist_sample_id_list)
df_spec = df_spec[is_salmon & in_hist_samples].reset_index(drop=True)
df_spec = df_spec.drop(columns=unmatchable_spec)
df_spec = df_spec.loc[:, df_spec.nunique() > 1]

In [3]:
# cast to int where appropriate
# ('Int64' is pandas' nullable integer dtype)
int_columns = ['fork_length', 'sex_id']
df_hist[int_columns] = df_hist[int_columns].astype('Int64')
df_spec[int_columns] = df_spec[int_columns].astype('Int64')

# samples with potential matches

In [4]:
# number of different sample_id values in each table (a missing value would count as one value)
df_hist.sample_id.nunique(dropna=False), df_spec.sample_id.nunique(dropna=False)

(793, 775)

# old_id will not work

In [5]:
# old_id never matches: the inner join of the two tables on old_id comes back empty
df_hist.merge(df_spec, how='inner', on='old_id')

Unnamed: 0,id_x,fork_length_x,total_length,weight_x,age_type_x,river_age_x,notes_x,old_id,life_stage_id_x,sample_id_x,sex_id_x,status_id_x,id_y,fork_length_y,weight_y,river_age_y,notes_y,sample_id_y,sex_id_y,status_id_y,age_type_y,sweep_id,life_stage_id_y,smart_river_age,smart_river_age_type


In [6]:
# old_id is of no use for matching, so remove it from both tables
df_hist = df_hist.drop(columns='old_id')
df_spec = df_spec.drop(columns='old_id')

# how to identify distinct fish for matching

In [9]:
# how many historical fish share every candidate matching attribute
# (a group of size 1 is a fish that can be identified unambiguously)
df_hist.groupby(
    ['sample_id', 'fork_length', 'weight', 'sex_id', 'life_stage_id', 'river_age', 'status_id'],
    dropna=False,
)['id'].count().describe(percentiles=[.65, .70, .90])

count   14660.000
mean        1.877
std         2.399
min         1.000
50%         1.000
65%         1.000
70%         2.000
90%         4.000
max        50.000
Name: id, dtype: float64

In [10]:
# same summary with the grouping reduced to sample, length, weight and sex
df_hist.groupby(
    ['sample_id', 'fork_length', 'weight', 'sex_id'],
    dropna=False,
)['id'].count().describe(percentiles=[.65, .70, .90])

count   14522.000
mean        1.895
std         2.413
min         1.000
50%         1.000
65%         1.000
70%         2.000
90%         4.000
max        50.000
Name: id, dtype: float64

In [11]:
# same summary with the grouping reduced to sample, length and weight only
df_hist.groupby(
    ['sample_id', 'fork_length', 'weight'],
    dropna=False,
)['id'].count().describe(percentiles=[.65, .70, .90])

count   14433.000
mean        1.907
std         2.419
min         1.000
50%         1.000
65%         1.000
70%         2.000
90%         4.000
max        50.000
Name: id, dtype: float64

In [12]:
# group sizes in the specimen table when grouping on every candidate matching attribute
groupby = [
    'sample_id', 'fork_length', 'weight', 'sex_id',
    'life_stage_id', 'river_age', 'status_id',
]
df_spec.groupby(groupby, dropna=False)['id'].count().describe(percentiles=[.30, .35])

count   8770.000
mean       8.514
std       19.040
min        1.000
30%        1.000
35%        2.000
50%        3.000
max      425.000
Name: id, dtype: float64

In [13]:
# group sizes in the specimen table when grouping on sample, length, weight and sex
groupby = [
    'sample_id',
    'fork_length',
    'weight',
    'sex_id',
]
df_spec.groupby(groupby, dropna=False)['id'].count().describe(percentiles=[.30, .35])

count   8768.000
mean       8.516
std       19.042
min        1.000
30%        1.000
35%        2.000
50%        3.000
max      425.000
Name: id, dtype: float64

In [14]:
# group sizes in the specimen table when grouping on sample, length and weight only
groupby = [
    'sample_id',
    'fork_length',
    'weight',
]
df_spec.groupby(groupby, dropna=False)['id'].count().describe(percentiles=[.30, .35])

count   8767.000
mean       8.517
std       19.043
min        1.000
30%        1.000
35%        2.000
50%        3.000
max      425.000
Name: id, dtype: float64

### percentages of distinct fish
* roughly 65-70% of historical fish groupings contain a single fish, i.e. can be uniquely identified by sample_id, fork_length, and weight
* roughly 30-35% of specimen fish groupings contain a single fish, i.e. can be uniquely identified by sample_id, fork_length, and weight
* sex_id differentiates at least one indistinct example, so just add it
* maybe exact matches don't matter - same attributes, same sample, maybe just matching in order would work

In [18]:
def add_matching_id(df):
    """Add the `matching_id` and `distinct` columns to `df` in place.

    `matching_id` is a string key built from sample_id, fork_length, weight
    and sex_id; missing weight / sex_id values are treated as 0, as before.
    The parts are joined with '|' so that different attribute combinations
    cannot collapse into the same key (without a delimiter, sample 12 +
    length 345 and sample 123 + length 45 both start with '12345').
    `distinct` is True for rows whose matching_id occurs exactly once in `df`.

    Both tables must be keyed by this same function so their ids stay comparable.
    """
    sep = '|'
    matching_id = (
        df['sample_id'].astype(str)
        + sep + df['fork_length'].astype(str)
        + sep + df['weight'].fillna(0).astype(str)
        + sep + df['sex_id'].fillna(0).astype(str)
    )
    df['matching_id'] = matching_id.str.replace('.', '_', regex=False)
    # keep=False flags every occurrence of a repeated key, not just the later ones
    df['distinct'] = ~df['matching_id'].duplicated(keep=False)


add_matching_id(df_hist)
add_matching_id(df_spec)

In [24]:
# unique matching ids found in each table
spec_ids = df_spec.matching_id.unique().tolist()
hist_ids = df_hist.matching_id.unique().tolist()

In [43]:
# indistinct matches - from specimen
# (specimen rows whose matching_id also occurs in the historical table, next to
#  the total number of specimen rows; membership is tested against df_hist
#  directly because `matches` is only defined in a later cell and would raise a
#  NameError when the notebook is run top to bottom)
df_spec[df_spec.matching_id.isin(df_hist.matching_id)].shape[0], df_spec.shape[0]

(32373, 74666)

In [44]:
# indistinct matches - from historical
# (historical rows whose matching_id also occurs in the specimen table, next to
#  the total number of historical rows; membership is tested against df_spec
#  directly because `matches` is only defined in a later cell and would raise a
#  NameError when the notebook is run top to bottom)
df_hist[df_hist.matching_id.isin(df_spec.matching_id)].shape[0], df_hist.shape[0]

(5224, 27524)

In [26]:
# missing from
# (set differences give the same counts as the element-by-element list scans,
#  since both id lists hold unique values, without the quadratic membership tests)
len(set(spec_ids) - set(hist_ids)), len(set(hist_ids) - set(spec_ids))
# hist, spec

(6706, 12460)

In [27]:
# matched
# (ids present in both tables; the intersection is symmetric, so the count is
#  the same from either side and is shown twice to mirror the cell above)
n_matched_ids = len(set(spec_ids) & set(hist_ids))
n_matched_ids, n_matched_ids

(2062, 2062)

In [28]:
# total number of matching ids (historical first, then specimen)
tuple(len(id_list) for id_list in (hist_ids, spec_ids))

(14522, 8768)

In [29]:
# how many matches are distinct?
# matching ids present in both tables, kept in specimen order; the set makes each
# membership test constant-time instead of a scan over the whole hist_ids list
hist_id_lookup = set(hist_ids)
matches = [x for x in spec_ids if x in hist_id_lookup]

In [32]:
# among specimen rows with a matched id: how many are distinct within the specimen table
df_spec.loc[df_spec.matching_id.isin(matches), 'distinct'].value_counts()

False    32015
True       358
Name: distinct, dtype: int64

In [33]:
# among historical rows with a matched id: how many are distinct within the historical table
df_hist.loc[df_hist.matching_id.isin(matches), 'distinct'].value_counts()

False    4183
True     1041
Name: distinct, dtype: int64

In [54]:
# distinct exact matches
# ids that are matched and unique within the historical table ...
# (`distinct` is already boolean, so it is combined directly; the previous
#  `mask & df_hist.distinct==True` parsed as `(mask & df_hist.distinct) == True`
#  because `&` binds tighter than `==`)
hist_distinct_match = df_hist.matching_id.isin(matches) & df_hist['distinct']
distinct_matched_hist = list(df_hist.loc[hist_distinct_match, 'matching_id'])
# ... that are also unique within the specimen table
spec_distinct_match = df_spec.matching_id.isin(distinct_matched_hist) & df_spec['distinct']
df_spec[spec_distinct_match].shape[0]

271

# let's build a throwaway html to list all of the potential matches

    -- ids of samples that have rows in both the historical (biological detailing) and specimen tables
    SELECT DISTINCT s.id
        FROM trapnet_sample AS s
            JOIN trapnet_biologicaldetailing AS bd ON bd.sample_id = s.id
            JOIN trapnet_specimen AS sp ON sp.sample_id = s.id

In [13]:
# Sample ids exported by the SQL query; forward slashes keep the relative path
# valid on both Windows and POSIX systems.
df_matching_id = pd.read_csv('./csv/SQL_matching_id.csv')
link_template = '<a href="http://127.0.0.1:8000/en/trapnet/samples/{}/view/">{}</a>'

# One link per sample id, one per line. Only the id column is needed, so iterate
# over it directly rather than building a Series per row with iterrows().
with open('all_matches.html', 'w', encoding='utf-8') as f:
    f.writelines(link_template.format(pk, pk) + '\n' for pk in df_matching_id['id'])

# 