In [106]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

sns.set_theme()

# jupyter notebook full-width display
# `display` is imported from IPython.display: importing it from IPython.core.display
# has been deprecated since IPython 7.14
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# no text wrapping inside rendered dataframe cells
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>"))

# pandas formatting: 3-decimal floats, all columns, up to 300 rows, cell text up to 200 chars
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 200)

In [41]:
# The CSV exports are read from the ./csv folder, relative to the working directory.
# pathlib joins the path with the platform's own separator; the previous r'.\csv\...'
# literals only resolved on Windows.
DATA_DIR = Path('csv')
ATLANTIC_SALMON_SPECIES_ID = 79

df_hist_raw = pd.read_csv(DATA_DIR / 'trapnet_biologicaldetailing_202302231329.csv', low_memory=False)
df_spec_raw = pd.read_csv(DATA_DIR / 'trapnet_specimen_202302231329.csv', low_memory=False)

# historical data without trivial fields (fields with only one entry)
unmatchable_hist = ['created_at', 'updated_at']
df_hist = df_hist_raw.drop(columns=unmatchable_hist)
# the mask is computed on the already-trimmed frame so it lines up with its columns exactly
df_hist = df_hist.loc[:, df_hist.nunique() > 1]
hist_sample_id_list = sorted(df_hist.sample_id.unique())

# all species in historical data are 79 (atlantic salmon)
# keep only salmon specimens whose sample also appears in the historical table
unmatchable_spec = ['created_at', 'updated_at', 'created_by_id', 'updated_by_id']
is_salmon = df_spec_raw.species_id == ATLANTIC_SALMON_SPECIES_ID
in_hist_samples = df_spec_raw.sample_id.isin(hist_sample_id_list)
df_spec = df_spec_raw[is_salmon & in_hist_samples].reset_index(drop=True)
df_spec = df_spec.drop(columns=unmatchable_spec)
# drop fields left with a single value after filtering
df_spec = df_spec.loc[:, df_spec.nunique() > 1]

In [42]:
# peek at the first five rows of df_hist
df_hist.head(n=5)

Unnamed: 0,id,fork_length,total_length,weight,age_type,river_age,notes,old_id,life_stage_id,sample_id,sex_id,status_id
0,69352,34.0,,0.5,1.0,0.0,,GD_1,11,4390,1.0,4
1,69353,34.0,,0.6,1.0,0.0,,GD_2,11,4390,2.0,4
2,69354,36.0,,0.5,1.0,0.0,,GD_3,11,4390,1.0,4
3,69355,36.0,,0.6,1.0,0.0,,GD_4,11,4390,1.0,4
4,69356,36.0,,1.3,1.0,0.0,,GD_5,11,4390,1.0,4


In [43]:
# peek at the first five rows of df_spec
df_spec.head(n=5)

Unnamed: 0,id,fork_length,weight,river_age,notes,sample_id,sex_id,status_id,age_type,sweep_id,life_stage_id,old_id,smart_river_age,smart_river_age_type
0,1708668,38.0,,0.0,,4467,,10.0,2.0,595.0,11.0,GD_2780,0.0,2.0
1,1708669,73.0,,1.0,,4467,,10.0,2.0,595.0,1.0,GD_2781,1.0,2.0
2,1708670,83.0,,1.0,,4467,,10.0,2.0,595.0,1.0,GD_2782,1.0,2.0
3,1708671,83.0,,1.0,,4467,,10.0,2.0,595.0,1.0,GD_2782,1.0,2.0
4,1708672,88.0,,1.0,,4467,,10.0,2.0,595.0,1.0,GD_2783,1.0,2.0


In [46]:
# distinct sample ids in the historical table vs. in the filtered specimen table
# (uses hist_sample_id_list from the loading cell; `hist_sample_id` is not defined anywhere)
len(hist_sample_id_list), len(df_spec.sample_id.unique())

(793, 775)

### old_id: values never match between df_hist and df_spec

In [53]:
# inner join on old_id: an empty result means the two tables share no old_id value
df_hist.merge(df_spec, how='inner', on='old_id')

Unnamed: 0,id_x,fork_length_x,total_length,weight_x,age_type_x,river_age_x,notes_x,old_id,life_stage_id_x,sample_id_x,sex_id_x,status_id_x,id_y,fork_length_y,weight_y,river_age_y,notes_y,sample_id_y,sex_id_y,status_id_y,age_type_y,sweep_id,life_stage_id_y,smart_river_age,smart_river_age_type


In [64]:
# drop the first three characters of old_id (the 'GD_' prefix in the rows shown) and
# look at the numeric part of the last five rows
df_hist['old_id'].str.slice(start=3).astype(int).tail(5)

27519    207005
27520    207006
27521    207011
27522    207012
27523    207014
Name: old_id, dtype: int32

In [66]:
# same numeric part of old_id, for the last ten specimen rows
df_spec['old_id'].str.slice(start=3).astype(int).tail(10)

74656    207008
74657    207009
74658    207009
74659    207009
74660    207009
74661    207010
74662    207013
74663    207015
74664    207016
74665    207017
Name: old_id, dtype: int32

In [67]:
# rows of df_spec that share an old_id seem to agree on their other fields
# (last ten rows shown)
df_spec.iloc[-10:]

Unnamed: 0,id,fork_length,weight,river_age,notes,sample_id,sex_id,status_id,age_type,sweep_id,life_stage_id,old_id,smart_river_age,smart_river_age_type
74656,2430203,43.0,,0.0,,8001,,10.0,2.0,10861.0,11.0,GD_207008,0.0,2.0
74657,2430204,43.0,,0.0,,8001,,10.0,2.0,10863.0,11.0,GD_207009,0.0,2.0
74658,2430205,43.0,,0.0,,8001,,10.0,2.0,10863.0,11.0,GD_207009,0.0,2.0
74659,2430206,43.0,,0.0,,8001,,10.0,2.0,10863.0,11.0,GD_207009,0.0,2.0
74660,2430207,43.0,,0.0,,8001,,10.0,2.0,10863.0,11.0,GD_207009,0.0,2.0
74661,2430208,78.0,,1.0,,8001,,10.0,2.0,10861.0,1.0,GD_207010,1.0,2.0
74662,2430209,88.0,,1.0,,8001,,10.0,2.0,10861.0,1.0,GD_207013,1.0,2.0
74663,2430210,93.0,,1.0,,8001,,10.0,2.0,10861.0,1.0,GD_207015,1.0,2.0
74664,2430211,95.0,,1.0,,8001,,10.0,2.0,10860.0,1.0,GD_207016,1.0,2.0
74665,2430212,99.0,,1.0,,8001,,10.0,2.0,10860.0,1.0,GD_207017,1.0,2.0


In [80]:
# number of old_id values that appear on more than one df_spec row
# (counts only the 'id' column and sums the boolean mask vectorised; the earlier
# all-column count, sort and Python-level sum() were not needed for this number)
int((df_spec.groupby('old_id')['id'].count() > 1).sum())

9310

In [120]:
# the five old_id values carried by the most df_spec rows
rows_per_old_id = df_spec.groupby('old_id')['id'].count()
rows_per_old_id.sort_values(ascending=False).head(5)

old_id
GD_46893    265
GD_51776    179
GD_47257    164
GD_50012    160
GD_50013    147
Name: id, dtype: int64

In [123]:
# all df_spec rows for one heavily duplicated old_id: row count first, then the rows
most_duplicated_rows = df_spec.loc[df_spec['old_id'] == 'GD_46893']
print(len(most_duplicated_rows))
most_duplicated_rows

265


Unnamed: 0,id,fork_length,weight,river_age,notes,sample_id,sex_id,status_id,age_type,sweep_id,life_stage_id,old_id,smart_river_age,smart_river_age_type
29833,2323031,38.0,,0.0,,5576,,10.0,2.0,5353.0,11.0,GD_46893,0.0,2.0
29834,2323032,38.0,,0.0,,5576,,10.0,2.0,5353.0,11.0,GD_46893,0.0,2.0
29835,2323033,38.0,,0.0,,5576,,10.0,2.0,5353.0,11.0,GD_46893,0.0,2.0
29836,2323034,38.0,,0.0,,5576,,10.0,2.0,5353.0,11.0,GD_46893,0.0,2.0
29837,2323035,38.0,,0.0,,5576,,10.0,2.0,5353.0,11.0,GD_46893,0.0,2.0
29838,2323036,38.0,,0.0,,5576,,10.0,2.0,5353.0,11.0,GD_46893,0.0,2.0
29839,2323037,38.0,,0.0,,5576,,10.0,2.0,5353.0,11.0,GD_46893,0.0,2.0
29840,2323038,38.0,,0.0,,5576,,10.0,2.0,5353.0,11.0,GD_46893,0.0,2.0
29841,2323039,38.0,,0.0,,5576,,10.0,2.0,5353.0,11.0,GD_46893,0.0,2.0
29842,2323040,38.0,,0.0,,5576,,10.0,2.0,5353.0,11.0,GD_46893,0.0,2.0


In [114]:
# if old_id matches, fork_length matches
# Lists the old_id values whose rows disagree on fork_length (empty list = none).
# nunique(dropna=False) counts NaN as one more distinct value, so a group mixing NaN with a
# real measurement is reported; fillna(0) would have hidden a NaN sitting next to a real 0.
fork_length_variants = df_spec.groupby('old_id')['fork_length'].nunique(dropna=False)
list(fork_length_variants.index[fork_length_variants != 1])

[]

In [116]:
# if old_id matches, weight matches
# Lists the old_id values whose rows disagree on weight (empty list = none).
# nunique(dropna=False) counts NaN as one more distinct value, so a group mixing NaN with a
# real measurement is reported; fillna(0) would have hidden a NaN sitting next to a real 0.
weight_variants = df_spec.groupby('old_id')['weight'].nunique(dropna=False)
list(weight_variants.index[weight_variants != 1])

[]

In [118]:
# if the old_id matches, everything else matches
# For each field, print the old_id values whose rows disagree on it (empty list = none).
# nunique(dropna=False) counts NaN as one more distinct value, so a group mixing NaN with a
# real value is reported; the earlier fillna(0) made NaN indistinguishable from a genuine 0
# (e.g. river_age 0.0) and was recomputed over the whole frame for every field.
fields_to_compare = ['fork_length', 'weight', 'river_age', 'notes', 'sample_id', 'sex_id', 'status_id', 'age_type', 'sweep_id', 'life_stage_id', 'smart_river_age', 'smart_river_age_type']
variants_per_old_id = df_spec.groupby('old_id')[fields_to_compare].nunique(dropna=False)
for field in fields_to_compare:
    print(list(variants_per_old_id.index[variants_per_old_id[field] != 1]))

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
