In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pickle
sns.set_theme()

# jupyter notebook full-width display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# no text wrapping
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>"))

# pandas formatting
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 600)
pd.set_option('display.max_colwidth', 200)

In [2]:
# base dataframes
df_hist = pd.read_pickle('calculations\df_hist.pickle')
df_spec = pd.read_pickle('calculations\df_spec.pickle')
df_arch = pd.read_pickle('calculations\df_arch.pickle')
df_prob = pd.read_pickle('calculations\df_prob.pickle')
df_matching_stats = pd.read_pickle('calculations\df_matching_stats.pickle')

# need to recreate summary with updated calculations
# define df_summary
df_summary = pd.merge(
    df_hist[['sample_id', 'id']].groupby('sample_id').count().rename({'id':'n_hist'}, axis=1),
    df_spec[['sample_id', 'id']].groupby('sample_id').count().rename({'id':'n_spec'}, axis=1),
    on='sample_id',
    how='left'
).fillna(0)
df_summary.n_spec = df_summary.n_spec.astype(int)

# improved bins, plotting and error checking

In [3]:
# bins used by specimen table
# looks like bio data doesn't need to end with a 3 or 8
(df_spec.fork_length % 10).value_counts()

3    38505
8    34192
0      242
9      231
4      228
2      225
5      224
1      207
7      206
6      187
Name: fork_length, dtype: Int64

In [4]:
df_spec.notna().sum()
# note: there are only 6 sex data and they are the import error (should be bio) from the other notebook

id                      74666
fork_length             74447
weight                    429
river_age               74664
notes                   74666
sample_id               74666
sex_id                      6
status_id               74666
age_type                74664
sweep_id                74666
life_stage_id           74666
old_id                  74666
smart_river_age         74664
smart_river_age_type    74664
matching_id             74666
dtype: int64

In [5]:
# check: confirm that values ending in not 3/8 have more weight/sex

# still no sex data, but most of the weight data is here 
# (you would expect exactly 80% if it was 100% because more detailed measurements could include n%5==3)

df_spec[df_spec.fork_length % 5 != 3][['weight', 'sex_id']].notna().sum() / df_spec[['weight', 'sex_id']].notna().sum()

weight   0.841
sex_id   0.000
dtype: float64

In [6]:
# bins go from 23 to 163
df_spec.fork_length.describe()

count   74447.000
mean       57.040
std        23.221
min        23.000
25%        43.000
50%        48.000
75%        68.000
max       163.000
Name: fork_length, dtype: float64

# improved binning and error calculating
using bins as utilised by the specimen table

In [7]:
%%time
# loop through ascending sort first - pick best match out of either asc or desc sort loop method

weight_tolerance = 1
potential_fish_matches = []  # list(sample, spec, hist, hist_total) - these should only trigger if an exact match on sex/len/wt within tolerance, if exists
strong_sample_matches = list()  # a match is found for every fish in df_hist - sample likely contains duplicated spec/bio
bad_sample_matches = set()  # df_hist contains unmatchable fish - some fish are definitely not duplicated spec/bio
last_sample = 0
df = pd.DataFrame()
hist_total, hist_matches = 999, 0 

for i, row in df_hist.sort_values(['sample_id', 'id']).iterrows():

    fish_id, sample_id, fork_length, weight, sex_id = row[['id', 'sample_id', 'fork_length', 'weight', 'sex_id']]
    current_bin = fork_length - fork_length%5, fork_length - fork_length%5 + 5  # these are int bins n%5, could add 0.5 per above note
    
    if last_sample != sample_id:
        df = df_spec[df_spec.sample_id==sample_id]
        # strong matches
        if hist_matches == hist_total:
            strong_sample_matches += [last_sample]
        hist_matches = 0
        hist_total = df_hist[df_hist.sample_id==sample_id].shape[0]
        
    if not df.empty:
        
        results = df[
            ((df.fork_length>=current_bin[0]) & (df.fork_length<current_bin[1])) # check if fork_length is in the same bin
            & (
                ((df.weight>=weight-weight_tolerance) & (df.weight>=weight-weight_tolerance))
                | df.weight.isnull()
            )
            & ((df.sex_id==sex_id) | df.sex_id.isnull())
        ]
        if not results.empty:
            hist_matches += 1
            potential_fish_matches += [[sample_id, fish_id, results.iloc[0].id, hist_total, hist_matches, fork_length, results.iloc[[0]].fork_length.values[0]]]
            df = df.drop(results.iloc[[0]].index[0]) # drop this row so it doesn't get matched again
        else:
            bad_sample_matches.add(sample_id)  # triggers if results is empty (there are no matches)

    else:
        bad_sample_matches.add(sample_id)  # triggers if df is empty

    last_sample = sample_id
    
    
# use potential fish matches to calculate error
error_penalty_per_unmatched_fish = 100  # arbitrary

df_matches_asc = pd.DataFrame(potential_fish_matches, columns=['sample_id', 'hist_id', 'spec_id', 'total_hist', 'cumulative_matches', 'hist_fork_length', 'spec_fork_length'])
df_match_counts = df_matches_asc.groupby('sample_id').max()[['cumulative_matches', 'total_hist']].rename({'total_hist':'total', 'cumulative_matches':'matches'}, axis=1)

df_matches_asc = pd.merge(
    df_matches_asc,
    df_match_counts,
    on='sample_id',
    how='left'
).drop(['cumulative_matches', 'total_hist'], axis=1)

df_matches_asc['fish_sq_error'] = (df_matches_asc['hist_fork_length'] - df_matches_asc['spec_fork_length']) ** 2
df_matches_asc['unmatched_penalty'] = (df_matches_asc['total'] - df_matches_asc['matches']) * error_penalty_per_unmatched_fish
df_matches_asc = df_matches_asc.merge(
    pd.DataFrame(df_matches_asc[['sample_id', 'fish_sq_error', 'unmatched_penalty']].groupby('sample_id').agg({'fish_sq_error':'sum', 'unmatched_penalty':'max'}).sum(axis=1), columns=['sample_SSE']),
    on='sample_id',
    how='left'
).drop(['fish_sq_error', 'unmatched_penalty'], axis=1)

Wall time: 56 s


In [8]:
%%time
# same calc for descending sort - pick best match out of either method

weight_tolerance = 1
potential_fish_matches = []  # list(sample, spec, hist, hist_total) - these should only trigger if an exact match on sex/len/wt within tolerance, if exists
strong_sample_matches = list()  # a match is found for every fish in df_hist - sample likely contains duplicated spec/bio
bad_sample_matches = set()  # df_hist contains unmatchable fish - some fish are definitely not duplicated spec/bio
last_sample = 0
df = pd.DataFrame()
hist_total, hist_matches = 999, 0 

for i, row in df_hist.sort_values(['sample_id', 'id'], ascending=False).iterrows():

    fish_id, sample_id, fork_length, weight, sex_id = row[['id', 'sample_id', 'fork_length', 'weight', 'sex_id']]
    current_bin = fork_length - fork_length%5, fork_length - fork_length%5 + 5  # these are int bins n%5, could add 0.5 per above note
    
    if last_sample != sample_id:
        df = df_spec[df_spec.sample_id==sample_id]
        # strong matches
        if hist_matches == hist_total:
            strong_sample_matches += [last_sample]
        hist_matches = 0
        hist_total = df_hist[df_hist.sample_id==sample_id].shape[0]
        
    if not df.empty:
        
        results = df[
            ((df.fork_length>=current_bin[0]) & (df.fork_length<current_bin[1])) # check if fork_length is in the same bin
            & (
                ((df.weight>=weight-weight_tolerance) & (df.weight>=weight-weight_tolerance))
                | df.weight.isnull()
            )
            & ((df.sex_id==sex_id) | df.sex_id.isnull())
        ]
        if not results.empty:
            hist_matches += 1
            potential_fish_matches += [[sample_id, fish_id, results.iloc[0].id, hist_total, hist_matches, fork_length, results.iloc[[0]].fork_length.values[0]]]
            df = df.drop(results.iloc[[0]].index[0]) # drop this row so it doesn't get matched again
        else:
            bad_sample_matches.add(sample_id)  # triggers if results is empty (there are no matches)

    else:
        bad_sample_matches.add(sample_id)  # triggers if df is empty

    last_sample = sample_id
    
    
# use potential fish matches to calculate error
error_penalty_per_unmatched_fish = 100  # arbitrary

df_matches_desc = pd.DataFrame(potential_fish_matches, columns=['sample_id', 'hist_id', 'spec_id', 'total_hist', 'cumulative_matches', 'hist_fork_length', 'spec_fork_length'])
df_match_counts = df_matches_desc.groupby('sample_id').max()[['cumulative_matches', 'total_hist']].rename({'total_hist':'total', 'cumulative_matches':'matches'}, axis=1)

df_matches_desc = pd.merge(
    df_matches_desc,
    df_match_counts,
    on='sample_id',
    how='left'
).drop(['cumulative_matches', 'total_hist'], axis=1)

df_matches_desc['fish_sq_error'] = (df_matches_desc['hist_fork_length'] - df_matches_desc['spec_fork_length']) ** 2
df_matches_desc['unmatched_penalty'] = (df_matches_desc['total'] - df_matches_desc['matches']) * error_penalty_per_unmatched_fish
df_matches_desc = df_matches_desc.merge(
    pd.DataFrame(df_matches_desc[['sample_id', 'fish_sq_error', 'unmatched_penalty']].groupby('sample_id').agg({'fish_sq_error':'sum', 'unmatched_penalty':'max'}).sum(axis=1), columns=['sample_SSE']),
    on='sample_id',
    how='left'
).drop(['fish_sq_error', 'unmatched_penalty'], axis=1)

Wall time: 55.3 s


In [9]:
# these are different enough that we should combine the results
SSE_comparison = pd.merge(
    df_matches_asc.groupby('sample_id').max()['sample_SSE'].reset_index().rename({'sample_SSE':'SSE_asc'}, axis=1),
    df_matches_desc.groupby('sample_id').max()['sample_SSE'].reset_index().rename({'sample_SSE':'SSE_desc'}, axis=1),
    on='sample_id'
)
SSE_comparison['delta'] = SSE_comparison['SSE_asc'] - SSE_comparison['SSE_desc']
SSE_comparison['delta_scaled'] = (SSE_comparison['delta'] / ((SSE_comparison['SSE_asc'] + SSE_comparison['SSE_desc']) / 2)).abs().fillna(0)
SSE_comparison.sort_values('delta_scaled', ascending=False).head(22)
SSE_comparison.describe(percentiles=[0.95, 0.975,0.995])

Unnamed: 0,sample_id,SSE_asc,SSE_desc,delta,delta_scaled
count,768.0,768.0,768.0,768.0,768.0
mean,6406.895,535.06,527.673,7.387,0.061
std,1345.255,773.377,738.046,67.11,0.2
min,4404.0,0.0,0.0,-614.0,0.0
50%,7169.5,274.0,302.5,4.0,0.016
95%,7862.65,1879.3,1789.7,81.0,0.19
97.5%,7928.825,2988.475,2824.775,122.775,0.769
99.5%,7990.165,4289.61,4099.05,232.31,1.429
max,8001.0,6865.0,6548.0,336.0,1.646


In [10]:
# combine lowest errors by sample number
id_asc = SSE_comparison[SSE_comparison.SSE_asc<=SSE_comparison.SSE_desc].sample_id
id_desc = SSE_comparison[SSE_comparison.SSE_asc>SSE_comparison.SSE_desc].sample_id

df_matches = pd.concat([
    df_matches_asc[df_matches_asc.sample_id.isin(id_asc)],
    df_matches_desc[df_matches_desc.sample_id.isin(id_desc)]
]).sort_values(['sample_id', 'hist_id']).reset_index(drop=True)

In [11]:
# summary of findings
df_matches['MSE'] = df_matches['sample_SSE'] / df_matches['total']
df_matches['matched_proportion'] = df_matches['matches'] / df_matches['total']
df_match_error_summary = df_matches.groupby('sample_id').max().sort_values('sample_SSE', ascending=False)[['matched_proportion', 'sample_SSE', 'MSE']]

# merge into summary
df_summary = df_summary.merge(df_match_error_summary, on='sample_id', how='left')
df_summary.loc[df_summary.matched_proportion.isnull() | df_summary.sample_SSE.isnull() | df_summary.MSE.isnull(), ['matched_proportion', 'sample_SSE', 'MSE']] = 0, 9999, 9999

df_summary.describe(percentiles=[0.01, 0.1, 0.9, 0.99]).drop('count')

Unnamed: 0,n_hist,n_spec,matched_proportion,sample_SSE,MSE
mean,34.709,94.156,0.841,816.43,330.385
std,39.659,109.54,0.217,1810.935,1745.597
min,1.0,0.0,0.0,0.0,0.0
1%,1.0,0.0,0.0,1.0,0.5
10%,5.0,8.0,0.611,19.0,2.388
50%,25.0,61.0,0.905,296.0,12.014
90%,78.0,209.8,1.0,1508.4,41.39
99%,184.4,491.72,1.0,9999.0,9999.0
max,346.0,1016.0,1.0,9999.0,9999.0


# Save Summary File

In [12]:
df_summary.to_pickle('calculations\df_summary.pickle')