In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Apply seaborn's default theme to every matplotlib figure in this notebook.
sns.set_theme()

# jupyter notebook full-width display
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting: 3-decimal floats, show every column, show up to 200 rows
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [3]:
# Explicit dtypes for the columns that should not be left to pandas' inference.
# The nullable 'Int32' dtype lets the date parts stay integers even where
# values are missing.
dtypes = {
    'code': 'object',
    'no_nets': 'object',
    'YEAR': 'Int32',
    'MM': 'Int32',
    'DD': 'Int32',
    'Week': 'Int32',
    # 'hours_fished' is deliberately not cast to 'Int32': some entries say 'maximum '
}
# Forward slashes keep this relative path portable across Windows, macOS and Linux.
df_SD = pd.read_csv('./provided data/Margaree Gaspereau logbooks_Master.csv', dtype=dtypes)

In [4]:
# Column dtypes right after loading, to confirm the explicit dtype mapping was applied.
df_SD.dtypes

DIST               int64
RIVER             object
NAME              object
code              object
GEAR               int64
SITE_NO           object
no_nets           object
YEAR               Int32
MM                 Int32
DD                 Int32
Week               Int32
catch_lbs        float64
catch_kg         float64
hours_fished      object
zone              object
last_name         object
comments          object
bycatch_sbass    float64
bycatch_shad     float64
bycatch_other     object
dtype: object

In [5]:
# Summary statistics for every column (numeric and non-numeric), one row per column.
df_SD.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
DIST,15213.0,,,,2.0,0.0,2.0,2.0,2.0,2.0,2.0
RIVER,15213.0,1.0,SWMARG,15213.0,,,,,,,
NAME,15213.0,76.0,Martin E Cameron,1209.0,,,,,,,
code,0.0,0.0,,,,,,,,,
GEAR,15213.0,,,,81.0,0.0,81.0,81.0,81.0,81.0,81.0
SITE_NO,15213.0,62.0,12,1209.0,,,,,,,
no_nets,9623.0,2.0,1,9574.0,,,,,,,
YEAR,15213.0,,,,1999.845,10.756,1983.0,1990.0,2000.0,2009.0,2019.0
MM,15194.0,,,,5.371,0.493,4.0,5.0,5.0,6.0,6.0
DD,15182.0,,,,16.344,8.739,1.0,9.0,17.0,24.0,31.0


# Dist

In [6]:
# DIST is constant: the only value present is 2
dist_col = df_SD['DIST']
(dist_col.unique(), dist_col.nunique())

(array([2], dtype=int64), 1)

# River

In [7]:
# RIVER is constant: every record is 'SWMARG'
df_SD['RIVER'].unique()

array(['SWMARG'], dtype=object)

# Name

In [8]:
# Spelling variants (e.g. 'Donald MacLeod' / 'Donald Macleod') suggest this list
# could be cleaned, but more info on the correct name spellings is needed first.
sorted(df_SD['NAME'].unique())

['Alexander Gillis',
 'Alexander MacDonald',
 'Allan B Gillis',
 'Anthony Cameron',
 'Brian Doyle',
 'Brian MacFarlane',
 'Bruce MacLellan',
 'Bruce McLellan',
 'Catherine MacFarlane',
 'Catherine MacLeod',
 'Charles McDaniel',
 'Chris MacLean',
 'Colin Gillis',
 'Daniel Stewart',
 'Darlene Cameron',
 'David MacKinnon',
 'Donald D Gillis',
 'Donald J Gillis',
 'Donald M Campbell',
 'Donald MacEachern',
 'Donald MacLeod',
 'Donald Macleod',
 'Donelda M Gillis',
 'Eleanor MacLellan',
 'Eleanor McDaniel',
 'Elizabeth MacKinnon',
 'Elizabeth/Vincent MacKinnon',
 'Finley MacDonald',
 'Finley Stewart',
 'Florence Gillis',
 'Fred Ingram',
 'Gerard MacFarlane',
 'Gerard V Chiasson',
 'Harold MacFarlane',
 'Hugh J Gillis',
 'Hughie MacDonnell',
 'Irene MacIsaac',
 'Jack MacLellan',
 'James A Hirtle',
 'James MacFarlane',
 'Jim Coady',
 'Jim MacFarlane',
 'Joan Ingram',
 'John A Chisholm',
 'John A Coady',
 'John Coady',
 'John H Gillis',
 'John MacLellan',
 'John Neil Gillis',
 'John R Gillis',

# code

In [9]:
# the code column is entirely empty (NaN only)
df_SD['code'].unique()

array([nan], dtype=object)

# GEAR

In [10]:
# only 1 value: 81
df_SD.GEAR.unique()

array([81], dtype=int64)

# SITE_NO

In [11]:
# only one of these entries ('1A,8') needs to be split between columns
sorted(df_SD['SITE_NO'].unique())

['1',
 '11',
 '12',
 '15',
 '17',
 '18',
 '19',
 '1A',
 '1A,8',
 '1B',
 '2',
 '20',
 '21',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '37',
 '38',
 '39',
 '4',
 '40',
 '41',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '5',
 '50',
 '51',
 '52',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '6',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '7',
 '8',
 '9']

In [12]:
# Split comma-separated SITE_NO entries into one column per site (SITE1, SITE2, ...).
# The number of columns follows the data, so any number of sites per entry is
# supported (dm_apps will likely need an arbitrary number of sites as input).
site_parts = df_SD['SITE_NO'].str.split(',', expand=True)
# strip stray whitespace so an entry like '1A, 8' yields '8' rather than ' 8'
site_parts = site_parts.apply(lambda part: part.str.strip())
site_parts.columns = [f'SITE{i}' for i in range(1, site_parts.shape[1] + 1)]
# rows with fewer sites than the maximum get pd.NA in the unused columns
df_SD[list(site_parts.columns)] = site_parts.fillna(pd.NA)

In [13]:
# Summary of the split site columns.
df_SD.loc[:, ['SITE1', 'SITE2']].describe()

Unnamed: 0,SITE1,SITE2
count,15213,23
unique,61,1
top,12,8
freq,1209,23


# no_nets

In [14]:
# distinct values recorded for the number of nets
df_SD['no_nets'].unique()

array([nan, '1', '2'], dtype=object)

In [15]:
# no_nets was read with the 'object' dtype requested at load time
df_SD['no_nets'].dtype

dtype('O')

# Datetime

In [16]:
# summary statistics for the three date-part columns
date_parts = ['YEAR', 'MM', 'DD']
df_SD[date_parts].describe()

Unnamed: 0,YEAR,MM,DD
count,15213.0,15194.0,15182.0
mean,1999.845,5.371,16.344
std,10.756,0.493,8.739
min,1983.0,4.0,1.0
25%,1990.0,5.0,9.0
50%,2000.0,5.0,17.0
75%,2009.0,6.0,24.0
max,2019.0,6.0,31.0


In [17]:
# number of rows missing the year, month and day, in that order
tuple(int(df_SD[part].isnull().sum()) for part in ('YEAR', 'MM', 'DD'))

(0, 19, 31)

In [18]:
# Build 'YEAR-MM-DD' strings from the date parts and parse them into timestamps.
# errors='coerce' makes any string that cannot be parsed (e.g. a row with a
# missing month or day) become NaT instead of raising.
year_str = df_SD['YEAR'].astype(str)
month_str = df_SD['MM'].astype(str)
day_str = df_SD['DD'].astype(str)
df_SD['DATETIME'] = pd.to_datetime(year_str + '-' + month_str + '-' + day_str, errors='coerce')

In [19]:
# entries with a missing month or day cannot get an exact datetime (they are NaT)
int(df_SD['DATETIME'].isnull().sum())

31

# catch lb kg

In [20]:
# lbs/kg ratio as a consistency check between the two catch columns:
# looks mostly correct, except for missing values and potential rounding errors
df_SD['TEMP'] = df_SD['catch_lbs'].div(df_SD['catch_kg'])

In [21]:
# OK: the lowest ratios are rounding errors
ratio_cols = ['catch_lbs', 'catch_kg', 'TEMP']
df_SD[ratio_cols].sort_values('TEMP').head(50)

Unnamed: 0,catch_lbs,catch_kg,TEMP
11756,1.0,0.5,2.0
10813,1.0,0.5,2.0
6077,1.0,0.5,2.0
13284,1.0,0.5,2.0
756,1.0,0.5,2.0
757,1.0,0.5,2.0
12560,1.0,0.5,2.0
758,1.0,0.5,2.0
8270,1.0,0.5,2.0
6068,1.0,0.5,2.0


In [22]:
# OK: the ratio is NA only where the catch is zero or missing
ratio_missing = df_SD['TEMP'].isnull()
df_SD.loc[ratio_missing, ['catch_lbs', 'catch_kg', 'TEMP']].describe()

Unnamed: 0,catch_lbs,catch_kg,TEMP
count,2111.0,2123.0,0.0
mean,0.0,0.0,
std,0.0,0.0,
min,0.0,0.0,
25%,0.0,0.0,
50%,0.0,0.0,
75%,0.0,0.0,
max,0.0,0.0,


In [23]:
# OK: a TINY lb catch whose kg value is recorded as 0.0, so the ratio is inf
ratio = df_SD['TEMP']
df_SD.loc[ratio.notnull() & (ratio > 2.3), ['catch_lbs', 'catch_kg', 'TEMP']]

Unnamed: 0,catch_lbs,catch_kg,TEMP
13979,0.11,0.0,inf


In [24]:
# done with TEMP, drop the column
# errors='ignore' keeps this cell safe to re-run once TEMP is already gone
df_SD = df_SD.drop(columns='TEMP', errors='ignore')

# hours_fished

In [25]:
# cast to str before sorting so numeric text, 'maximum ' and NaN can be listed together
sorted(map(str, df_SD['hours_fished'].unique()))

['0',
 '0.5',
 '1',
 '1.5',
 '1.8',
 '10',
 '10.5',
 '11',
 '11.5',
 '12',
 '12.5',
 '13',
 '13.5',
 '14',
 '14.5',
 '15',
 '15.5',
 '16',
 '16.5',
 '17',
 '18',
 '2',
 '2.5',
 '3',
 '3.5',
 '4',
 '4.5',
 '5',
 '5.5',
 '6',
 '6.5',
 '7',
 '7.5',
 '8',
 '8.5',
 '9',
 '9.5',
 'maximum ',
 'nan']

In [26]:
# 28 entries have a value of 'maximum ' (with a trailing space)
# is this a number? 18? 24?
# leave for now
int((df_SD['hours_fished'] == 'maximum ').sum())

28

In [27]:
# number of entries with no hours fished recorded
int(df_SD['hours_fished'].isnull().sum())

1679

# zone

In [28]:
# distinct zone labels before cleaning
df_SD['zone'].unique()

array(['lower', 'upper', nan, 'upper '], dtype=object)

In [29]:
# Normalise stray whitespace in the zone labels (e.g. 'upper ' -> 'upper').
# Stripping every value handles any padded variant rather than one hard-coded
# spelling; missing values stay missing.
df_SD['zone'] = df_SD['zone'].str.strip()

In [30]:
# distinct zone labels after cleaning
df_SD['zone'].unique()

array(['lower', 'upper', nan], dtype=object)

# last_name

In [31]:
# Mixed casing and spelling (e.g. 'MacKinnon', 'MACKINNON', 'Mackinnon') could be
# cleaned slightly, but confirmed spellings are needed to do a perfect job.
# leave for now
sorted(map(str, df_SD['last_name'].unique()))

['CHISHOLM',
 'Cameron',
 'Campbell',
 'Chiasson',
 'Chisholm',
 'Coady',
 'Doyle',
 'Fraser',
 'Gillis',
 'Hirtle',
 'Ingram',
 'MACKINNON',
 'MACLELLAN',
 'MacDonald',
 'MacDonnell',
 'MacEachern',
 'MacFarlane',
 'MacIsaac',
 'MacKinnon',
 'MacLean',
 'MacLellan',
 'MacLeod',
 'MacNeil',
 'Mackinnon',
 'Macleod',
 'McDaniel',
 'McLellan',
 'Peters',
 'STEWART',
 'Stewart',
 'nan']

# comments

In [32]:
# distinct free-text comments
df_SD['comments'].unique()

array([nan, 'striped bass were 23 inches',
       'river still high, fishing inside wing',
       'catch a salmon no marking 14 inch long', 'salted 4800 lbs',
       'salted 5000 lbs', 'outside wing in, river high',
       'salted 8000 lbs', '5 gaspereau', '26 gaspereau', 'did not fish',
       'Rainbow trout released', 'salmon', 'trout', 'salt', 'bait',
       'did not fish, returned logbook', 'wet snow', '6 fish',
       'heavy rain, flood', '3 fish', '10 fish', '1 fish', '8 fish',
       'heavy rain', 'did not fish on the 22nd. Water too high.'],
      dtype=object)

# bycatch
* TODO: there may be a better way to structure the bycatch columns (left as-is for now)

In [33]:
# distinct values in the bycatch_sbass column
df_SD['bycatch_sbass'].unique()

array([nan,  3.,  1.,  2.])

In [34]:
# distinct values in the bycatch_shad column
df_SD['bycatch_shad'].unique()

array([nan,  1.])

In [35]:
# distinct free-text entries in the bycatch_other column
df_SD['bycatch_other'].unique()

array([nan, 'salmon', '1 perch, 1 speckled trou, 1 30 inch salmon',
       '1 lamprey eel (?)', '1', '2 trout', '1 trout', '5 lampreys',
       '1 baby lamprey', '4 trout, 3 "lamprey eels" (?)', '1 perch',
       '4 suckers', 'lamprey', '1 sucker'], dtype=object)

# check the final dataset

In [36]:
# Column dtypes after cleaning, including the columns added in this notebook.
df_SD.dtypes

DIST                      int64
RIVER                    object
NAME                     object
code                     object
GEAR                      int64
SITE_NO                  object
no_nets                  object
YEAR                      Int32
MM                        Int32
DD                        Int32
Week                      Int32
catch_lbs               float64
catch_kg                float64
hours_fished             object
zone                     object
last_name                object
comments                 object
bycatch_sbass           float64
bycatch_shad            float64
bycatch_other            object
SITE1                    object
SITE2                    object
DATETIME         datetime64[ns]
dtype: object

In [37]:
# Summarise every column of the cleaned frame, one row per column.
# `datetime_is_numeric` only exists in pandas 1.1-1.5. pandas 2.0 removed it
# (datetime columns are always summarised numerically there) and raises
# TypeError when it is passed, so fall back to the plain call on newer versions.
try:
    final_summary = df_SD.describe(include='all', datetime_is_numeric=True)
except TypeError:
    final_summary = df_SD.describe(include='all')
final_summary.T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
DIST,15213.0,,,,2.000,2.000,2.000,2.000,2.000,2.000,0.0
RIVER,15213.0,1.0,SWMARG,15213.0,,,,,,,
NAME,15213.0,76.0,Martin E Cameron,1209.0,,,,,,,
code,0.0,0.0,,,,,,,,,
GEAR,15213.0,,,,81.000,81.000,81.000,81.000,81.000,81.000,0.0
SITE_NO,15213.0,62.0,12,1209.0,,,,,,,
no_nets,9623.0,2.0,1,9574.0,,,,,,,
YEAR,15213.0,,,,1999.845,1983.000,1990.000,2000.000,2009.000,2019.000,10.756
MM,15194.0,,,,5.371,4.000,5.000,5.000,6.000,6.000,0.493
DD,15182.0,,,,16.344,1.000,9.000,17.000,24.000,31.000,8.739


# Save the Final Dataset

In [38]:
# Saving is opt-in so that a routine Restart & Run All never overwrites an
# existing pickle by accident.
SAVE_DATASET = False  # change this to True to save
if SAVE_DATASET:
    df_SD.to_pickle('cleaned_LOGBOOK_dataset.pickle')