In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Notebook-wide configuration: plotting theme, page width and pandas display options.
sns.set_theme()

# jupyter notebook full-width display
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting: 3 decimals for floats, never truncate columns, show up to 200 rows
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [2]:
# Load the raw length-frequency master file.
# The path is relative to the notebook's working directory and uses forward slashes,
# which resolve on Windows as well as on POSIX systems (a backslash-separated path
# only resolves on Windows).
df = pd.read_csv('provided data/Margaree Gaspereau lgth-freq_Master.csv')

In [3]:
# column dtypes of the frame exactly as read from the CSV (nothing has been modified yet)
df.dtypes

yy          int64
mm          int64
dd          int64
Time      float64
river      object
week        int64
site       object
loc        object
period     object
wt_lbs    float64
wt_kg     float64
lgth        int64
freq        int64
Flbin       int64
dtype: object

In [4]:
# summary statistics for every column (numeric and object), one source column per row
df.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
yy,11440.0,,,,2003.929,9.465,1990.0,1995.0,2003.0,2013.0,2019.0
mm,11440.0,,,,5.427,0.495,5.0,5.0,5.0,6.0,6.0
dd,11440.0,,,,16.41,8.882,1.0,9.0,17.0,24.0,31.0
Time,1457.0,,,,1030.411,531.799,3.0,900.0,1115.0,1425.0,1850.0
river,10222.0,3.0,SW MARGAREE,6240.0,,,,,,,
week,11440.0,,,,4.908,1.909,1.0,4.0,5.0,6.0,9.0
site,11371.0,91.0,26,2402.0,,,,,,,
loc,11358.0,4.0,LOWER,6406.0,,,,,,,
period,11440.0,3.0,AM,6143.0,,,,,,,
wt_lbs,10222.0,,,,102.868,34.969,1.0,81.7,100.0,127.0,204.0


# Datetime

In [5]:
# distinct raw values of the Time column, in order of first appearance
df['Time'].unique()

array([  nan,    3.,    4.,    5., 1305., 1530., 1025., 1550., 1255.,
       1405., 1455., 1310., 1220., 1115., 1640., 1355., 1320., 1400.,
       1225., 1035., 1010., 1545., 1425., 1100., 1450., 1540., 1430.,
       1030., 1345., 1630., 1050.,  850.,  925., 1445.,  940.,  900.,
        840., 1500.,  905., 1713.,  835.,  845., 1000., 1145., 1015.,
       1625., 1745., 1315., 1655., 1020., 1210., 1755., 1215., 1110.,
       1605., 1130., 1235., 1300., 1200., 1850., 1700.])

In [6]:
# Build a single DATETIME column from yy/mm/dd plus the optional Time column.
# Time is a float holding HHMM (e.g. 1305.0 -> 13:05). Values below 10 are bare hours
# (3, 4, 5 in this dataset) and are scaled to HHMM first so all times share one format.
hhmm = df['Time'].mask(df['Time'] < 10, df['Time'] * 100)

# Split HHMM arithmetically rather than string-parsing the floats with a strptime
# format: this does not depend on how pandas renders a float as text, and it needs no
# scratch column on the shared frame.
# all null times should be 0000 (which pandas reads as no time data)
# NOTE: incorporate AM/PM column?
hour = (hhmm // 100).fillna(0).astype(int)
minute = (hhmm % 100).fillna(0).astype(int)

# errors='coerce' turns any impossible date or time (e.g. hour 25, minute 75) into NaT
# instead of raising, so such rows can be found by checking DATETIME for nulls.
df['DATETIME'] = pd.to_datetime(dict(
    year=df['yy'],
    month=df['mm'],
    day=df['dd'],
    hour=hour,
    minute=minute
), errors='coerce')

In [7]:
# fraction of rows whose DATETIME is null (0.0 means every row got a timestamp)
int(df['DATETIME'].isnull().sum()) / len(df)

0.0

In [8]:
# sanity-check frame: for rows that have a raw Time, put it next to the parsed hour/minute
has_time = df['Time'].notnull()

check_datetimes = pd.DataFrame({
    'Time': df.loc[has_time, 'Time'],
    'Hour': df.loc[has_time, 'DATETIME'].dt.hour,
    'Minute': df.loc[has_time, 'DATETIME'].dt.minute,
})

In [9]:
# spot-check ten rows; random_state pins the draw so a fresh Restart & Run All
# shows the same rows every time
check_datetimes.sample(10, random_state=0)

Unnamed: 0,Time,Hour,Minute
6563,1010.0,10,10
1195,5.0,5,0
1076,4.0,4,0
7363,1110.0,11,10
6534,1545.0,15,45
7013,840.0,8,40
7102,1145.0,11,45
7237,1020.0,10,20
7452,1850.0,18,50
6507,1035.0,10,35


In [10]:
# rows where the raw Time differs from the parsed HHMM:
# the only non-matching are the fixed 3, 4, and 5 hours (not formatted consistently)
parsed_hhmm = check_datetimes['Hour'] * 100 + check_datetimes['Minute']
check_datetimes.loc[check_datetimes['Time'].ne(parsed_hhmm)].describe()

Unnamed: 0,Time,Hour,Minute
count,257.0,257.0,257.0
mean,4.156,4.156,0.0
std,0.829,0.829,0.0
min,3.0,3.0,0.0
25%,3.0,3.0,0.0
50%,4.0,4.0,0.0
75%,5.0,5.0,0.0
max,5.0,5.0,0.0


In [11]:
# rows that have a Time but ended up with a null DATETIME (coerced errors) - expect none
df.loc[df['Time'].notnull() & df['DATETIME'].isnull()]

Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME


In [12]:
# number of rows with a year but a null DATETIME (coerced errors) - expect 0
int((df['yy'].notnull() & df['DATETIME'].isnull()).sum())

0

In [13]:
# null counts for (yy, mm, dd): all dates are spoken for, times are not always available
tuple(int(df[part].isnull().sum()) for part in ('yy', 'mm', 'dd'))

(0, 0, 0)

In [14]:
# Numeric summary that also covers the DATETIME column.
# `datetime_is_numeric` exists only in pandas 1.1-1.5. pandas 2.0 removed the argument
# (datetimes are always summarised numerically there), so passing it raises TypeError;
# in that case fall back to the plain call, which gives the equivalent summary.
try:
    numeric_summary = df.describe(datetime_is_numeric=True)
except TypeError:
    numeric_summary = df.describe()
numeric_summary

Unnamed: 0,yy,mm,dd,Time,week,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME
count,11440.0,11440.0,11440.0,1457.0,11440.0,10222.0,10222.0,11440.0,11440.0,11440.0,11440
mean,2003.929,5.427,16.41,1030.411,4.908,102.868,46.659,251.674,17.947,251.487,2004-05-03 11:22:33.991258752
min,1990.0,5.0,1.0,3.0,1.0,1.0,0.5,145.0,0.0,145.0,1990-05-07 00:00:00
25%,1995.0,5.0,9.0,900.0,4.0,81.7,37.1,235.0,3.0,235.0,1995-05-29 00:00:00
50%,2003.0,5.0,17.0,1115.0,5.0,100.0,45.4,250.0,11.0,250.0,2003-05-21 00:00:00
75%,2013.0,6.0,24.0,1425.0,6.0,127.0,57.6,265.0,29.0,265.0,2013-05-21 00:00:00
max,2019.0,6.0,31.0,1850.0,9.0,204.0,92.5,322.0,117.0,320.0,2019-06-28 00:00:00
std,9.465,0.495,8.882,531.799,1.909,34.969,15.862,21.823,18.0,21.67,


In [15]:
# Datetime looks good
# random_state pins the draw so the displayed rows are reproducible across re-runs
df['DATETIME'].sample(10, random_state=0)

5383    2003-06-13 00:00:00
4274    2001-05-16 00:00:00
1628    1993-05-12 00:00:00
10311   1994-05-18 00:00:00
8199    2013-06-20 00:00:00
8802    2014-06-27 00:00:00
5029    2002-06-05 00:00:00
268     1990-05-16 00:00:00
6819    2010-05-25 09:40:00
5946    2007-06-01 00:00:00
Name: DATETIME, dtype: datetime64[ns]

# River

In [16]:
# distinct river labels as entered - how to clean this data?
df['river'].unique()

array([nan, 'MARGAREE', 'SW Margaree', 'SW MARGAREE'], dtype=object)

In [17]:
# (null, not null) counts for river
# ie, there are mostly Margaree river data
int(df['river'].isnull().sum()), int(df['river'].notnull().sum())

(1218, 10222)

In [18]:
# row counts per raw river label - is SW MARGAREE different than MARGAREE?
tuple(
    int((df['river'] == label).sum())
    for label in ('MARGAREE', 'SW Margaree', 'SW MARGAREE')
)

(3381, 601, 6240)

In [19]:
# CONFIRM - it looks like SW is a useful distinction (past the forks)
# fold the mixed-case spelling into the upper-case label; other values are untouched
df['river'] = df['river'].replace('SW Margaree', 'SW MARGAREE')

# Site

In [20]:
# number of rows with no site recorded - most sites are not null
int(df['site'].isnull().sum())

69

In [21]:
# every distinct raw site entry rendered as a string (a missing value becomes 'nan')
sites_list = list(map(str, df['site'].unique()))

In [22]:
# many entries have multiple sites
# sites_list holds strings, so the ordering is lexicographic ('10' sorts before '2')
sorted(sites_list)

['1',
 '1,2',
 '1,8',
 '10',
 '12',
 '12,17',
 '12,26',
 '15',
 '15,17',
 '17',
 '17,12, 5',
 '17,26',
 '17,33',
 '17,5',
 '1A',
 '1B',
 '2',
 '2,5',
 '21',
 '23',
 '25',
 '25,26',
 '26',
 '26,1',
 '27',
 '28',
 '29',
 '29,30',
 '33',
 '34',
 '35',
 '35,37',
 '35,37,38',
 '35,41',
 '35,52',
 '37',
 '37,60',
 '38',
 '38,41',
 '38,52',
 '38,52,60',
 '4',
 '41',
 '41,49',
 '41,52,60',
 '41,60, 33',
 '41,60, 34',
 '41,60, 35',
 '41,60, 36',
 '41,60, 37',
 '41,60, 38',
 '41,60, 39',
 '41,60, 40',
 '41,60, 41',
 '41,60, 42',
 '41,60, 43',
 '47',
 '48',
 '48,38, 37',
 '49',
 '49,33',
 '49,34',
 '49,35',
 '49,36',
 '49,37',
 '49,38',
 '49,39',
 '49,40',
 '49,41',
 '5',
 '5,12',
 '5,17',
 '5,26',
 '5,8',
 '51',
 '52',
 '52,60',
 '56',
 '6',
 '60',
 '60,35',
 '60,52',
 '62',
 '64',
 '67',
 '7',
 '8',
 '9',
 'Eric McFarlane',
 'Jimmy MacFarlane',
 'John Albert Coady',
 'nan']

In [23]:
# create columns for multiple sites
# likely want to have an arbitrary number of sites to be input in dm_apps
# Split on commas first and strip each piece afterwards: only the padding around a
# comma is removed, so free-text site names such as 'Eric McFarlane' keep their
# internal spaces (removing every space up front would produce 'EricMcFarlane').
site_parts = df['site'].str.split(',', expand=True).apply(lambda part: part.str.strip())

# the split columns are assigned positionally to SITE1..SITE3; missing pieces become pd.NA
df[['SITE1', 'SITE2', 'SITE3']] = site_parts.fillna(pd.NA)

In [24]:
# how many rows have a first, second and third site
df.loc[:, ['SITE1', 'SITE2', 'SITE3']].describe()

Unnamed: 0,SITE1,SITE2,SITE3
count,11371,491,90
unique,40,20,13
top,26,52,60
freq,2412,82,50


In [25]:
# column dtypes now that DATETIME and SITE1-SITE3 have been added to the frame
df.dtypes

yy                   int64
mm                   int64
dd                   int64
Time               float64
river               object
week                 int64
site                object
loc                 object
period              object
wt_lbs             float64
wt_kg              float64
lgth                 int64
freq                 int64
Flbin                int64
DATETIME    datetime64[ns]
SITE1               object
SITE2               object
SITE3               object
dtype: object

# Location

In [26]:
# distinct raw values of the loc column (bracket access: `df.loc` is the pandas indexer)
pd.unique(df['loc'])

array(['LOWER', 'UPPER', 'LOWER ', nan, 'LOWE'], dtype=object)

In [27]:
# clean the typos: trailing-space and truncated spellings both map to 'LOWER'
loc_typo_fixes = {'LOWER ': 'LOWER', 'LOWE': 'LOWER'}
df['loc'] = df['loc'].replace(loc_typo_fixes)

In [28]:
# distinct loc values after the typo clean-up
pd.unique(df['loc'])

array(['LOWER', 'UPPER', nan], dtype=object)

# Period

In [29]:
# distinct period codes: AM, PM, and AD
df['period'].unique()

array(['PM', 'AM', 'AD'], dtype=object)

In [30]:
# counts: AM, PM, and AD
# AD data are also included in the MASTER csv, likely not a typo (although rare)
tuple(int((df['period'] == code).sum()) for code in ('AM', 'PM', 'AD'))

(6143, 5250, 47)

# wt_lbs and wt_kg

In [31]:
# distribution of the lbs/kg ratio per row - doesn't look like any math errors
df['wt_lbs'].div(df['wt_kg']).describe()

count   10222.000
mean        2.205
std         0.004
min         2.000
25%         2.203
50%         2.205
75%         2.206
max         2.222
dtype: float64

In [32]:
# non-null wt_lbs, non-null wt_kg, total rows
int(df['wt_lbs'].notnull().sum()), int(df['wt_kg'].notnull().sum()), len(df)

(10222, 10222, 11440)

In [33]:
# summary statistics of the weight in pounds
df.loc[:, 'wt_lbs'].describe()

count   10222.000
mean      102.868
std        34.969
min         1.000
25%        81.700
50%       100.000
75%       127.000
max       204.000
Name: wt_lbs, dtype: float64

# LGTH

In [34]:
# summary statistics of lgth - these look consistent
df['lgth'].describe()

count   11440.000
mean      251.674
std        21.823
min       145.000
25%       235.000
50%       250.000
75%       265.000
max       322.000
Name: lgth, dtype: float64

In [35]:
# number of null lgth values - all data are non-null
int(df['lgth'].isnull().sum())

0

# FREQ

In [36]:
# summary statistics of freq - these look consistent
df['freq'].describe()

count   11440.000
mean       17.947
std        18.000
min         0.000
25%         3.000
50%        11.000
75%        29.000
max       117.000
Name: freq, dtype: float64

In [37]:
# number of null freq values - all data are non-null
int(df['freq'].isnull().sum())

0

In [38]:
# distinct freq values - proper integer data
df['freq'].unique()

array([  1,   2,   7,   3,  13,   4,   5,   6,  14,   8,  11,  10,  12,
         9,  15,  27,  16,  19,  20,  18,  24,  21,  17,  23,  25,  28,
        30,  34,  29,  31,  22,  33,  26,  32,  37,  41,  50,  52,  39,
        35,  44,  46,  43,  90,  70,  86, 110,  54,  40,  49,  45,  71,
        42,  57,  48,  36,  64,  51,   0,  38,  47,  61,  67,  65,  59,
        55,  60,  62,  53,  66,  69,  63,  56,  80,  68,  74,  83,  72,
        77,  82,  73,  81,  76,  58,  87,  78,  88,  84,  93,  98,  95,
        75,  92,  99, 117,  79,  85,  89,  91,  94], dtype=int64)

# Flbin

In [39]:
# per-row difference between lgth and its Flbin - looks like it's just rounded bins
df['lgth'].sub(df['Flbin']).describe()

count   11440.000
mean        0.187
std         0.669
min         0.000
25%         0.000
50%         0.000
75%         0.000
max         3.000
dtype: float64

# Check the final dataset

In [40]:
# preview the first rows of the cleaned frame, including the added DATETIME / SITE columns
df.head()

Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3
0,1990,5,7,,,2,12,LOWER,PM,,,250,1,250,1990-05-07,12,,
1,1990,5,7,,,2,12,LOWER,PM,,,253,1,250,1990-05-07,12,,
2,1990,5,7,,,2,12,LOWER,PM,,,255,2,255,1990-05-07,12,,
3,1990,5,7,,,2,12,LOWER,PM,,,258,2,255,1990-05-07,12,,
4,1990,5,7,,,2,12,LOWER,PM,,,260,7,260,1990-05-07,12,,


In [41]:
# Summary of every column of the cleaned frame, with DATETIME summarised numerically.
# `datetime_is_numeric` exists only in pandas 1.1-1.5. pandas 2.0 removed the argument
# (datetimes are always summarised numerically there), so passing it raises TypeError;
# in that case fall back to the call without the flag.
try:
    final_summary = df.describe(include='all', datetime_is_numeric=True)
except TypeError:
    final_summary = df.describe(include='all')
final_summary

Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3
count,11440.0,11440.0,11440.0,1457.0,10222,11440.0,11371.0,11358,11440,10222.0,10222.0,11440.0,11440.0,11440.0,11440,11371.0,491.0,90.0
unique,,,,,2,,91.0,2,3,,,,,,,40.0,20.0,13.0
top,,,,,SW MARGAREE,,26.0,LOWER,AM,,,,,,,26.0,52.0,60.0
freq,,,,,6841,,2402.0,6488,6143,,,,,,,2412.0,82.0,50.0
mean,2003.929,5.427,16.41,1030.411,,4.908,,,,102.868,46.659,251.674,17.947,251.487,2004-05-03 11:22:33.991258752,,,
min,1990.0,5.0,1.0,3.0,,1.0,,,,1.0,0.5,145.0,0.0,145.0,1990-05-07 00:00:00,,,
25%,1995.0,5.0,9.0,900.0,,4.0,,,,81.7,37.1,235.0,3.0,235.0,1995-05-29 00:00:00,,,
50%,2003.0,5.0,17.0,1115.0,,5.0,,,,100.0,45.4,250.0,11.0,250.0,2003-05-21 00:00:00,,,
75%,2013.0,6.0,24.0,1425.0,,6.0,,,,127.0,57.6,265.0,29.0,265.0,2013-05-21 00:00:00,,,
max,2019.0,6.0,31.0,1850.0,,9.0,,,,204.0,92.5,322.0,117.0,320.0,2019-06-28 00:00:00,,,


In [42]:
# final column dtypes of the cleaned frame, checked right before the optional save
df.dtypes

yy                   int64
mm                   int64
dd                   int64
Time               float64
river               object
week                 int64
site                object
loc                 object
period              object
wt_lbs             float64
wt_kg              float64
lgth                 int64
freq                 int64
Flbin                int64
DATETIME    datetime64[ns]
SITE1               object
SITE2               object
SITE3               object
dtype: object

# Save the Final Dataset

In [43]:
# writing the pickle is opt-in so that a plain Run All never overwrites the file
SAVE_CLEANED_DATASET = False  # change this to save

if SAVE_CLEANED_DATASET:
    df.to_pickle('cleaned_LENGTHFREQ_dataset.pickle')