In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

sns.set_theme()

# jupyter notebook full-width display
from IPython.core.display import display, HTML
display(HTML(""))

# pandas formatting
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [2]:
df_LF = pd.read_csv(r'.\provided data\Margaree Gaspereau lgth-freq_Master.csv')

In [3]:
df_LF.dtypes

yy          int64
mm          int64
dd          int64
Time      float64
river      object
week        int64
site       object
loc        object
period     object
wt_lbs    float64
wt_kg     float64
lgth        int64
freq        int64
Flbin       int64
dtype: object

In [4]:
df_LF.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
yy,11440.0,,,,2003.929,9.465,1990.0,1995.0,2003.0,2013.0,2019.0
mm,11440.0,,,,5.427,0.495,5.0,5.0,5.0,6.0,6.0
dd,11440.0,,,,16.41,8.882,1.0,9.0,17.0,24.0,31.0
Time,1457.0,,,,1030.411,531.799,3.0,900.0,1115.0,1425.0,1850.0
river,10222.0,3.0,SW MARGAREE,6240.0,,,,,,,
week,11440.0,,,,4.908,1.909,1.0,4.0,5.0,6.0,9.0
site,11371.0,91.0,26,2402.0,,,,,,,
loc,11358.0,4.0,LOWER,6406.0,,,,,,,
period,11440.0,3.0,AM,6143.0,,,,,,,
wt_lbs,10222.0,,,,102.868,34.969,1.0,81.7,100.0,127.0,204.0


# Datetime

In [5]:
df_LF.Time.unique()

array([  nan,    3.,    4.,    5., 1305., 1530., 1025., 1550., 1255.,
       1405., 1455., 1310., 1220., 1115., 1640., 1355., 1320., 1400.,
       1225., 1035., 1010., 1545., 1425., 1100., 1450., 1540., 1430.,
       1030., 1345., 1630., 1050.,  850.,  925., 1445.,  940.,  900.,
        840., 1500.,  905., 1713.,  835.,  845., 1000., 1145., 1015.,
       1625., 1745., 1315., 1655., 1020., 1210., 1755., 1215., 1110.,
       1605., 1130., 1235., 1300., 1200., 1850., 1700.])

In [14]:
# convert all times to same format, set missing times to 0000 (date only), then to datetime format
df_LF['CleanTime'] = df_LF['Time']
df_LF.loc[df_LF['CleanTime'] < 10, 'CleanTime'] = df_LF.loc[df_LF['CleanTime'] < 10, 'CleanTime'] * 100
df_LF['CleanTime'] = pd.to_datetime(df_LF.CleanTime, format='%H%M')

# all null times should be 0000 (which pandas reads as no time data)
# NOTE: incorporate AM/PM column?
df_LF.loc[df_LF['CleanTime'].isnull(), 'CleanTime'] = pd.to_datetime(0)  

df_LF['DATETIME'] = pd.to_datetime(dict(
    year=df_LF['yy'], 
    month=df_LF['mm'], 
    day=df_LF['dd'], 
    hour=df_LF['CleanTime'].dt.hour, 
    minute=df_LF['CleanTime'].dt.minute
), errors='coerce')

df_LF = df_LF.drop('CleanTime', axis=1)

In [15]:
# percentage of null dates
sum(df_LF.DATETIME.isnull())/df_LF.shape[0]

0.0

In [16]:
# check to make sure hours are good
check_datetimes = pd.concat([
    df_LF['Time'], 
    df_LF['DATETIME'].dt.hour,
    df_LF['DATETIME'].dt.minute
], axis=1)[~df_LF['Time'].isnull()]

check_datetimes.columns = ['Time', 'Hour', 'Minute']

In [22]:
check_datetimes.sample(10)

Unnamed: 0,Time,Hour,Minute
6362,1455.0,14,55
7004,835.0,8,35
1009,3.0,3,0
6961,850.0,8,50
7392,1130.0,11,30
6296,1305.0,13,5
6855,840.0,8,40
6405,1220.0,12,20
1111,5.0,5,0
6483,1400.0,14,0


In [23]:
# the only non-matching are the fixed 3, 4, and 5 hours (not formatted consistently)
check_datetimes[check_datetimes.Time != check_datetimes.Hour * 100 + check_datetimes.Minute].describe()

Unnamed: 0,Time,Hour,Minute
count,257.0,257.0,257.0
mean,4.156,4.156,0.0
std,0.829,0.829,0.0
min,3.0,3.0,0.0
25%,3.0,3.0,0.0
50%,4.0,4.0,0.0
75%,5.0,5.0,0.0
max,5.0,5.0,0.0


In [24]:
# not null Time, but null DATETIME (coerced errors) = 0
df_LF[(~df_LF.Time.isnull()) & (df_LF.DATETIME.isnull())]

Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME


In [25]:
# not null yy/mm/dd, but null DATETIME (coerced errors) = 0
df_LF[(~df_LF.yy.isnull()) & (df_LF.DATETIME.isnull())].shape

(0, 15)

In [26]:
# all dates are spoken for, times are not always available
df_LF[df_LF.yy.isnull()].shape[0], df_LF[df_LF.mm.isnull()].shape[0], df_LF[df_LF.dd.isnull()].shape[0]

(0, 0, 0)

In [28]:
df_LF.describe(datetime_is_numeric=True)

Unnamed: 0,yy,mm,dd,Time,week,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME
count,11440.0,11440.0,11440.0,1457.0,11440.0,10222.0,10222.0,11440.0,11440.0,11440.0,11440
mean,2003.929,5.427,16.41,1030.411,4.908,102.868,46.659,251.674,17.947,251.487,2004-05-03 11:22:33.991258752
min,1990.0,5.0,1.0,3.0,1.0,1.0,0.5,145.0,0.0,145.0,1990-05-07 00:00:00
25%,1995.0,5.0,9.0,900.0,4.0,81.7,37.1,235.0,3.0,235.0,1995-05-29 00:00:00
50%,2003.0,5.0,17.0,1115.0,5.0,100.0,45.4,250.0,11.0,250.0,2003-05-21 00:00:00
75%,2013.0,6.0,24.0,1425.0,6.0,127.0,57.6,265.0,29.0,265.0,2013-05-21 00:00:00
max,2019.0,6.0,31.0,1850.0,9.0,204.0,92.5,322.0,117.0,320.0,2019-06-28 00:00:00
std,9.465,0.495,8.882,531.799,1.909,34.969,15.862,21.823,18.0,21.67,


In [55]:
# Datetime looks good
df_LF.DATETIME.sample(10)

462     1990-05-24 00:00:00
6405    2009-05-26 12:20:00
780     1990-06-03 00:00:00
4111    2000-06-01 00:00:00
4965    2002-05-29 00:00:00
11363   2019-06-25 00:00:00
1788    1993-05-29 00:00:00
4236    2000-06-10 00:00:00
5426    2003-06-21 00:00:00
8503    2014-05-28 00:00:00
Name: DATETIME, dtype: datetime64[ns]

# River

In [57]:
# how to clean this data?
df_LF.river.unique()

array([nan, 'MARGAREE', 'SW Margaree', 'SW MARGAREE'], dtype=object)

In [61]:
# null, not null
# ie, there are mostly Margaree river data
sum(df_LF.river.isnull()), sum(~df_LF.river.isnull())

(1218, 10222)

In [62]:
# is SW MARGAREE different than MARGAREE?
sum(df_LF.river == 'MARGAREE'), sum(df_LF.river == 'SW Margaree'), sum(df_LF.river == 'SW MARGAREE')

(3381, 601, 6240)

# Site

In [76]:
sites_list = [str(i) for i in list(df_LF.site.unique())]

In [79]:
# most sites are not null
sum(df_LF.site.isnull())

69

In [77]:
# many entries have multiple sites
sorted(sites_list)

['1',
 '1,2',
 '1,8',
 '10',
 '12',
 '12,17',
 '12,26',
 '15',
 '15,17',
 '17',
 '17,12, 5',
 '17,26',
 '17,33',
 '17,5',
 '1A',
 '1B',
 '2',
 '2,5',
 '21',
 '23',
 '25',
 '25,26',
 '26',
 '26,1',
 '27',
 '28',
 '29',
 '29,30',
 '33',
 '34',
 '35',
 '35,37',
 '35,37,38',
 '35,41',
 '35,52',
 '37',
 '37,60',
 '38',
 '38,41',
 '38,52',
 '38,52,60',
 '4',
 '41',
 '41,49',
 '41,52,60',
 '41,60, 33',
 '41,60, 34',
 '41,60, 35',
 '41,60, 36',
 '41,60, 37',
 '41,60, 38',
 '41,60, 39',
 '41,60, 40',
 '41,60, 41',
 '41,60, 42',
 '41,60, 43',
 '47',
 '48',
 '48,38, 37',
 '49',
 '49,33',
 '49,34',
 '49,35',
 '49,36',
 '49,37',
 '49,38',
 '49,39',
 '49,40',
 '49,41',
 '5',
 '5,12',
 '5,17',
 '5,26',
 '5,8',
 '51',
 '52',
 '52,60',
 '56',
 '6',
 '60',
 '60,35',
 '60,52',
 '62',
 '64',
 '67',
 '7',
 '8',
 '9',
 'Eric McFarlane',
 'Jimmy MacFarlane',
 'John Albert Coady',
 'nan']

In [207]:
# create columns for multiple sites
# likely want to have a variable number of sites to be input in django
df_LF[['SITE1', 'SITE2', 'SITE3']] = df_LF.site.str.split(',', expand=True).fillna(pd.NA)

In [217]:
# how many have site data
df_LF[['SITE1', 'SITE2', 'SITE3']].describe()

Unnamed: 0,SITE1,SITE2,SITE3
count,11371,491,90
unique,40,20,14
top,26,52,60
freq,2412,82,50


# Location

In [222]:
df_LF['loc'].unique()

array(['LOWER', 'UPPER', 'LOWER ', nan, 'LOWE'], dtype=object)

In [223]:
# clean the typos
df_LF.loc[df_LF['loc'] == 'LOWER ', 'loc'] = 'LOWER'
df_LF.loc[df_LF['loc'] == 'LOWE', 'loc'] = 'LOWER'

In [224]:
df_LF['loc'].unique()

array(['LOWER', 'UPPER', nan], dtype=object)

# Period

In [229]:
# AM, PM, and AD
df_LF.period.unique()

array(['PM', 'AM', 'AD'], dtype=object)