# import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Use seaborn's default theme for every matplotlib figure in the notebook.
sns.set_theme()

# jupyter notebook full-width display
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated (since IPython 7.14).
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting: 3 decimals for floats, show every column, up to 200 rows
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

# import and clean data

In [2]:
# dtypes requested from pandas when reading the spreadsheet.
# Columns that contain stray text ('NA', '.', '3?', free-text comments) are read as
# 'object' here; see the per-column notes for how they are meant to be converted.
dtypes = {
    'YEAR': 'Int32',
    'MM': 'Int32',
    'DD': 'Int32',
    'WEEK': 'Int32',
    'SITE': 'object',
    'PERIOD': 'object',
    'CONDITION': 'object',
    'FISH_NO': 'Int32',
    'FL_WET.x': 'object',  # some 'NA', convert to numeric, coerce error to make 'NA' actually null
    'FL_FROZEN.x': 'object',  # some 'NA', convert to numeric, coerce error to make 'NA' actually null
    'FL_STD.x': 'float64',
    'WEIGHT': 'float64',
    'SPECIES': 'object',
    'SEX': 'object',
    'MATURITY.x': 'object',  # two entries are '3?' (and one is 44), convert to numeric, coerce errors
    'GONAD_WEIGHT': 'object',  # some '.' / 'NA', convert to numeric, coerce error to make 'NA' actually null
    'Ager.1': 'object',
    'AGE.1': 'object',  # convert to numeric and separate text comments
    'FSP.1': 'object',  # convert to numeric and separate text comments
    'Comments.1': 'object',
    'Ager.2': 'object',
    'AGE.2': 'object',  # convert to numeric and separate text comments
    'FSP.2': 'object',  # convert to numeric and separate text comments
    'Comments.2': 'object',
    'Ager.3': 'object',
    'AGE.3': 'object',  # convert to numeric and separate text comments
    'FSP.3': 'object',  # convert to numeric and separate text comments
    'Comments.3': 'object',
    # The loaded column is named 'Envelop.Comments' (with a dot); the previous key
    # 'Envelop Comments' matched no column, so its dtype request was never applied.
    'Envelop.Comments': 'object'
}

df_FD = pd.read_excel('Gaspereau Ages_Combined.xlsx', dtype=dtypes)

In [3]:
# Preserve the raw AGE / FSP entries of every reader (numbers and text alike) in
# *_notes_* columns before the originals are coerced to numeric.
for reader in (1, 2, 3):
    for field in ('AGE', 'FSP'):
        df_FD[f'{field}_notes_{reader}'] = df_FD[f'{field}.{reader}']

# Coerce to numeric; anything that cannot be parsed becomes null.
# FL_WET.x: one measurement is a float (258.8), all others are ints, so the values
# must be rounded before they can be stored as nullable integers.
fl_wet_numeric = pd.to_numeric(df_FD['FL_WET.x'], errors='coerce')
df_FD['FL_WET.x'] = np.round(fl_wet_numeric).astype('Int32')

for column in ('FL_FROZEN.x', 'MATURITY.x'):
    df_FD[column] = pd.to_numeric(df_FD[column], errors='coerce').astype('Int32')

df_FD['GONAD_WEIGHT'] = pd.to_numeric(df_FD['GONAD_WEIGHT'], errors='coerce')  # stays float

for reader in (1, 2, 3):
    for field in ('AGE', 'FSP'):
        column = f'{field}.{reader}'
        df_FD[column] = pd.to_numeric(df_FD[column], errors='coerce').astype('Int32')

# Replace the '.' in column names so the columns are easier to use within pandas.
rename_FD = {
    'FL_WET.x': 'FL_WET',
    'FL_FROZEN.x': 'FL_FROZEN',
    'FL_STD.x': 'FL_STD',
    'MATURITY.x': 'MATURITY',
}
rename_FD.update({
    f'{field}.{reader}': f'{field}_{reader}'
    for reader in (1, 2, 3)
    for field in ('Ager', 'AGE', 'FSP', 'Comments')
})

# Drop the spreadsheet's unnamed first column and apply the new names.
df_FD = df_FD.drop(columns='Unnamed: 0').rename(columns=rename_FD)

In [4]:
# Build a DATETIME column by joining YEAR, MM and DD into 'YEAR-MM-DD' strings
# and letting pandas parse them.
df_FD['DATETIME'] = pd.to_datetime(
    df_FD['YEAR'].astype(str).str.cat(
        [df_FD['MM'].astype(str), df_FD['DD'].astype(str)],
        sep='-',
    )
)

# Reorder the columns so DATETIME comes first and the rest keep their order.
df_FD = df_FD[['DATETIME', *(name for name in df_FD.columns if name != 'DATETIME')]]

# Descriptive Statistics

In [7]:
# Inspect the dtype pandas holds for every column of df_FD.
df_FD.dtypes

DATETIME            datetime64[ns]
YEAR                         Int32
MM                           Int32
DD                           Int32
WEEK                         Int32
SITE                        object
PERIOD                      object
CONDITION                   object
FISH_NO                      Int32
FL_WET                       Int32
FL_FROZEN                    Int32
FL_STD                     float64
WEIGHT                     float64
SPECIES                     object
SEX                         object
MATURITY                     Int32
GONAD_WEIGHT               float64
Ager_1                      object
AGE_1                        Int32
FSP_1                        Int32
Comments_1                  object
Ager_2                      object
AGE_2                        Int32
FSP_2                        Int32
Comments_2                  object
Ager_3                      object
AGE_3                        Int32
FSP_3                        Int32
Comments_3          

In [26]:
# Most dtypes look good. FL_STD could be an integer column (round, then convert),
# but it is left as float for now.
# The values with 4 decimal places look like conversion artifacts: that much
# precision is unrealistic for a length measured in mm.
df_FD['FL_STD'].unique()

array([288.    , 251.    , 247.    , 287.    , 264.    , 270.    ,
       268.    , 255.    , 274.    , 280.    , 277.    , 279.    ,
       253.    , 299.    , 293.    , 285.    , 269.    , 282.    ,
       259.    , 265.    , 275.    , 309.    , 271.    , 276.    ,
       290.    , 278.    , 305.    , 248.    , 252.    , 272.    ,
       308.    , 258.    , 257.    , 283.    , 289.    , 273.    ,
       294.    , 260.    , 263.    , 266.    , 254.    , 286.    ,
       244.    , 281.    , 256.    , 261.    , 250.    , 295.    ,
       245.    , 246.    , 262.    , 249.    , 240.    , 233.    ,
       291.    , 238.    , 235.    , 243.    , 242.    , 232.    ,
       239.    , 241.    , 237.    , 227.    , 267.    , 236.    ,
       222.    , 297.    , 292.    , 302.    , 314.    , 298.    ,
       307.    , 303.    , 315.    , 312.    , 296.    , 300.    ,
       325.    , 331.    , 310.    , 318.    , 304.    , 230.    ,
       234.    , 231.    , 301.    , 319.    , 324.    , 228. 

In [19]:
# Summary of every column (numeric and non-numeric), transposed to one row per column.
# `datetime_is_numeric=False` is intentionally not passed: False was already the
# default in pandas 1.x, and the keyword was removed in pandas 2.0, where passing
# it raises a TypeError.
df_FD.describe(include='all').T

  df_FD.describe(include='all', datetime_is_numeric=False).T


Unnamed: 0,count,unique,top,freq,first,last,mean,std,min,25%,50%,75%,max
DATETIME,36912.0,783.0,1996-06-05 00:00:00,158.0,1983-05-09,2021-06-30,,,,,,,
YEAR,36912.0,,,,NaT,NaT,2001.6,11.43,1983.0,1990.0,2000.0,2012.0,2021.0
MM,36912.0,,,,NaT,NaT,5.403,0.502,4.0,5.0,5.0,6.0,7.0
DD,36912.0,,,,NaT,NaT,16.133,8.764,1.0,9.0,16.0,24.0,31.0
WEEK,23664.0,,,,NaT,NaT,5.141,2.027,1.0,4.0,5.0,7.0,10.0
SITE,36794.0,,,,NaT,NaT,27.553,18.076,1.0,12.0,26.0,41.0,95.0
PERIOD,31710.0,4.0,AM,16927.0,NaT,NaT,,,,,,,
CONDITION,36717.0,4.0,Frozen,25118.0,NaT,NaT,,,,,,,
FISH_NO,36912.0,,,,NaT,NaT,17.756,11.62,1.0,9.0,17.0,25.0,100.0
FL_WET,10853.0,,,,NaT,NaT,251.729,20.377,1.0,238.0,251.0,265.0,358.0


In [9]:
# Summary statistics with describe()'s default column selection, transposed so
# that each column of df_FD becomes one row.
df_FD.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
YEAR,36912.0,2001.6,11.43,1983.0,1990.0,2000.0,2012.0,2021.0
MM,36912.0,5.403,0.502,4.0,5.0,5.0,6.0,7.0
DD,36912.0,16.133,8.764,1.0,9.0,16.0,24.0,31.0
WEEK,23664.0,5.141,2.027,1.0,4.0,5.0,7.0,10.0
FISH_NO,36912.0,17.756,11.62,1.0,9.0,17.0,25.0,100.0
FL_WET,10853.0,251.729,20.377,1.0,238.0,251.0,265.0,358.0
FL_FROZEN,25595.0,247.218,19.866,1.0,234.0,247.0,260.0,366.0
FL_STD,35245.0,253.169,27.956,20.0,241.0,254.0,268.0,376.0
WEIGHT,36610.0,216.618,61.39,15.1,173.0,210.0,254.0,1934.0
MATURITY,36286.0,3.772,0.727,0.0,4.0,4.0,4.0,44.0


# check all of the fields

### Clean, Backup, Remap, and Take 1st Entry for Site

In [10]:
# Tidy up the SITE column before looking at it in detail.

# Keep the exact original SITE entries, as text, in a notes column.
df_FD['SITE_notes'] = df_FD['SITE'].astype('str')

# Strip spaces, split comma-separated entries, and spread the parts over
# SITE1 / SITE2 / SITE3 (missing parts become pd.NA).
df_FD[['SITE1', 'SITE2', 'SITE3']] = (
    df_FD['SITE_notes']
    .str.replace(' ', '')
    .str.split(',', expand=True)
    .fillna(pd.NA)
)

# First-site entries that are not purely numeric: these need to be remapped.
[site for site in df_FD['SITE1'].unique() if not site.isnumeric()]

['nan',
 '1A',
 '47or62',
 '11K3(435',
 'E.MACFARLANE',
 'EricMacFarlane',
 'JMcFarlane',
 'JACoady',
 'EricMac']

In [11]:
# Map the non-numeric site labels to numeric codes, then overwrite SITE (the
# original text is already backed up in the notes column) as a nullable integer.

# The exact codes matter, even though they will be undone on import: they are
# consistent with the older analysis and with the import script into dm_apps.
remap_sites = {
    '1A': 90,
    '1B': 91,
    '47or62': 47,
    'EricMcFarlane': 92,
    'E.MACFARLANE': 92,
    'EricMacFarlane': 92,
    'EricMac': 92,
    'JimmyMacFarlane': 93,
    'JMcFarlane': 93,
    'JohnAlbertCoady': 94,
    'JACoady': 94,
    '11K3(435': 95
}

# Only the first listed site (SITE1) is kept; anything still non-numeric after
# the remap is coerced to null.
df_FD['SITE'] = pd.to_numeric(
    df_FD['SITE1'].replace(remap_sites),
    errors='coerce',
).astype('Int32')

### helper functions for EDA

In [20]:
def types_of_data(series):
    """Count the kinds of values found in an iterable of scalars.

    Parameters
    ----------
    series : iterable
        Values to classify (e.g. a pandas Series or the array from ``.unique()``).

    Returns
    -------
    tuple of int
        ``(numbers, strings, blanks, n_null)`` where

        * ``numbers`` - non-null values that ``float()`` can parse (this includes
          numeric strings such as ``'3'``),
        * ``strings`` - non-null values that ``float()`` cannot parse,
        * ``blanks``  - the subset of ``strings`` equal to ``''`` or ``' '``,
        * ``n_null``  - missing values (``None``, ``NaN``, ``pd.NA``, ``NaT``).

        ``numbers + strings + n_null`` equals the number of items.
    """
    numbers, strings, blanks, n_null = 0, 0, 0, 0

    for item in series:
        # Test for missing values first: float(None) and float(pd.NA) raise
        # TypeError, which would otherwise abort the count on nullable columns.
        if pd.api.types.is_scalar(item) and pd.isnull(item):
            n_null += 1
            continue

        try:
            float(item)
        except (ValueError, TypeError):
            # ValueError: unparsable text; TypeError: objects with no float conversion
            strings += 1
            if str(item) == ' ' or str(item) == '':
                blanks += 1
        else:
            numbers += 1

    # note: blanks are strings
    return numbers, strings, blanks, n_null


def print_data_info(dataframe, column):
    """Print counts of numeric, string, blank and null values for one column.

    Counts are reported for the unique values of ``dataframe[column]`` and for
    the full column, followed by a check that the counts add up to the number
    of rows.
    """
    print('\n--------------------------------------------------\nData Info:', column.upper(), '\n')

    values = dataframe[column]
    numbers, strings, blanks, n_null = types_of_data(values)

    print('UNIQUE numbers, strings, blanks, null =', types_of_data(values.unique()))

    count_lines = (
        ('number of numeric =', numbers),
        ('number of strings =', strings),
        ('number of blanks =', blanks),  # blanks are strings
        ('number of null = ', n_null),
    )
    for label, count in count_lines:
        print(label, count)

    # blanks are already included in strings, so they are left out of the total
    print('check: ', numbers + strings + n_null, '==', values.shape[0])

    
def print_hist(dataframe, column, max_bins=100):
    """Show a histogram of ``dataframe[column]``.

    One bin is used per unique value, capped at ``max_bins`` bins.
    """
    print('\n--------------------------------------------------\nHistogram:', column.upper())

    plt.figure(figsize=(14, 4))

    values = dataframe[column]
    # one bin per distinct value, but never more than max_bins
    values.hist(bins=min(values.nunique(), max_bins))

    plt.show()
    
    
def print_timeseries(dataframe, column, time_column, colours=(0,1)):
    """Plot ``dataframe[column]`` against the year of ``dataframe[time_column]``.

    The column is first plotted as floats; if that raises a ValueError it is
    plotted as-is and reported as categorical. ``colours`` holds the indices
    into the seaborn colour palette used for the numerical and the categorical
    case respectively.
    """
    print('\n------------------------------------------------\nTime Series:', column.upper(), sep='')

    plt.figure(figsize=(14, 4))

    years = dataframe[time_column].dt.year
    palette = sns.color_palette()
    shared_kwargs = dict(x=years, errorbar=('pi', 50))  # ('pi', 50) is the IQR

    # the categorical check could be an input argument, but this works (albeit not very cleanly)
    try:
        # cast int to float, otherwise the IQR (pi 50) errorbar raises ValueErrors
        numeric_values = dataframe[column].astype(float)
        axes = sns.lineplot(y=numeric_values, color=palette[colours[0]], **shared_kwargs)
        # NOTE: some are actually categorical but stored as an int (eg, site), those averages don't have meaning
        print('NUMERICAL')
    except ValueError:  # categorical variables, can't cast to float
        axes = sns.lineplot(y=dataframe[column], color=palette[colours[1]], **shared_kwargs)
        # NOTE: categorical variables treated as ordinal to make plotting possible, averages don't have meaning
        print('CATEGORICAL')

    # show entire dataset time period for all plots
    axes.set_xlim(1982, 2022)

    plt.show()

In [24]:
# All column names, then the ones left after excluding the date parts and the
# site helper columns.
columns_FD = df_FD.columns.tolist()
[
    name for name in columns_FD
    if name not in ('DATETIME', 'YEAR', 'MM', 'DD', 'WEEK', 'SITE_notes', 'SITE1', 'SITE2', 'SITE3')
]

['SITE',
 'PERIOD',
 'CONDITION',
 'FISH_NO',
 'FL_WET',
 'FL_FROZEN',
 'FL_STD',
 'WEIGHT',
 'SPECIES',
 'SEX',
 'MATURITY',
 'GONAD_WEIGHT',
 'Ager_1',
 'AGE_1',
 'FSP_1',
 'Comments_1',
 'Ager_2',
 'AGE_2',
 'FSP_2',
 'Comments_2',
 'Ager_3',
 'AGE_3',
 'FSP_3',
 'Comments_3',
 'Envelop.Comments',
 'AGE_notes_1',
 'FSP_notes_1',
 'AGE_notes_2',
 'FSP_notes_2',
 'AGE_notes_3',
 'FSP_notes_3']

In [22]:
# Hard-coded list of df_FD column names.
# NOTE(review): columns_FD is also assigned from df_FD.columns in another cell;
# keep the two in sync, or keep only one of them.
columns_FD = [
    'DATETIME', 'YEAR', 'MM', 'DD', 'WEEK', 'SITE', 'PERIOD', 'CONDITION',
    'FISH_NO', 'FL_WET', 'FL_FROZEN', 'FL_STD', 'WEIGHT', 'SPECIES', 'SEX',
    'MATURITY', 'GONAD_WEIGHT', 'Ager_1', 'AGE_1', 'FSP_1', 'Comments_1',
    'Ager_2', 'AGE_2', 'FSP_2', 'Comments_2', 'Ager_3', 'AGE_3', 'FSP_3',
    'Comments_3', 'Envelop.Comments', 'AGE_notes_1', 'FSP_notes_1',
    'AGE_notes_2', 'FSP_notes_2', 'AGE_notes_3', 'FSP_notes_3',
    'SITE_notes', 'SITE1', 'SITE2', 'SITE3'
]


In [46]:
# Run the EDA helpers on one column: value-type counts, a histogram, and a
# by-year time series using the DATETIME column.
col = 'SITE'

print_data_info(df_FD, col)
print_hist(df_FD, col)
print_timeseries(df_FD, col, 'DATETIME')

array([12, 46, 9, 25, 33, 2, 49, 35, 17, 39, 38, 5, 8, 37, 7, 51, 11, 64,
       56, nan, 15, 48, 34, 4, 23, 67, 1, 26, 28, 6, 52, 29, '1A', 60,
       '60, 52', 41, '1, 8', '37,38,35', '60,52', '1,8', '5,8', '12,17',
       '15,17', '33,48,49', '11,12', '5,11,17', '37,60', '52,60', '38,52',
       '1,8,26', '33,41,60', '35,60', 27, '35,37', '41,49', '52,60,38',
       '60,52,35', '49,41', '35,52', '41,35', '01,02', '60,41', '35,41',
       '38,60,52', '12,26', '2,5', '5,17', '5,26', '25,26', 47, 14, 30,
       62, '47 or 62', '11K3 (435,211)', 'E.MACFARLANE',
       'Eric MacFarlane', 'J McFarlane', 'JA Coady', 'Eric Mac'],
      dtype=object)