# import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

sns.set_theme()

# jupyter notebook full-width display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

# import and clean data

In [2]:
dtypes = {
    'YEAR': 'Int32', 
    'MM': 'Int32', 
    'DD': 'Int32', 
    'WEEK': 'Int32', 
    'SITE': 'object', 
    'PERIOD': 'object', 
    'CONDITION': 'object', 
    'FISH_NO': 'Int32', 
    'FL_WET.x': 'object',  # some 'NA', convert to numeric, coerce error to make 'NA' actually null
    'FL_FROZEN.x': 'object',  # some 'NA', convert to numeric, coerce error to make 'NA' actually null
    'FL_STD.x': 'float64',
    'WEIGHT': 'float64',
    'SPECIES': 'object', 
    'SEX': 'object', 
    'MATURITY.x': 'object',  # two entries that at 3?, (also 1 that is 44), convert to numeric, coerce errors
    'GONAD_WEIGHT': 'object',  # some '.' / 'NA', convert to numeric, coerce error to make 'NA' actually null
    'Ager.1': 'object',
    'AGE.1': 'object',  # convert to numeric and separate text comments
    'FSP.1': 'object',  # convert to numeric and separate text comments
    'Comments.1': 'object', 
    'Ager.2': 'object',  
    'AGE.2': 'object',  # convert to numeric and separate text comments
    'FSP.2': 'object',  # convert to numeric and separate text comments
    'Comments.2': 'object', 
    'Ager.3': 'object', 
    'AGE.3': 'object',  # convert to numeric and separate text comments
    'FSP.3': 'object',  # convert to numeric and separate text comments
    'Comments.3': 'object', 
    'Envelop Comments': 'object'
} 

df_FD = pd.read_excel('Gaspereau Ages_Combined.xlsx', dtype=dtypes)

In [3]:
# create backup of notes (including all numeric data for simplicity)
df_FD['AGE_notes_1'] = df_FD['AGE.1']
df_FD['FSP_notes_1'] = df_FD['FSP.1']
df_FD['AGE_notes_2'] = df_FD['AGE.2']
df_FD['FSP_notes_2'] = df_FD['FSP.2']
df_FD['AGE_notes_3'] = df_FD['AGE.3']
df_FD['FSP_notes_3'] = df_FD['FSP.3']

# convert to numeric and coerce errors
# for FL_WET.x, one measurement is a float (258.8), all else are int, need to round to convert to int
df_FD['FL_WET.x'] = np.round(pd.to_numeric(df_FD['FL_WET.x'], errors='coerce')).astype('Int32')
df_FD['FL_FROZEN.x'] = pd.to_numeric(df_FD['FL_FROZEN.x'], errors='coerce').astype('Int32')
df_FD['MATURITY.x'] = pd.to_numeric(df_FD['MATURITY.x'], errors='coerce').astype('Int32')
df_FD['GONAD_WEIGHT'] = pd.to_numeric(df_FD['GONAD_WEIGHT'], errors='coerce')  # float
df_FD['AGE.1'] = pd.to_numeric(df_FD['AGE.1'], errors='coerce').astype('Int32')
df_FD['FSP.1'] = pd.to_numeric(df_FD['FSP.1'], errors='coerce').astype('Int32')
df_FD['AGE.2'] = pd.to_numeric(df_FD['AGE.2'], errors='coerce').astype('Int32')
df_FD['FSP.2'] = pd.to_numeric(df_FD['FSP.2'], errors='coerce').astype('Int32')
df_FD['AGE.3'] = pd.to_numeric(df_FD['AGE.3'], errors='coerce').astype('Int32')
df_FD['FSP.3'] = pd.to_numeric(df_FD['FSP.3'], errors='coerce').astype('Int32')

# rename keys with '.' for ease of use within pandas
rename_FD = {
    'FL_WET.x': 'FL_WET', 
    'FL_FROZEN.x': 'FL_FROZEN',
    'FL_STD.x': 'FL_STD', 
    'MATURITY.x': 'MATURITY',
    'Ager.1': 'Ager_1',
    'AGE.1': 'AGE_1',
    'FSP.1': 'FSP_1',
    'Comments.1': 'Comments_1',
    'Ager.2': 'Ager_2',
    'AGE.2': 'AGE_2',
    'FSP.2': 'FSP_2',
    'Comments.2': 'Comments_2',
    'Ager.3': 'Ager_3',
    'AGE.3': 'AGE_3',
    'FSP.3': 'FSP_3',
    'Comments.3': 'Comments_3'
}
df_FD = df_FD.drop('Unnamed: 0', axis=1).rename(columns=rename_FD)

In [16]:
# create a DATETIME column
df_FD['DATETIME'] = pd.to_datetime(df_FD['YEAR'].astype(str) + '-' + df_FD['MM'].astype(str) + '-' + df_FD['DD'].astype(str))

# move the DATETIME column to the front of the dataframe
df_FD = df_FD[['DATETIME'] + [col for col in df_FD.columns if col != 'DATETIME']]

In [17]:
# check a random sample of the data
df_FD.sample(10)

Unnamed: 0,DATETIME,YEAR,MM,DD,WEEK,SITE,PERIOD,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3
22553,2007-05-25,2007,5,25,4.0,25,AM,Frozen,24,,253.0,261.0,237.2,A,F,4,29.1,,3.0,3.0,,,,,,,,,,,3.0,3.0,,,,
409,1983-05-21,1983,5,21,,12,,Fresh,87,240.0,,240.0,198.0,A,M,2,,,3.0,3.0,,,,,,,,,,,3.0,3.0,,,,
21227,2003-06-26,2003,6,26,9.0,5,AM,Fresh,26,252.0,,252.0,205.6,B,F,4,28.8,,,0.0,,,,,,,,,,,,0.0,,,,
13255,1995-05-27,1995,5,27,4.0,33,PM,Frozen,22,,281.0,289.0,338.0,A,F,4,62.6,,,,,,,,,,,,,,,,,,,
4292,1988-05-15,1988,5,15,,12,,Fresh,16,265.0,,265.0,293.0,A,F,4,46.8,,5.0,4.0,,,,,,,,,,,5.0,4.0,,,,
29943,2014-06-27,2014,6,27,9.0,12,PM,Frozen,21,,248.0,256.0,220.1,B,F,4,34.6,JM,5.0,4.0,,LF,5.0,4.0,,,,,,,5.0,4.0,5.0,4.0,,
7671,1990-05-14,1990,5,14,,23,AM,Fresh,23,283.0,,283.0,347.0,A,F,4,65.4,,5.0,4.0,,,,,,,,,,,5.0,4.0,,,,
7313,1989-06-08,1989,6,8,,64,PM,Frozen,1,,230.0,238.0,138.0,B,M,4,,,4.0,4.0,,,,,,,,,,,4.0,4.0,,,,
5028,1989-05-10,1989,5,10,,9,AM,Fresh,18,255.0,,255.0,242.0,A,M,4,,,4.0,3.0,,,,,,,,,,,4.0,3.0,,,,
3367,1986-06-05,1986,6,5,,12,,Frozen,15,,228.0,236.0,171.0,A,M,2,,,3.0,3.0,,,,,,,,,,,3.0,3.0,,,,


In [5]:
df_FD.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
YEAR,36912.0,,,,2001.6,11.43,1983.0,1990.0,2000.0,2012.0,2021.0
MM,36912.0,,,,5.403,0.502,4.0,5.0,5.0,6.0,7.0
DD,36912.0,,,,16.133,8.764,1.0,9.0,16.0,24.0,31.0
WEEK,23664.0,,,,5.141,2.027,1.0,4.0,5.0,7.0,10.0
SITE,36794.0,79.0,12.000,6932.0,,,,,,,
PERIOD,31710.0,4.0,AM,16927.0,,,,,,,
CONDITION,36717.0,4.0,Frozen,25118.0,,,,,,,
FISH_NO,36912.0,,,,17.756,11.62,1.0,9.0,17.0,25.0,100.0
FL_WET,10853.0,,,,251.729,20.377,1.0,238.0,251.0,265.0,358.0
FL_FROZEN,25595.0,,,,247.218,19.866,1.0,234.0,247.0,260.0,366.0


# Descriptive Statistics

In [6]:
df_FD.dtypes

YEAR                  Int32
MM                    Int32
DD                    Int32
WEEK                  Int32
SITE                 object
PERIOD               object
CONDITION            object
FISH_NO               Int32
FL_WET                Int32
FL_FROZEN             Int32
FL_STD              float64
WEIGHT              float64
SPECIES              object
SEX                  object
MATURITY              Int32
GONAD_WEIGHT        float64
Ager_1               object
AGE_1                 Int32
FSP_1                 Int32
Comments_1           object
Ager_2               object
AGE_2                 Int32
FSP_2                 Int32
Comments_2           object
Ager_3               object
AGE_3                 Int32
FSP_3                 Int32
Comments_3           object
Envelop.Comments     object
AGE_notes_1          object
FSP_notes_1          object
AGE_notes_2          object
FSP_notes_2          object
AGE_notes_3          object
FSP_notes_3          object
dtype: object

In [7]:
df_FD.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
YEAR,36912.0,,,,2001.6,11.43,1983.0,1990.0,2000.0,2012.0,2021.0
MM,36912.0,,,,5.403,0.502,4.0,5.0,5.0,6.0,7.0
DD,36912.0,,,,16.133,8.764,1.0,9.0,16.0,24.0,31.0
WEEK,23664.0,,,,5.141,2.027,1.0,4.0,5.0,7.0,10.0
SITE,36794.0,79.0,12.000,6932.0,,,,,,,
PERIOD,31710.0,4.0,AM,16927.0,,,,,,,
CONDITION,36717.0,4.0,Frozen,25118.0,,,,,,,
FISH_NO,36912.0,,,,17.756,11.62,1.0,9.0,17.0,25.0,100.0
FL_WET,10853.0,,,,251.729,20.377,1.0,238.0,251.0,265.0,358.0
FL_FROZEN,25595.0,,,,247.218,19.866,1.0,234.0,247.0,260.0,366.0


In [8]:
df_FD.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
YEAR,36912.0,2001.6,11.43,1983.0,1990.0,2000.0,2012.0,2021.0
MM,36912.0,5.403,0.502,4.0,5.0,5.0,6.0,7.0
DD,36912.0,16.133,8.764,1.0,9.0,16.0,24.0,31.0
WEEK,23664.0,5.141,2.027,1.0,4.0,5.0,7.0,10.0
FISH_NO,36912.0,17.756,11.62,1.0,9.0,17.0,25.0,100.0
FL_WET,10853.0,251.729,20.377,1.0,238.0,251.0,265.0,358.0
FL_FROZEN,25595.0,247.218,19.866,1.0,234.0,247.0,260.0,366.0
FL_STD,35245.0,253.169,27.956,20.0,241.0,254.0,268.0,376.0
WEIGHT,36610.0,216.618,61.39,15.1,173.0,210.0,254.0,1934.0
MATURITY,36286.0,3.772,0.727,0.0,4.0,4.0,4.0,44.0


# check all of the fields

In [12]:
def types_of_data(series):
    
    numbers, strings, blanks, n_null = 0, 0, 0, 0
    
    for item in series:
        try:
            float(item)
            if pd.isnull(item):
                n_null += 1
            else:
                numbers += 1
        except ValueError:
            strings += 1
            if str(item) == ' ' or str(item) == '':
                blanks += 1
                
    # note: blanks are strings
    return numbers, strings, blanks, n_null


def print_data_info(dataframe, column):
    
    numbers, strings, blanks, n_null = types_of_data(dataframe[column])
    
    print('UNIQUE numbers, strings, blanks, null =', types_of_data(series.unique()))
    print('number of numeric =', numbers)
    print('number of strings =', strings)
    print('number of blanks =', blanks)  # blanks are strings
    print('number of null = ', n_null)
    print('check: ', numbers + strings + n_null, '==', series.shape[0])

    
def print_hist(dataframe, column, max_bins=100):
    
    print('--------------------------------------------------\nHistogram:', column.upper())
    
    plt.figure(figsize=(14, 4))
    
    n_unique = dataframe[column].nunique()
    if n_unique <= max_bins:
        bins = n_unique
    else:
        bins = max_bins
    
    dataframe[column].hist(bins=bins)
    
    plt.show()
    
    
def print_timeseries(dataframe, column, time_column, colours=(0,1)):
    
    print('\n------------------------------------------------\n', feature.upper(), sep='')
    
    plt.figure(figsize=(14, 4))
    
    # could make the categorical check an input variable, but this just works (albiet super clean)
    try:
        plot = sns.lineplot(
            x = dataframe[time_column].dt.year, 
            y = dataframe[column].astype(float),  # cast int to float or get ValueErrors with IQR (pi 50)
            errorbar = ('pi', 50),  # IQR
            color = sns.color_palette()[colours[0]]
        )
        print('NUMERICAL')  # NOTE: some are actually categorical but stored as an int(eg, site), those averages don't have meaning
    except ValueError:  # categorical variables, can't cast to float
        plot = sns.lineplot(
            x = dataframe[time_column].dt.year, 
            y = dataframe[column],
            errorbar = ('pi', 50), 
            color = sns.color_palette()[colours[1]]
        )
        print('CATEGORICAL')  # NOTE: categorical variables treated as ordinal to make plotting possible, averages don't have meaning
       
    
    # show entire dataset time period for all plots
    plot.set_xlim(1982, 2022)
    
    plt.show()

In [11]:
columns_FD = [
    'YEAR', 'MM', 'DD', 'WEEK', 'SITE', 'PERIOD', 'CONDITION', 'FISH_NO',
    'FL_WET', 'FL_FROZEN', 'FL_STD', 'WEIGHT', 'SPECIES', 'SEX', 'MATURITY',
    'GONAD_WEIGHT', 'Ager_1', 'AGE_1', 'FSP_1', 'Comments_1', 'Ager_2',
    'AGE_2', 'FSP_2', 'Comments_2', 'Ager_3', 'AGE_3', 'FSP_3',
    'Comments_3', 'Envelop.Comments', 'AGE_notes_1', 'FSP_notes_1',
    'AGE_notes_2', 'FSP_notes_2', 'AGE_notes_3', 'FSP_notes_3'
]

column = 
time_column = 

print_data_info(dataframe, column)
print_hist(dataframe, column)
print_timeseries(dataframe, column, time_column)

Index(['YEAR', 'MM', 'DD', 'WEEK', 'SITE', 'PERIOD', 'CONDITION', 'FISH_NO',
       'FL_WET', 'FL_FROZEN', 'FL_STD', 'WEIGHT', 'SPECIES', 'SEX', 'MATURITY',
       'GONAD_WEIGHT', 'Ager_1', 'AGE_1', 'FSP_1', 'Comments_1', 'Ager_2',
       'AGE_2', 'FSP_2', 'Comments_2', 'Ager_3', 'AGE_3', 'FSP_3',
       'Comments_3', 'Envelop.Comments', 'AGE_notes_1', 'FSP_notes_1',
       'AGE_notes_2', 'FSP_notes_2', 'AGE_notes_3', 'FSP_notes_3'],
      dtype='object')