# Process the raw data from pypam and bpnsdata
This notebook contains the code to process the output obtained after processing the data with pypam and bpnsdata.
For more information about this process, please contact clea.parcerisas@vliz.be or check the documentation of both packages:
https://lifewatch-pypam.readthedocs.io/en/latest/
https://github.com/lifewatch/bpnsdata

In [4]:
import pathlib

import geopandas
import numpy as np
import pandas as pd
import xarray
from tqdm import tqdm

In [5]:
# Environmental variables that are kept for every sample.
# Don't forget to add the ones you added before
ENV_LABELS = [
    'route_density', 'season', 'moon_phase', 'day_moment',
    'sediment_type', 'bathymetry', 'other_var', 'new_var',
]

# This has to be the SAME one you chose in 0_create_dataset (binsize)
CHUNK_LENGTH = 60.0

# Frequency limits to use: the range has to be smaller than or equal to the one
# selected in 0_create_dataset
MIN_FREQ = 60
MAX_FREQ = 24_000

In [6]:
# Variables that are CATEGORIES (not NUMERICAL), and variables listed as cyclic
CATEGORICAL_VARS = [
    'day_moment',
    'sediment_type',
    'seabed_habitat',
    'deployment_name',
]
CYCLIC_VARS = [
    'season',
    'moon_phase',
]

# Target dtype of each column; applied with astype during the data clean up
vars_dtypes = dict(
    route_density=int,
    season=int,
    moon_phase=np.float16,
    day_moment='category',
    sediment_type='category',
    bathymetry=np.float16,
    other_var=str,
    new_var=str,
    deployment_name='category',
)


In [7]:
# Folders used by this notebook (all relative to the working directory)
data_path = pathlib.Path('data', 'raw_data')
processed_data_path = pathlib.Path('data', 'processed')
raw_data_path = pathlib.Path('data', 'raw_data', 'deployments', 'deployments')

In [8]:
# Load the metadata table (one row per deployment) from the summary csv
metadata = pd.read_csv(data_path / 'data_summary_mda.csv')

# Empty containers for the outputs
df_features, df_sample, df_env, df_labels = (pd.DataFrame() for _ in range(4))
df_geo = geopandas.GeoDataFrame()

# Names of the variables that will be used
# HERE IMPORTANT TO DECIDE IF YOU PROCESS 'oct3' or 'millidecade_bands'
features_var = 'oct3'
deployment_columns = ['deployment_name', 'datetime']
freqticks = None

In [11]:
# Join all the deployments in one DataFrame
#
# For every row of the metadata table the matching '<idx>_<deployment_name>_env.nc'
# file is opened, restricted to the wanted frequency range, cleaned of samples with
# missing values, and converted to a GeoDataFrame holding the acoustic features
# (one column per frequency), the environmental variables, the deployment columns
# and the point geometry of every sample. The frames are concatenated once at the end.
wanted_vars = ENV_LABELS + [features_var] + deployment_columns
deployment_frames = []  # one GeoDataFrame per deployment
total_acoustic_time = np.timedelta64(0, 'ns')
for idx in tqdm(metadata.index, total=len(metadata)):
    deployment_row = metadata.loc[idx]
    env_name = '%s_%s_env.nc' % (idx, deployment_row.deployment_name)
    env_path = processed_data_path.joinpath(env_name)

    # The context manager closes the file opened from disk even if a step below fails
    with xarray.open_dataset(env_path) as deployment_raw:
        # Eliminate the frequencies below MIN_FREQ and above MAX_FREQ
        # NOTE(review): both bounds are strict, so a band exactly at MIN_FREQ or
        # MAX_FREQ is dropped as well - confirm this is intended
        freqs = deployment_raw.frequency
        deployment = deployment_raw.sel(frequency=freqs[(freqs > MIN_FREQ) & (freqs < MAX_FREQ)])

        # Time span covered by this deployment (computed before dropping any sample)
        deployment_duration = deployment.datetime.max() - deployment.datetime.min()
        total_acoustic_time = total_acoustic_time + deployment_duration.values

        # Drop the samples with missing values. Only the wanted variables decide which
        # samples are dropped; longitude/latitude are carried along so that the
        # geometry is built from exactly the samples that remain.
        checked_vars = list(deployment[wanted_vars].data_vars)
        deployment = deployment[wanted_vars + ['longitude', 'latitude']].dropna(
            'id', how='any', subset=checked_vars)
        clean_freqticks = deployment.frequency.values

        # Create a pandas df with all the wanted values
        df_deployment = pd.DataFrame(deployment[features_var].values).astype(np.float16)
        for env in ENV_LABELS + deployment_columns:
            df_deployment[env] = deployment[env].values

        # Get the geometry (same length and order as the rows of df_deployment)
        geo_series = geopandas.GeoSeries(data=geopandas.points_from_xy(x=deployment['longitude'],
                                                                       y=deployment['latitude']),
                                         crs='EPSG:4326')

    deployment_frames.append(geopandas.GeoDataFrame(df_deployment, geometry=geo_series))

# Concatenate once instead of growing the DataFrame inside the loop
df = pd.concat(deployment_frames, ignore_index=True) if deployment_frames else pd.DataFrame()

# print the total acoustic time
print('Total amount of time recorded %s h' % (total_acoustic_time / np.timedelta64(1, 'h')))

100%|██████████| 4/4 [00:00<00:00, 27.69it/s]

Total amount of time recorded 896.3058333333333 h





## Some data clean up

In [12]:
# Group the three twilight labels under a single 'Twilight' label
twilight_labels = {
    'Civil twilight': 'Twilight',
    'Astronomical twilight': 'Twilight',
    'Nautical twilight': 'Twilight',
}
df = df.replace(twilight_labels)

# Flip the sign of the bathymetry values
df['bathymetry'] = -df['bathymetry']

# Change the data types to save some computational power and memory
# (categorical vars become 'category' for efficient storage and processing)
df = df.astype(vars_dtypes)

## Save the outputs to work on with the next script

In [13]:
# Store the complete DataFrame and the frequency ticks that were used,
# so the next script can load them from the processed data folder
df.to_pickle(processed_data_path / 'df_complete.pkl')
np.save(processed_data_path / 'used_freqticks.npy', clean_freqticks)

In [14]:
# Display the complete DataFrame (the rendered table below is truncated with '...')
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,season,moon_phase,day_moment,sediment_type,bathymetry,other_var,new_var,deployment_name,datetime,geometry
0,83.2500,88.9375,82.9375,81.3125,81.3750,79.3750,73.6875,75.4375,74.3125,71.9375,...,21,6.175781,Twilight,reef,2.482422,1,a,NLOyster1,2022-05-29 22:00:09,POINT (4.88076 53.06843)
1,83.2500,88.8750,83.3125,81.4375,81.3750,79.4375,73.8125,75.6875,74.5000,72.4375,...,21,6.175781,Twilight,reef,2.482422,1,a,NLOyster1,2022-05-29 22:00:34,POINT (4.88076 53.06843)
2,82.9375,88.9375,83.5625,81.3125,81.1875,79.5625,73.8125,75.6250,74.8125,73.1250,...,21,6.175781,Twilight,reef,2.482422,1,a,NLOyster1,2022-05-29 22:00:59,POINT (4.88076 53.06843)
3,83.3750,88.8750,83.2500,81.1250,80.9375,79.1875,73.5000,75.1250,74.4375,72.1250,...,21,6.175781,Twilight,reef,2.482422,1,a,NLOyster1,2022-05-29 22:01:24,POINT (4.88076 53.06843)
4,83.3750,89.1250,83.1250,81.5625,81.6875,79.6875,73.5625,75.4375,74.8125,72.5625,...,21,6.175781,Twilight,reef,2.482422,1,a,NLOyster1,2022-05-29 22:01:49,POINT (4.88076 53.06843)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,81.0625,80.5625,80.2500,78.1875,77.7500,74.7500,72.9375,71.3125,70.1875,73.5625,...,23,1.766602,Day,sand,1.554688,2,a,NLSand2,2022-06-08 13:27:50,POINT (5.20231 53.06843)
936,81.3125,80.4375,80.0625,78.0000,77.6250,74.6875,72.6875,71.1250,69.8125,69.6250,...,23,1.766602,Day,sand,1.554688,2,a,NLSand2,2022-06-08 13:28:15,POINT (5.20231 53.06843)
937,79.8125,79.4375,80.0000,78.1250,78.4375,74.8750,72.6875,71.3125,71.5625,73.8125,...,23,1.766602,Day,sand,1.554688,2,a,NLSand2,2022-06-08 13:28:40,POINT (5.20231 53.06843)
938,81.5625,83.5625,82.1875,80.0625,82.6250,79.1250,76.2500,77.0000,75.0000,74.7500,...,23,1.766602,Day,sand,1.554688,2,a,NLSand2,2022-06-08 13:29:05,POINT (5.20231 53.06843)
