In [30]:
import os.path
import shutil

import numpy as np
import pandas as pd

# Load the survey CSV, decoding it as Windows-1252, then list its column names.
df = pd.read_csv(
    filepath_or_buffer="data resources/sGSL-scallop-RV-FGP.csv",
    encoding="windows-1252",
)
df.columns

Index(['year__annee', 'month__mois', 'day__jour',
       'start_hour__heure_de_depart', 'start_minute__minute_de_depart',
       'latitude', 'longitude', 'gear__equipement', 'species__espece',
       'french_name__nom_francais', 'english_name__nom_anglais',
       'latin_name__nom_latin', 'weight_caught__poids_pris',
       'number_caught__quantite__attrape'],
      dtype='object')

In [31]:
# Build a parseable date/time string from the split date and time fields of a row.
def combine_date(row):
    """Combine the date and start-time fields of one record into a single string.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``year__annee``, ``month__mois``, ``day__jour``,
        ``start_hour__heure_de_depart`` and ``start_minute__minute_de_depart``.

    Returns
    -------
    str
        ``"YYYY-MM-DD HH:MM"`` when both the hour and the minute are present,
        otherwise just ``"YYYY-MM-DD"``.
    """
    year = str(row["year__annee"]).zfill(4)
    month = str(row["month__mois"]).zfill(2)
    day = str(row["day__jour"]).zfill(2)
    date_part = f"{year}-{month}-{day}"

    hour = row["start_hour__heure_de_depart"]
    minute = row["start_minute__minute_de_depart"]

    # pd.isna treats NaN, None and pd.NA alike as missing and returns False for
    # strings, so no separate isinstance check is required.
    if pd.isna(hour) or pd.isna(minute):
        return date_part

    # Zero-pad the time so every string shares one consistent HH:MM layout.
    return f"{date_part} {int(hour):02d}:{int(minute):02d}"

In [32]:
# Parse the per-row date/time strings into a proper datetime column.
df["datetime"] = pd.to_datetime(df.apply(combine_date, axis=1))

# Keep only the year, position and timestamp, ordered by time, with
# exact duplicate rows collapsed into one.
df_trunc = (
    df.sort_values("datetime")[["year__annee", "latitude", "longitude", "datetime"]]
    .drop_duplicates()
)
df_trunc

Unnamed: 0,year__annee,latitude,longitude,datetime
0,2012,46.249083,-64.331250,2012-06-27 13:57:00
21,2012,46.335917,-64.325500,2012-06-28 07:59:00
36,2012,46.324417,-64.128167,2012-06-28 09:28:00
47,2012,46.308333,-64.108667,2012-06-28 10:14:00
55,2012,46.280917,-64.099417,2012-06-28 11:15:00
...,...,...,...,...
7021,2020,46.563167,-64.511583,2020-10-23 10:23:00
7024,2020,46.548583,-64.469500,2020-10-23 10:49:00
7030,2020,46.534667,-64.415917,2020-10-23 11:21:00
7039,2020,46.519833,-64.406917,2020-10-23 11:41:00


In [33]:
# Some positions look wrong: flag every row with a latitude above 50 or a
# longitude above -60 and display them for inspection.
mask = df_trunc["latitude"].gt(50) | df_trunc["longitude"].gt(-60)
df_trunc.loc[mask]


Unnamed: 0,year__annee,latitude,longitude,datetime
6288,2020,46.542167,-34.375167,2020-10-15 10:03:00
6472,2020,46.204,-35.154917,2020-10-17 08:12:00
6966,2020,55.61275,-64.560833,2020-10-22 15:47:00


In [34]:
# Discard the flagged rows, keeping only those the mask did not select.
df_trunc = df_trunc.loc[~mask]
df_trunc

Unnamed: 0,year__annee,latitude,longitude,datetime
0,2012,46.249083,-64.331250,2012-06-27 13:57:00
21,2012,46.335917,-64.325500,2012-06-28 07:59:00
36,2012,46.324417,-64.128167,2012-06-28 09:28:00
47,2012,46.308333,-64.108667,2012-06-28 10:14:00
55,2012,46.280917,-64.099417,2012-06-28 11:15:00
...,...,...,...,...
7021,2020,46.563167,-64.511583,2020-10-23 10:23:00
7024,2020,46.548583,-64.469500,2020-10-23 10:49:00
7030,2020,46.534667,-64.415917,2020-10-23 11:21:00
7039,2020,46.519833,-64.406917,2020-10-23 11:41:00


In [35]:
# Hand the filtered positions to the project's xy2line helper.
# NOTE(review): xy2line is defined outside this notebook; presumably it turns
# the point coordinates into line geometry — confirm against its source.
from stock_assessment_surveys.utils.xy2line import xy2line
xy2line(df_trunc)