In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import requests
import os
import glob
#import re
#import sys
import datetime
from datetime import timedelta
from io import StringIO
#from bs4 import BeautifulSoup




In [9]:
# Local import 
# > Make sure SIO_wrap dir is on the same path as this script.
%load_ext autoreload
%autoreload 2
from SIO_wrap import dir_tree, fnames

from setdir import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
def load_one_drifter(PID, base_url, username, password, 
                     download_start_date): 
    
    from SIO_wrap import fnames
    
    tstr = f'start_date={download_start_date}'
    pidstr = '&platform_id='

    full_url = base_url+tstr+pidstr+PID

    # Request file
    resp = requests.get(full_url, auth=(username, password))
    #    print(resp.status_code)
    # Print the response code (200 is good. If you get something else, may be
    # a password problem)

    # To print content: resp.content
    aa = resp.content.decode("utf-8")
    data_df = pd.read_csv(StringIO(aa))
    
    # Clean up column names
    tmp = data_df.columns.str.strip()
    tmp = tmp.str.replace(" ", "", regex=True)
    tmp = tmp.str.replace("-", "_", regex=True)
    tmp = tmp.str.replace("(", "_", regex=True)
    tmp = tmp.str.replace(')', '', regex=True)
    data_df.columns = tmp
    
    # Remove the </br> column (IF IT EXISTS)
    if '</br>' in data_df.columns:
        data_df = data_df.drop(columns='</br>')
        # data_df.dtypes
    
    # Convert the time column to a timestamp
    time_colname = 'Timestamp_UTC'
    data_df[time_colname] = pd.to_datetime(data_df[time_colname],
                                       format=timcol_strftime) 
    
    # Prep to convert xarray
    data_df2 = data_df
    data_df2["time"] = data_df2["Timestamp_UTC"].values
    data_df2 = data_df2.set_index("time")
    data_df2 = data_df2.drop(columns='Timestamp_UTC')
    # Convert to xarray
    ds = data_df2.to_xarray()

    return ds


In [11]:
###################-----------   USER EDITS    ------------###################

# Path where data are saved. Can be changed in file SIO_wrap/dir_tree.py
data_dir = dir_tree.dir_out

# Download start date must have format yyyy-mm-dd. Default is set to the
# beginning of the TERIFIC project, i.e 2019-12-04.
download_start_date = "2019-12-04"
print("\nDefault download start date: %s\n" % download_start_date)


# SIO username and password
username = "uk-noc"
password = "noc-drifter"

# URL for data
base_url = "https://gdp.ucsd.edu/cgi-bin/projects/uk-noc/drifter.py?" 

# Options

#full_url = base_url+tstr+pidstr
#download_url = ("https://gdp.ucsd.edu/cgi-bin/projects/uk-noc/"
#                "drifter.py?start_date=") 


# String formatting for time for:
#   - the download url, 
#   - appending to the filename
# 	- the data time column, respectively.
url_strftime = '%Y-%m-%d'
tstamp_strftime = '%Y%m%d'
timcol_strftime = '%Y-%m-%d %H:%M:%S'




Default download start date: 2019-12-04



In [12]:
# Get the list of Platform IDs
PID = pd.read_csv(cat_proc_path('PID_list.txt'), header='infer', index_col=0)


In [13]:
# Check which drifter files need to be updated
# Save the list in PID_to_update
PID_to_update = []
counter = 0
for i in range(len(PID)):
    
    # Get a single platform ID from the full list
    pid1 = (PID["PID"].values)[i].astype('str')
    PID1 = (PID["PID"].values)[i]

    # Extract a list with the names of existing raw data files.
    fname = 'pid'+str(PID1)+'_*'
    existing_files = glob.glob(cat_raw_path(fname))
    # Sort them alphabetically so the last element of the list is the 
    # latest date
    existing_files = sorted(existing_files)
    
    if len(existing_files) > 0:
        
        # Extract the end date from the filename
        end_date = (existing_files[-1])[-11:-3] # What order to these come in?
        t1 = datetime.datetime.strptime(end_date, '%Y%m%d')  

        # Download only the most recent drifter data 
        # (since the previous end date)
        download_update = t1.strftime('%Y-%m-%d')
        ds_update = load_one_drifter(pid1, base_url, username, password,
                                     download_update)

        # If there were at least 24 data points since the 
        # previous end date, then append the PID to the update list
        if len(ds_update["time"]) > 24:
            counter += 1
            maxtime = ds_update.time.max().values
            new_end_date = pd.to_datetime(maxtime).strftime('%Y%m%d')

            PID_to_update.append(PID1)
            print(str(counter)+'. pid('+pid1+') - Ended:'+end_date
                  +', New end:'+new_end_date)
        
    else:
        counter += 1
        print(str(counter)+'. pid('+pid1
              +') - No previous raw data files.\n')
        PID_to_update.append(PID1)

if counter==0:
    print('All drifter files are up-to-date')
    
print('No more files to update')

1. pid(300234066513050) - Ended:20220101, New end:20220101
2. pid(300234068349690) - Ended:20220111, New end:20220117
3. pid(300234068346620) - Ended:20220111, New end:20220117
4. pid(300234068343550) - Ended:20220111, New end:20220117
5. pid(300234068342020) - Ended:20220111, New end:20220117
6. pid(300234068349700) - Ended:20220111, New end:20220117
7. pid(300234068345610) - Ended:20220111, New end:20220117
8. pid(300234068243230) - Ended:20220111, New end:20220117
9. pid(300234068345630) - Ended:20220111, New end:20220117
10. pid(300234068348190) - Ended:20220111, New end:20220117
11. pid(300234066416930) - Ended:20220111, New end:20220117
12. pid(300234068244270) - Ended:20220111, New end:20220117
13. pid(300234068348210) - Ended:20220111, New end:20220117
14. pid(300234068343610) - Ended:20220111, New end:20220117
15. pid(300234068348220) - Ended:20220111, New end:20220117
16. pid(300234068346690) - Ended:20220111, New end:20220117
17. pid(300234068347720) - Ended:20220111, New en

In [14]:
# Loop through the list of Platform IDs of drifters that need to be updated
# 
counter = 0
for i in range(len(PID_to_update)):
    counter += 1


    
    PID1 = PID_to_update[i]
    pid1 = PID1.astype('str')
    
    # Load data into an xarray
    ds = load_one_drifter(pid1, base_url, username, password,
                               download_start_date)
    
    
    # Get values for attributes
    dstr = datetime.datetime.today()
    dstr = dstr.replace(hour=0, minute=0, second=0, 
                        microsecond=0).strftime('%Y-%m-%d')

    maxtime = ds.time.max().values
    maxtimestr = pd.to_datetime(maxtime).strftime('%Y-%m-%dT%H:%M:%S')


    project_name = 'TERIFIC'
    operator_name = "EFW"
    institution_name = 'National Oceanography Centre, UK'


    # Create a dictionary of attributes
    attr_dict = {"Platform_ID": PID1,
                 "End Time": maxtimestr,
                 "Project": project_name,
                 "Originator": operator_name,
                 "Institution": institution_name,
                 "Date created": dstr,
                }


    ds = ds.assign_attrs(attr_dict)

    ds = ds.drop('Platform_ID')
    
    # Sort by time ascending
    ds = ds.sortby('time', ascending=True)



    # Save file to raw - only if it doesn't already exist
    enddate = pd.to_datetime(maxtime).strftime('%Y%m%d')
    fname = 'pid'+str(PID1)+'_'+enddate+'.nc'
    
    outfile_with_path = cat_raw_path(fname)
    if not os.path.isfile(outfile_with_path):
        print(str(counter)+'. '+outfile_with_path)
        ds.to_netcdf(cat_raw_path(fname))
    else:
        print(str(counter)+'. '+outfile_with_path+' already exists!')



1. ../01-data/01-raw/pid300234066513050_20220101.nc already exists!
2. ../01-data/01-raw/pid300234068349690_20220117.nc
3. ../01-data/01-raw/pid300234068346620_20220117.nc
4. ../01-data/01-raw/pid300234068343550_20220117.nc
5. ../01-data/01-raw/pid300234068342020_20220117.nc
6. ../01-data/01-raw/pid300234068349700_20220117.nc
7. ../01-data/01-raw/pid300234068345610_20220117.nc
8. ../01-data/01-raw/pid300234068243230_20220117.nc
9. ../01-data/01-raw/pid300234068345630_20220117.nc
10. ../01-data/01-raw/pid300234068348190_20220117.nc
11. ../01-data/01-raw/pid300234066416930_20220117.nc
12. ../01-data/01-raw/pid300234068244270_20220117.nc
13. ../01-data/01-raw/pid300234068348210_20220117.nc
14. ../01-data/01-raw/pid300234068343610_20220117.nc
15. ../01-data/01-raw/pid300234068348220_20220117.nc
16. ../01-data/01-raw/pid300234068346690_20220117.nc
17. ../01-data/01-raw/pid300234068347720_20220117.nc
18. ../01-data/01-raw/pid300234068343630_20220117.nc
19. ../01-data/01-raw/pid30023406834619

In [None]:
# One further refinement - STILL TO BE IMPLEMENTED
#
# Instead of re-loading the full drifter dataset, use the existing previous 
# file and append the new data.

if 0:
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
    # crop data so that the last day is fully sampled and there are no
    # overlaps when the data are updated; basically discard the last day if
    # it's incomplete
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    end_datetime = pd.to_datetime(ds.time.values[-1])
    end_datestr = end_datetime.strftime(url_strftime)

    penultimate_datetime = end_datetime - timedelta(days=1)
    penultimate_datestr = penultimate_datetime.strftime(url_strftime)

    if download_start_date == end_datestr:
        sys.exit("No updated data. Last full day available is %s" 
            % penultimate_datestr)


    cutoff_date = pd.to_datetime(end_datestr +" 00:00:00",
                                 format=timcol_strftime)

    ds_crop = ds.where(ds.time<cutoff_date, drop=True)

    # timestamp for filename
    fname_timestamp = penultimate_datetime.strftime(tstamp_strftime)

    # stitch together the files
    if len(existing_files) > 0:
        print("Stitch updated dataset with the previous one. \n")
        # use previously opened dataset (prev_ds)
        # put both datasets in a list
        d = []
        d.append(prev_ds)
        d.append(ds_crop)

        # merge list into a dataset
        new_ds = xr.concat(d, dim='n')

    else:
        new_ds = ds_crop


    # Filename and path of (updated) dataset
    update_fname = f"{fnames.fname_rawdata}{fname_timestamp}.nc"
    update_fpath = os.path.join(data_dir, update_fname)

    # Save dataset to netcdf file
    new_ds.to_netcdf(update_fpath)



In [6]:
# Load all the TERIFIC data into raw files
if 0:
    for i in range(len(PID)):
        pid1 = (PID["PID"].values)[i].astype('str')
        PID1 = (PID["PID"].values)[i]

        # Load data into an xarray
        ds = load_one_drifter(pid1, base_url, username, password,
                                   download_start_date)


        # Get values for attributes
        dstr = datetime.datetime.today()
        dstr = dstr.replace(hour=0, minute=0, second=0, 
                            microsecond=0).strftime('%Y-%m-%d')

        maxtime = ds.time.max().values
        maxtimestr = pd.to_datetime(maxtime).strftime('%Y-%m-%dT%H:%M:%S')


        project_name = 'TERIFIC'
        operator_name = "EFW"
        institution_name = 'National Oceanography Centre, UK'


        # Create a dictionary of attributes
        attr_dict = {"Platform_ID": PID1,
                     "End Time": maxtimestr,
                     "Project": project_name,
                     "Originator": operator_name,
                     "Institution": institution_name,
                     "Date created": dstr,
                    }


        ds = ds.assign_attrs(attr_dict)

        ds = ds.drop('Platform_ID')

        # Save file to raw - only if it doesn't already exist
        enddate = pd.to_datetime(maxtime).strftime('%Y%m%d')
        fname = 'pid'+str(PID1)+'_'+enddate+'.nc'

        outfile_with_path = cat_raw_path(fname)
        if not os.path.isfile(outfile_with_path):
            ds.to_netcdf(cat_raw_path(fname))
        else:
            print(outfile_with_path+' already exists!')


../01-data/01-raw/pid300234066516050_20211117.nc already exists!
../01-data/01-raw/pid300234068343380_20210121.nc already exists!
../01-data/01-raw/pid300234068243550_20210822.nc already exists!
../01-data/01-raw/pid300234066514020_20211018.nc already exists!
../01-data/01-raw/pid300234066515050_20210428.nc already exists!
../01-data/01-raw/pid300234068348010_20220104.nc already exists!
../01-data/01-raw/pid300234066514030_20210525.nc already exists!
../01-data/01-raw/pid300234066514040_20210226.nc already exists!
../01-data/01-raw/pid300234066514050_20200213.nc already exists!
../01-data/01-raw/pid300234066513030_20211209.nc already exists!
../01-data/01-raw/pid300234068343430_20210404.nc already exists!
../01-data/01-raw/pid300234066416780_20220102.nc already exists!
../01-data/01-raw/pid300234068244620_20220104.nc already exists!
../01-data/01-raw/pid300234066513040_20201020.nc already exists!
../01-data/01-raw/pid300234066513050_20220101.nc already exists!
../01-data/01-raw/pid3002

KeyboardInterrupt: 