In [44]:
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
import os
import glob
import re
import sys
from datetime import timedelta
import datetime

In [4]:
# Local import 
# > Make sure SIO_wrap dir is on the same path as this script.

from SIO_wrap import dir_tree, fnames



/Users/eddifying/Python/drifters/ already exists
/Users/eddifying/Python/drifters/02-code/ already exists
/Users/eddifying/Python/drifters/02-code/SIO_wrap/ already exists
/Users/eddifying/Python/drifters/01-data/04-aux/ already exists
/Users/eddifying/Python/drifters/01-data/02-processed/ created


In [14]:

####################---------   LOCAL FUNCTIONS    ---------####################
def mysplit(s, delim=None):
    """
    Split a string and throw away any empty pieces.

    Parameters
    ----------
    s : str
        Text to split.
    delim : str, optional
        Separator handed straight to ``str.split``.  The default, None,
        splits on runs of whitespace.

    Returns
    -------
    list of str
        The non-empty substrings of ``s``, in their original order.

    Notes
    -----
    ``str.split`` with an explicit delimiter keeps empty strings (for
    example the one that follows a trailing delimiter).  With the TERIFIC
    GDP data this produced an additional empty column, so the empty strings
    are removed here.  [last checked: Aug 2021]
    """
    pieces = s.split(delim)
    # filter(None, ...) keeps only truthy items, i.e. the non-empty strings.
    return list(filter(None, pieces))



In [5]:
####################-----------   USER EDITS    ------------####################

# Path where data are saved. Can be changed in file SIO_wrap/dir_tree.py
data_dir = dir_tree.dir_out

# SIO username and password.
# Credentials should not be edited into a shared notebook: set the
# environment variables SIO_USERNAME / SIO_PASSWORD to supply them.  The
# values below are only the fallback used when those variables are not set,
# so existing setups keep working unchanged.
username = os.environ.get("SIO_USERNAME", "uk-noc")
password = os.environ.get("SIO_PASSWORD", "noc-drifter")

# Download URL (main body without the start date)
download_url = ("https://gdp.ucsd.edu/cgi-bin/projects/uk-noc/"
                "drifter.py?start_date=") 

# Download start date must have format yyyy-mm-dd. Default is set to the
# beginning of the TERIFIC project, i.e 2019-12-04.
download_start_date = "2019-12-04"
print("\nDefault download start date: %s\n" % download_start_date)

# String formatting for time for:
#   - the download url, 
#   - appending to the filename
#   - the data time column, respectively.
url_strftime = '%Y-%m-%d'
tstamp_strftime = '%Y%m%d'
timcol_strftime = '%Y-%m-%d %H:%M:%S'

# Fail early (ValueError) if the user-edited start date is not yyyy-mm-dd,
# instead of sending a malformed date to the server.
datetime.datetime.strptime(download_start_date, url_strftime)




Default download start date: 2019-12-04



In [6]:
# Extract a list with the names of existing raw data files.
existing_files = glob.glob(os.path.join(data_dir, fnames.fname_rawdata + '*'))

# ~ ~ print update ~ ~ 
if existing_files:
    print("Existing raw data files: \n%s" % existing_files)
else:
    print("No previous raw data files.\n")

# ~ ~ filenaming convention ~ ~
# If there are multiple files with raw data (i.e. non-updated datasets), select 
# the latest one updated.
# The file names are distinguished by the timestamp appended to the filename 
# and has <tstamp_strftime> format (see 'user edits' section).
# The data are cropped such that the last day is fully sampled (spans 0h-23h).
# The timestamp in the filename is the latest downloaded fully sampled day.

if existing_files:

    # Extract the timestamp part of the filename(s) in a list.
    # Only the base name is searched, so an 8-digit run somewhere in the
    # directory path cannot be mistaken for a timestamp.  The pattern is a raw
    # string because "\d" is not a valid escape in a normal string literal.
    tstamp = [date for file in existing_files
              for date in re.findall(r"(\d{8})", os.path.basename(file))]

    if not tstamp:
        # Without this check the failure would surface as an opaque argmax
        # error on an empty sequence a few lines further down.
        raise ValueError("No %s timestamp found in the existing raw data "
                         "file names: %s" % (tstamp_strftime, existing_files))

    # Convert to datetime and pick the most recent timestamp
    tstamp_date = pd.to_datetime(tstamp, format=tstamp_strftime)
    prev_fname_timestamp = tstamp[tstamp_date.argmax()]

    # Load the previously updated file
    prev_fname = f"{fnames.fname_rawdata}{prev_fname_timestamp}.nc"
    prev_fpath = os.path.join(data_dir, prev_fname)
    prev_ds = xr.open_dataset(prev_fpath)

    # Set download start date to +1 day from the last day of previous dataset
    latest_date = tstamp_date.max()
    download_start_date = (latest_date 
                           + timedelta(days=1)).strftime(url_strftime)

    print("Download start date changed to: %s\n" % download_start_date)

# Combine body of download link with start date
data_url = download_url + download_start_date

No previous raw data files.



In [8]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# Scrape data from website 
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

print("Scraping data from the website starting from: %s \n.......\n" 
        % download_start_date)

data_html = requests.get(data_url, auth=(username, password))

# Stop here on an HTTP error (e.g. 401 for rejected credentials or a 5xx
# server error) instead of parsing the error page as if it were drifter data.
# If print(data_html) outputs <Response [200]> then the request worked.
# To print content: data_html.content
data_html.raise_for_status()


# Extract text from the html page
print("Parsing web data ...")

data_soup = BeautifulSoup(data_html.text, "html.parser") 
data_text = data_soup.text


# Split the text after every newline character '\n' into separate rows
print("Splitting text into separate rows ...")

data_rows = data_text.splitlines() # basically CSV



Scraping data from the website starting from: 2019-12-04 
.......

Parsing web data ...
Splitting text into separate rows ...


In [12]:
# Peek at the first three rows: the header line followed by two data records.
data_rows[0:3] # basically CSV

['Platform-ID, Timestamp(UTC), GPS-Latitude(deg), GPS-Longitude(deg), SST(degC), SLP(mB), Battery(volts), Drogue (cnts), GPS-HDOP, GPS-FixDelay, GPS-TTFF, GPS-NumSat, SBD-Transmit-Delay, SBD-Retries, ',
 '300234066416410, 2020-09-01 00:00:00, 53.799, -48.206, 12.27, 850.00, 10.2, 39, 1.00, 0, 1, 6, 10, 0, ',
 '300234066416410, 2020-08-31 23:00:00, 53.786, -48.201, 12.28, 850.00, 10.2, 36, 1.00, 0, 1, 8, 0, 0, ']

In [15]:
# Further split the data into columns, but first:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# Edit the header
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
#
# The header is the first row of the download and is repeated for every
# drifter.  Drifters are identified by their Platform ID, so the repeated
# header lines are dropped from the data.  The header itself is reused to name
# the data columns once spaces, dashes and parentheses have been dealt with.

header_raw = data_rows[0]

print("\nHeader format before processing:\n%s\n" % header_raw)

# Keep every row that does not contain the header text
data_rows_clean = [row for row in data_rows if header_raw not in row]

# Character-by-character clean-up of the header:
#   space -> removed,  '-' -> '_',  '(' -> '_',  ')' -> removed
# [!!!] These might change if there are new columns added/names change. 
_header_cleanup = str.maketrans({" ": None, "-": "_", "(": "_", ")": None})
header = header_raw.translate(_header_cleanup)

# Break the cleaned header into the individual column names
col_names = mysplit(header, ',')

print("\nHeader after removing unwanted spaces and characters:\n%s\n" % 
      col_names)




Header format before processing:
Platform-ID, Timestamp(UTC), GPS-Latitude(deg), GPS-Longitude(deg), SST(degC), SLP(mB), Battery(volts), Drogue (cnts), GPS-HDOP, GPS-FixDelay, GPS-TTFF, GPS-NumSat, SBD-Transmit-Delay, SBD-Retries, 


Header after removing unwanted spaces and characters:
['Platform_ID', 'Timestamp_UTC', 'GPS_Latitude_deg', 'GPS_Longitude_deg', 'SST_degC', 'SLP_mB', 'Battery_volts', 'Drogue_cnts', 'GPS_HDOP', 'GPS_FixDelay', 'GPS_TTFF', 'GPS_NumSat', 'SBD_Transmit_Delay', 'SBD_Retries']



In [16]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# Store data in a pandas dataframe 
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

print("Create a pandas dataframe....")

# > One list of fields per data row; fields are separated by comma + space
df = [mysplit(row, ', ') for row in data_rows_clean]

# > Column names come from the edited header
data_df = pd.DataFrame(df, columns=col_names)

# > Turn the time column from text into datetime64 values
time_colname = 'Timestamp_UTC'
parsed_time = pd.to_datetime(data_df[time_colname], format=timcol_strftime)
data_df[time_colname] = parsed_time

# > Order the rows chronologically
print("Sorting rows by time ..\n")
data_df = data_df.sort_values(by=time_colname)

Create a pandas dataframe....
Sorting rows by time ..



In [30]:
# Display the DataFrame to check the column names and the time ordering.
data_df

Unnamed: 0,Platform_ID,Timestamp_UTC,GPS_Latitude_deg,GPS_Longitude_deg,SST_degC,SLP_mB,Battery_volts,Drogue_cnts,GPS_HDOP,GPS_FixDelay,GPS_TTFF,GPS_NumSat,SBD_Transmit_Delay,SBD_Retries
353188,300234066513060,2019-12-04 20:09:00,60.169,-47.039,-3.48,850.00,10.8,60,5.00,0,33,5,5,0
416074,300234066515010,2019-12-04 20:09:00,60.169,-47.038,-3.43,850.00,11.4,43,3.00,0,27,5,5,0
335207,300234066513050,2019-12-04 20:09:00,60.169,-47.039,-3.79,850.00,11.4,47,23.00,0,39,4,5,0
386391,300234066514030,2019-12-04 20:13:00,60.160,-47.051,-3.60,850.00,11.4,49,20.00,0,55,5,45,1
335206,300234066513050,2019-12-04 20:13:00,60.161,-47.049,-3.18,850.00,11.4,57,2.00,0,18,7,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1567023,300234068345490,2022-01-08 07:01:00,51.565,-53.826,-0.03,850.00,10.2,46,5.00,0,73,4,20,0
1384183,300234068343500,2022-01-08 07:01:00,57.244,-48.455,3.55,850.00,9.4,27,3.00,0,74,5,5,0
1594219,300234068346100,2022-01-08 07:01:00,61.747,-58.823,2.01,850.00,10.2,49,3.00,0,71,4,10,0
1096605,300234068342020,2022-01-08 07:02:00,42.618,-30.499,15.22,850.00,10.0,26,10.00,0,116,4,10,0


In [36]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# convert pandas dataset to xarray dataset 
# (easier to save as netcdf file)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# Build a dictionary of {name: (dimension, values)}; the numeric columns go in
# first.  The time variable is treated separately because it has type
# datetime64[ns].
#
# Every variable is defined along a single dimension called "time".  The
# time values themselves are stored under that same name; note that they are
# not unique (several drifters can report at the same instant).
print("Converting DataFrame to xarray Dataset..\n")
dd = {}		# create an empty dict

for coln in col_names:
    if coln != time_colname:
        # Parse the text column once; pd.to_numeric picks an integer or float
        # dtype as appropriate, so the parsed values are used directly rather
        # than converting the raw strings a second time.
        var = pd.to_numeric(data_df[coln])
        dd[coln] = ("time", var.values)


# changed the time variable name to 'time'
dd["time"] = ("time", data_df[time_colname].values)

# xarray dataset
ds = xr.Dataset(dd)


Converting DataFrame to xarray Dataset..



In [38]:
# Display the xarray Dataset to inspect its dimension and variables.
ds

In [49]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# crop data so that the last day is fully sampled and there are no overlaps
# when the data are updated; basically discard the last day if it's incomplete
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Last timestamp of the download (the rows were sorted by time before the
# Dataset was built, so the last element is the most recent one).
end_datetime = pd.to_datetime(ds.time.values[-1])
end_datestr = end_datetime.strftime(url_strftime)

# The day before the last (possibly incomplete) day is the last full day.
penultimate_datetime = end_datetime - timedelta(days=1)
penultimate_datestr = penultimate_datetime.strftime(url_strftime)

# Nothing new to add if the download starts on the (incomplete) last day.
if download_start_date == end_datestr:
    sys.exit("No updated data. Last full day available is %s" 
        % penultimate_datestr)

# Midnight at the start of the last day: everything from here on is dropped.
cutoff_date = pd.to_datetime(end_datestr +" 00:00:00", format=timcol_strftime)

# NOTE(review): where() fills masked positions with NaN, which presumably
# promotes integer variables to float in the cropped dataset - confirm this is
# acceptable for downstream use.
ds_crop = ds.where(ds.time<cutoff_date, drop=True)

# timestamp for filename
fname_timestamp = penultimate_datetime.strftime(tstamp_strftime)

# stitch together the files
if len(existing_files) > 0:
    print("Stitch updated dataset with the previous one. \n")
    # append the new records to the previously opened dataset (prev_ds)
    new_ds = xr.concat([prev_ds, ds_crop], dim='time')

else:
    new_ds = ds_crop


# Filename and path of (updated) dataset
update_fname = f"{fnames.fname_rawdata}{fname_timestamp}.nc"
update_fpath = os.path.join(data_dir, update_fname)


new_ds.attrs["filename"] = update_fname
new_ds.attrs["Created"] = datetime.datetime.now().strftime('%Y/%m/%d')


# Save dataset to netcdf file.
# The save had been commented out while the message below still reported the
# file as saved; the next run looks for this file to decide where to resume
# the download, so it has to be written.
new_ds.to_netcdf(update_fpath)

print("File updated/saved: %s" % update_fpath)
print("Script 1 finished. \n")



File updated/saved: /Users/eddifying/Python/drifters/01-data/02-processed/drifter_data_raw_20220107.nc
Script 1 finished. 



In [50]:
# Display the final Dataset (cropped, and stitched to the previous file if one existed).
new_ds