In [49]:
import pandas as pd
import matplotlib.pyplot as plt
from data_explore import viz

In [50]:
fname = '../data/SOCATv2024-1d_005deg-colloc-r20250224/SOCATv2024_1d_005deg_collocated_2015-r20250224.pq'
df = pd.read_parquet(fname)
df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sst_deg_c,sal,pppp_hpa,woa_sss,ncep_slp_hpa,dist_to_land_km,fco2rec_uatm,fco2rec_flag,n_samples,chl_globcolour,...,time_sstcci,lat_sstcci,lon_sstcci,sss_cci,sss_cci_random_error,lat_ssscci,lon_ssscci,time_ssscci,mld_dens_soda,time_avg
expocode,time_1d,lat_005,lon_005,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
06AQ20141203,2015-01-01,-70.525,351.925,-1.628297,34.254623,992.383057,33.980877,991.667175,67.0,387.908463,2,1005,0.210151,...,2015-01-01,-70.525002,351.924988,,,-70.625,351.875,2015-01-01,5.980542,2014-12-31 23:55:46.391049
06AQ20141203,2015-01-02,-70.575,350.925,-1.549885,34.321327,986.0,33.818001,985.900024,66.770493,382.748426,2,61,0.203763,...,2015-01-03,-70.574997,350.924988,,,-70.625,350.875,2015-01-02,15.35881,2015-01-02 11:11:53.311475
06AQ20141203,2015-01-02,-70.575,350.975,-0.8544,34.318066,986.0,33.818001,985.900024,68.26667,336.618267,2,15,0.197458,...,2015-01-03,-70.574997,350.975006,,,-70.625,350.875,2015-01-02,15.35881,2015-01-02 10:02:58.933333
06AQ20141203,2015-01-02,-70.525,350.975,-0.95675,34.309502,986.0,33.818001,985.900024,69.5,361.26825,2,4,0.214619,...,2015-01-03,-70.525002,350.975006,,,-70.625,350.875,2015-01-02,15.35881,2015-01-02 09:52:14.750000
06AQ20141203,2015-01-02,-70.525,351.025,-0.824909,34.325909,986.0,33.787998,985.900024,71.272728,337.921273,2,11,0.209839,...,2015-01-03,-70.525002,351.024994,,,-70.625,351.125,2015-01-02,17.02808,2015-01-02 09:43:53.000000


# calculate distances between current ship position and last position

In [3]:
import numpy as np
from haversine import haversine_vector


expocodes = df.index.get_level_values('expocode').unique()
cruise_id = expocodes[210]
cruise = df.loc[cruise_id].sort_values(by='time_avg')

In [4]:
cruise.head()

lats = cruise.index.get_level_values('lat_005').to_numpy().reshape(-1, 1)
lons = cruise.index.get_level_values('lon_005').to_numpy().reshape(-1, 1)
coords = np.concatenate([lons, lats], axis=1)
coords_prev = np.zeros_like(coords)
coords_prev[1:] = coords[:-1]
distances = haversine_vector(coords, coords_prev, normalize=True)
distances[0] = -1

In [5]:
ddf = pd.DataFrame(distances, index=cruise.index, columns=['d'])
bins = [-1, 1,4,6,10, 20, 50, 100, 200, 2000, 5000, 10000]
binned = pd.cut(ddf.d, bins)
bin_counts = binned.value_counts().sort_index()
bin_counts

d
(-1, 1]           91
(1, 4]             1
(4, 6]           139
(6, 10]            0
(10, 20]          10
(20, 50]           8
(50, 100]          6
(100, 200]         0
(200, 2000]        0
(2000, 5000]       0
(5000, 10000]      0
Name: count, dtype: int64

In [6]:
ddft = pd.concat([ddf, cruise['time_avg'], pd.DataFrame(range(len(ddf)), index=ddf.index, columns=['ix'])], axis=1)
cruise_feat = ddft # select the cruise that we want with satellite temperature data
print(ddft.head())


map = viz.plot_cruise_interactive_scatter_map(cruise_feat.d, cmap='coolwarm', bins=20)
map  # uncomment to show the map

                                   d                   time_avg  ix
time_1d    lat_005 lon_005                                         
2015-04-14 25.625  279.875 -1.000000 2015-04-13 13:18:35.000000   0
                   279.925  5.559754 2015-04-13 13:18:40.055555   1
           25.575  279.875  5.641327 2015-04-13 14:06:27.700000   2
           25.525  279.875  0.953494 2015-04-13 14:29:13.000000   3
           25.475  279.875  0.953494 2015-04-13 14:42:19.000000   4


In [45]:
cruise_feat.iloc[[178, 179]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,d,time_avg
time_1d,lat_005,lon_005,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-04-16,25.325,278.375,67.053085,2015-04-16 07:00:39.071428
2015-04-16,24.975,278.925,61.436476,2015-04-16 07:07:52.000000


# Create training set
1. sort the entries for each cruise by time_avg to determine the coordinate sequence order, however there are some weird cases (2015, '33WA20150413')
2. calculate distances from every location to the location at the previous time step
3. for every entry, sum the distances up to that entry to derive total traveled distance
4. iterate through the locations and cut at 64*5 kms (or if two much time/space passed between two subsequent entries)
4. bin traveled distances in buckets of 5km width for each segment

In [140]:
from itertools import accumulate, chain
import numpy as np
from haversine import haversine_vector

def get_dtoprev(srt_cruise):
    # calculate distance in kms to previous locations
    coords = np.column_stack([
        srt_cruise.index.get_level_values('lon_005').to_numpy(),
        srt_cruise.index.get_level_values('lat_005').to_numpy()
    ])
    
    d = np.full(coords.shape[0], np.nan, dtype=np.float32)
    if coords.shape[0] > 1:
        d[1:] = haversine_vector(coords[1:], coords[:-1], normalize=True)
    
    return d

def divide_cruise(cruise, num_windows=64, len_window=5, max_time_delta=pd.Timedelta(days=1000), max_d_delta=np.inf):
    
    srt_cruise = cruise.sort_values(by='time_avg')
    d_diff = get_dtoprev(srt_cruise)
    time_diff = srt_cruise['time_avg'].diff()
    track_len= num_windows * len_window

    cs = 0
    segs = []
    cur_seg = [0]
    for (i, dprev) in enumerate(d_diff[1:]):
        cs += dprev
        # if the segment length exceeds 64*5 kms or the time jumps than max_time_delta or jumps more than max_d_delta kilometers
        # end the segment
        if cs >= track_len or time_diff.iloc[i] >= max_time_delta or dprev >= max_d_delta:
            segs.append(cur_seg)
            cur_seg = [0]
            cs = 0
        else:
            cur_seg.append(dprev.astype(np.float32))
    if cur_seg:
        segs.append(cur_seg)

    ix_segs = chain(*[[i]*len(seg) for (i, seg) in enumerate(segs)])
    cum_segs = chain(*[list(accumulate(seg)) for seg in segs])
    
    sdf = pd.DataFrame({'segment_id': list(ix_segs), 'track_length':list(cum_segs)}, index=srt_cruise.index)
    bins = np.arange(-len_window / 2., track_len + len_window, len_window)
    cut_sdf = pd.cut(sdf.track_length, bins=bins, labels=False)
    
    cruise['bin_segment_id'] = cut_sdf.values
    cruise['segment_id'] = sdf.segment_id
    return cruise

In [210]:
expocodes = df.index.get_level_values('expocode').unique()
# cruise_id = expocodes[1]
cruise_id = '06AQ20141203'
cruise = df.loc[cruise_id]
divide_cruise(cruise, num_windows=64, len_window=5, max_time_delta=pd.Timedelta(days=1))
map = viz.plot_cruise_interactive_scatter_map(cruise.segment_id ** ((cruise.segment_id % 2 == 0) * 100), cmap='viridis') # some dumb trick to get had color changes
# map = viz.plot_cruise_interactive_scatter_map(cruise['5kmtracks'], cmap='viridis')
map

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cruise['bin_segment_id'] = cut_sdf.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cruise['segment_id'] = sdf.segment_id


In [135]:
df2 = df.groupby(level=0, group_keys=False).apply(
    lambda cruise: divide_cruise(cruise,
                                 num_windows=64,
                                 len_window=5, 
                                 max_time_delta=pd.Timedelta(days=1)
                                 )
    )

In [142]:
df2.columns

Index(['sst_deg_c', 'sal', 'pppp_hpa', 'woa_sss', 'ncep_slp_hpa',
       'dist_to_land_km', 'fco2rec_uatm', 'fco2rec_flag', 'n_samples',
       'chl_globcolour', 'chl_globcolour_uncert', 'chl_globcolour_flags',
       'time_chlgc', 'lat_chlgc', 'lon_chlgc', 'temp_soda', 'salt_soda',
       'lon_soda', 'lat_soda', 'depth_soda', 'time_soda', 'ssh_adt', 'ssh_sla',
       'lat_sshcm', 'lon_sshcm', 'time_sshcm', 'sst_cci',
       'sst_cci_uncertainty', 'ice_cci', 'time_sstcci', 'lat_sstcci',
       'lon_sstcci', 'sss_cci', 'sss_cci_random_error', 'lat_ssscci',
       'lon_ssscci', 'time_ssscci', 'mld_dens_soda', 'time_avg',
       'bin_segment_id', 'segment_id'],
      dtype='object')

In [136]:
df2.head()
assert((df2.index == df.index).all())
print(df.index[:10])

MultiIndex([('06AQ20141203', '2015-01-01', -70.52499999999999, 351.925),
            ('06AQ20141203', '2015-01-02', -70.57499999999999, 350.925),
            ('06AQ20141203', '2015-01-02', -70.57499999999999, 350.975),
            ('06AQ20141203', '2015-01-02', -70.52499999999999, 350.975),
            ('06AQ20141203', '2015-01-02', -70.52499999999999, 351.025),
            ('06AQ20141203', '2015-01-02', -70.52499999999999, 351.075),
            ('06AQ20141203', '2015-01-02', -70.52499999999999, 351.925),
            ('06AQ20141203', '2015-01-02', -70.52499999999999, 351.975),
            ('06AQ20141203', '2015-01-02', -70.52499999999999, 352.025),
            ('06AQ20141203', '2015-01-02',            -70.475, 351.075)],
           names=['expocode', 'time_1d', 'lat_005', 'lon_005'])


In [143]:
df_grouped = df2.groupby([pd.Grouper(level=0), 'segment_id', 'bin_segment_id']).mean()

In [213]:
def df_to_numpy(df2):
    binned = df2.groupby([pd.Grouper(level=0), 'segment_id', 'bin_segment_id']).mean() # first bin all values in the 5km buckets
    # index level <binned> : [expocode, segment_id, bin_segment_id]
    bins_per_seg = binned.groupby([pd.Grouper(level=0), pd.Grouper(level=1)]).size() # number of non-empty buckets per segment
    num_segs_tot = bins_per_seg.size # total number of segments in dataset
    segs_per_expocode = bins_per_seg.groupby(level=0).size() # number of segments per expocode
    
    offsets = segs_per_expocode.values.cumsum() # offsets to index segments in expocode table
    offsets_expocode = offsets - offsets[0] # offsets at expocode level
    offsets_seg = np.repeat(offsets_expocode, segs_per_expocode.values)
    offsets_seg = offsets_seg + bins_per_seg.index.get_level_values(level=1).to_numpy() # offset for each segment
    offsets_seg_per_bin = np.repeat(offsets_seg, bins_per_seg) # offset for each bin (row index in dataset does not change)

    dataset = np.full((num_segs_tot, 65), np.nan, dtype=np.float32)
    bin_ids = binned.index.get_level_values(level=2).to_numpy()
    dataset[offsets_seg_per_bin, bin_ids] = binned.fco2rec_uatm.values

    return dataset


    

In [214]:
df_to_numpy(df2)

array([[387.90848,       nan, 382.7484 , ...,       nan,       nan,
              nan],
       [295.94403, 276.03098,       nan, ...,       nan,       nan,
              nan],
       [348.27432,       nan,       nan, ..., 322.56458, 336.46738,
        356.4901 ],
       ...,
       [      nan,       nan,       nan, ...,       nan,       nan,
              nan],
       [      nan,       nan,       nan, ...,       nan,       nan,
              nan],
       [      nan,       nan,       nan, ...,       nan,       nan,
              nan]], shape=(14514, 65), dtype=float32)

In [145]:
df_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sst_deg_c,sal,pppp_hpa,woa_sss,ncep_slp_hpa,dist_to_land_km,fco2rec_uatm,fco2rec_flag,n_samples,chl_globcolour,...,time_sstcci,lat_sstcci,lon_sstcci,sss_cci,sss_cci_random_error,lat_ssscci,lon_ssscci,time_ssscci,mld_dens_soda,time_avg
expocode,segment_id,bin_segment_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
06AQ20141203,0,0,-1.628297,34.254623,992.383057,33.980877,991.667175,67.0,387.908463,2.0,1005.0,0.210151,...,2015-01-01,-70.525002,351.924988,,,-70.625,351.875,2015-01-01,5.980542,2014-12-31 23:55:46.391049
06AQ20141203,0,2,-1.549885,34.321327,986.0,33.818001,985.900024,66.770493,382.748426,2.0,61.0,0.203763,...,2015-01-03,-70.574997,350.924988,,,-70.625,350.875,2015-01-02,15.35881,2015-01-02 11:11:53.311475
06AQ20141203,0,4,-0.8544,34.318066,986.0,33.818001,985.900024,68.26667,336.618267,2.0,15.0,0.197458,...,2015-01-03,-70.574997,350.975006,,,-70.625,350.875,2015-01-02,15.35881,2015-01-02 10:02:58.933333
06AQ20141203,0,6,-0.95675,34.309502,986.0,33.818001,985.900024,69.5,361.26825,2.0,4.0,0.214619,...,2015-01-03,-70.525002,350.975006,,,-70.625,350.875,2015-01-02,15.35881,2015-01-02 09:52:14.750000
06AQ20141203,0,7,-1.110143,34.240715,983.714294,33.787998,983.799988,83.571426,321.854857,2.0,7.0,0.228982,...,2015-01-03,-70.425003,351.274994,,,-70.375,351.375,2015-01-03,13.481891,2015-01-03 02:22:46.571428


In [195]:
df_segs = df_grouped.groupby([pd.Grouper(level=0), pd.Grouper(level=1)]).size()
num_segs_tot = df_segs.size
print(num_segs_tot)
df_segs

14514


expocode      segment_id
06AQ20141203  0             30
              1             22
              2             43
              3             40
              4             32
                            ..
PAT520151219  19            46
              20            10
PAT520151229  0             50
              1             48
              2             35
Length: 14514, dtype: int64

In [196]:
num_segs_expocode = df_segs.groupby(level=0).size()
offsets = num_segs_expocode.values.cumsum()
offsets = offsets - offsets[0]

In [197]:
dataset = np.full((num_segs_tot, 65), np.nan, dtype=np.float32)
rep_offsets = np.repeat(offsets, num_segs_expocode.to_numpy())
rep_offsets


array([    0,     0,     0, ..., 14484, 14484, 14484], shape=(14514,))

In [200]:
segs_ids = df_segs.index.get_level_values(level=1).to_numpy()
segs_indices = segs_ids + rep_offsets
bin_segs_indices = np.repeat(segs_indices, df_segs.values)
df_grouped['row_index'] = bin_segs_indices

In [212]:
bin_ids = df_grouped.index.get_level_values(level=2).to_numpy()
dataset[bin_segs_indices, bin_ids] = df_grouped.fco2rec_uatm.values

dataset[50]

array([422.8451 , 417.99057, 418.454  ,       nan, 417.5498 , 417.6143 ,
       417.22122, 417.3547 , 417.22366, 417.40704, 417.44568,       nan,
       416.91327, 417.09857, 416.927  , 416.83926, 417.02682,       nan,
       418.18524, 418.34744, 418.45074, 418.70264, 418.52774, 418.5302 ,
       418.64786, 418.4645 , 418.24072,       nan, 418.193  , 418.5995 ,
       418.4621 ,       nan,       nan,       nan, 418.69513, 419.54926,
       419.1335 , 420.182  , 420.1306 , 420.0617 , 420.14026,       nan,
       420.1706 , 420.0063 , 420.9452 , 421.83084, 421.854  ,       nan,
       423.2661 , 423.6641 , 423.42667, 423.916  , 423.74023, 424.1955 ,
       425.03653, 425.05344, 425.9457 ,       nan, 425.74432, 426.35526,
             nan, 427.461  , 427.65442,       nan,       nan],
      dtype=float32)

In [217]:
d1 = df_to_numpy(df2)

In [227]:
np.allclose(d1, dataset, equal_nan=True)

True

In [225]:
dataset[10]

array([404.649  , 401.59775, 400.09772, 398.1974 , 395.316  ,       nan,
       398.77853, 397.7773 , 395.6705 , 393.32266,       nan,       nan,
             nan, 385.468  ,       nan, 384.1415 , 381.7126 , 377.5074 ,
       369.05   , 366.71487, 363.24228, 371.45084, 370.6782 , 367.8244 ,
             nan, 358.50217, 339.88   , 346.7485 , 351.58667, 340.3112 ,
       339.0455 , 337.3294 , 334.1951 , 339.0975 ,       nan, 342.77   ,
       335.72415,       nan,       nan, 355.242  , 355.46866, 351.141  ,
             nan, 329.07   , 320.90216, 324.6996 , 332.1556 , 345.90375,
       333.16208, 343.7648 , 368.42813, 375.85892,       nan, 371.26593,
       365.84366, 364.79764, 362.8316 , 374.68564,       nan,       nan,
             nan, 318.85403, 301.33765, 275.34622, 261.2719 ],
      dtype=float32)

In [224]:
d1[10]

array([404.649  , 401.59775, 400.09772, 398.1974 , 395.316  ,       nan,
       398.77853, 397.7773 , 395.6705 , 393.32266,       nan,       nan,
             nan, 385.468  ,       nan, 384.1415 , 381.7126 , 377.5074 ,
       369.05   , 366.71487, 363.24228, 371.45084, 370.6782 , 367.8244 ,
             nan, 358.50217, 339.88   , 346.7485 , 351.58667, 340.3112 ,
       339.0455 , 337.3294 , 334.1951 , 339.0975 ,       nan, 342.77   ,
       335.72415,       nan,       nan, 355.242  , 355.46866, 351.141  ,
             nan, 329.07   , 320.90216, 324.6996 , 332.1556 , 345.90375,
       333.16208, 343.7648 , 368.42813, 375.85892,       nan, 371.26593,
       365.84366, 364.79764, 362.8316 , 374.68564,       nan,       nan,
             nan, 318.85403, 301.33765, 275.34622, 261.2719 ],
      dtype=float32)

In [216]:
assert((df_to_numpy(df2) == dataset).all())

AssertionError: 

In [201]:
df_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sst_deg_c,sal,pppp_hpa,woa_sss,ncep_slp_hpa,dist_to_land_km,fco2rec_uatm,fco2rec_flag,n_samples,chl_globcolour,...,lat_sstcci,lon_sstcci,sss_cci,sss_cci_random_error,lat_ssscci,lon_ssscci,time_ssscci,mld_dens_soda,time_avg,row_index
expocode,segment_id,bin_segment_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
06AQ20141203,0,0,-1.628297,34.254623,992.383057,33.980877,991.667175,67.0,387.908463,2.0,1005.0,0.210151,...,-70.525002,351.924988,,,-70.625,351.875,2015-01-01,5.980542,2014-12-31 23:55:46.391049,0
06AQ20141203,0,2,-1.549885,34.321327,986.0,33.818001,985.900024,66.770493,382.748426,2.0,61.0,0.203763,...,-70.574997,350.924988,,,-70.625,350.875,2015-01-02,15.35881,2015-01-02 11:11:53.311475,0
06AQ20141203,0,4,-0.8544,34.318066,986.0,33.818001,985.900024,68.26667,336.618267,2.0,15.0,0.197458,...,-70.574997,350.975006,,,-70.625,350.875,2015-01-02,15.35881,2015-01-02 10:02:58.933333,0
06AQ20141203,0,6,-0.95675,34.309502,986.0,33.818001,985.900024,69.5,361.26825,2.0,4.0,0.214619,...,-70.525002,350.975006,,,-70.625,350.875,2015-01-02,15.35881,2015-01-02 09:52:14.750000,0
06AQ20141203,0,7,-1.110143,34.240715,983.714294,33.787998,983.799988,83.571426,321.854857,2.0,7.0,0.228982,...,-70.425003,351.274994,,,-70.375,351.375,2015-01-03,13.481891,2015-01-03 02:22:46.571428,0


In [124]:
import numpy as np
from haversine import haversine_vector

def get_dtoprev(srt_cruise):
    coords = np.column_stack([
        srt_cruise.index.get_level_values('lon_005').to_numpy(),
        srt_cruise.index.get_level_values('lat_005').to_numpy()
    ])
    
    d = np.full(coords.shape[0], np.nan, dtype=np.float32)
    if coords.shape[0] > 1:
        d[1:] = haversine_vector(coords[1:], coords[:-1], normalize=True)
    
    return d

srt_cruise = cruise.sort_values(by='time_avg')
d = get_dtoprev(srt_cruise)
print(np.nanmax(d))


67.053085


In [126]:

time_diff = srt_cruise['time_avg'].diff()
cs = 0
segs = []
cur_seg = [0]
threshold  = 64 * 5
for (i, dprev) in enumerate(d[1:]):
    cs += dprev
    if cs >= threshold or time_diff.iloc[i + 1] > pd.Timedelta(days=1):
        segs.append(cur_seg)
        cur_seg = [0]
        cs = 0
    else:
        cur_seg.append(dprev)
if cur_seg:
    segs.append(cur_seg)


In [130]:
i = 2
print(segs[i])
print(sum(segs[i]))

[0, np.float32(5.559754), np.float32(22.239016), np.float32(27.79877), np.float32(5.559754), np.float32(22.239016), np.float32(22.255), np.float32(5.559754), np.float32(5.559754), np.float32(5.6258974), np.float32(61.29826), np.float32(61.360184), np.float32(5.8169913), np.float32(28.114922), np.float32(33.623917)]
312.61096


In [145]:
from itertools import accumulate, chain
ix_segs = chain(*[[i]*len(seg) for (i, seg) in enumerate(segs)])
cum_segs = chain(*[list(accumulate(seg)) for seg in segs])

sdf = pd.DataFrame({'segment_id': list(ix_segs), 'track_length':list(cum_segs)}, index=cruise.index)
max_track = 64 * 5
bins = np.arange(-2.5, max_track + 5, 5)
print(bins)

cut_sdf = pd.cut(sdf.track_length, bins=bins, labels=False)

sdf.track_length.max()


[ -2.5   2.5   7.5  12.5  17.5  22.5  27.5  32.5  37.5  42.5  47.5  52.5
  57.5  62.5  67.5  72.5  77.5  82.5  87.5  92.5  97.5 102.5 107.5 112.5
 117.5 122.5 127.5 132.5 137.5 142.5 147.5 152.5 157.5 162.5 167.5 172.5
 177.5 182.5 187.5 192.5 197.5 202.5 207.5 212.5 217.5 222.5 227.5 232.5
 237.5 242.5 247.5 252.5 257.5 262.5 267.5 272.5 277.5 282.5 287.5 292.5
 297.5 302.5 307.5 312.5 317.5 322.5]


319.2693786621094

In [149]:
cut_sdf.head(100)

time_1d     lat_005  lon_005
2015-04-14  25.625   279.875     0
                     279.925     1
            25.575   279.875     2
            25.525   279.875     2
            25.475   279.875     3
                                ..
2015-04-15  25.375   277.825    10
                     277.775    11
            25.425   277.775    11
                     277.825    12
            25.475   277.825    12
Name: track_length, Length: 100, dtype: int64

In [151]:
print(len(bins))
map = viz.plot_cruise_interactive_scatter_map(sdf.segment_id, cmap='coolwarm', bins=len(bins))
map

66


# create horizonal tessellation
- every cell has 64*5 km width
- there is a row of cells every 0.5 degrees latitude
- latitude range: [-89.975, 89.975] (should be the same as some of the satellite data grid)

In [None]:
import numpy as np
import math

# mean earth radius - https://en.wikipedia.org/wiki/Earth_radius#Mean_radius
earth_radius = 6371.0088

lat_grid = np.arange(-89.975, 90, 0.05)
# for every latitude scale the circle on the earths surface parallel to the equator.
scaled_rads = np.cos(lat_grid / 360.0 * (2*math.pi)) * earth_radius
# for every fixed latitude calculate the length in longitudes of 5 kms
kms_in_lons = (5. / scaled_rads) / (2*math.pi) * 360
# cells per latitude
num_cells = 360 / kms_in_lons

0.044966022466674874


In [None]:
grid = []
num_cells_ceil = num_cells.astype(int) + 1
# calculate longitudinal coordinates along each latitude
for (i, lon) in enumerate(kms_in_lons):
    # num_cells_ceil contains the number of 5km segments around the earth at a given latitude (with circles paralles to the equator)
    n_5k_segments = num_cells_ceil[i]
    coords_lat = np.zeros(n_5k_segments)
    coords_lat[:] = lon * np.arange(0, n_5k_segments, 1, dtype=np.float32)
    # resize array so we get the chunkc of 64 cells
    coords64 = np.nan * np.ones(64 * np.ceil((n_5k_segments / 64), dtype=np.int32))
    coords64[:n_5k_segments] = coords_lat
    grid.append(coords64.reshape(-1, 64))

In [None]:
# create lat lon tuples to used as pandas multindex
lat_index = []
block_index = []
for lat, lons in zip(lat_grid, grid):
    for i in range(len(lons)):
        lat_index += [(lat, lon) for lon in lons[i] if not np.isnan(lon)]
        block_index += [i] * len(lat_index)

In [41]:
multi_index = pd.MultiIndex.from_tuples(lat_index, names=['lat_005', 'lon'])
grid_df = pd.DataFrame(block_index, index=multi_index, columns=["block_index"])

In [None]:
grid_df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,block_index
lat_005,lon,Unnamed: 2_level_1
-89.975,0.0,0
-89.975,103.054526,0
-89.975,206.109052,0
-89.975,309.163577,0
-89.925,0.0,0
-89.925,34.351517,0
-89.925,68.703035,0
-89.925,103.054552,0
-89.925,137.406069,0
-89.925,171.757587,0


In [None]:
import sys
sys.getsizeof(grid_df)

944904652