Code to split data into train and test sets

In [7]:
import pandas as pd
import numpy as np
import zipfile
from datetime import datetime
import sys
sys.path.append('../')
from utils.get_season import get_season

In [5]:
# Read the float/satellite table out of the zipped CSV.
# Both the archive and the member stream are opened in `with` blocks so their
# file handles are released as soon as the CSV has been parsed.
with zipfile.ZipFile("../../data/float_sat_normalized_depth.csv.zip") as zf:
    with zf.open('float_sat_normalized_depth.csv') as csv_file:
        df = pd.read_csv(csv_file)
# Parse the timestamps in one vectorized pass with an explicit format; this
# replaces the `date_parser=` argument of read_csv, which is deprecated in
# pandas 2.x.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')
df.head()

Unnamed: 0,float,latitude,longitude,cycleNumber,date,day_of_year_rad,PRES,PSAL,TEMP,BBP700,...,sat_Rrs_678,sat_aph_443,sat_par,sat_angstrom,sat_bb_678,sat_adg_443,sat_aot_869,normalized_depth,sat_adg_s,sat_bbp_s
0,1902303,49.236,-14.742,1,2021-05-06 02:03:16,2.167505,3,35.539001,12.378333,0.001788,...,0.000354,0.0357,27.891998,1.1175,0.001995,0.0083,0.0951,0.012931,0.018,1.600373
1,1902303,49.236,-14.742,1,2021-05-06 02:03:16,2.167505,4,35.539001,12.378333,0.001788,...,0.000354,0.0357,27.891998,1.1175,0.001995,0.0083,0.0951,0.017241,0.018,1.600373
2,1902303,49.236,-14.742,1,2021-05-06 02:03:16,2.167505,6,35.539001,12.378667,0.001788,...,0.000354,0.0357,27.891998,1.1175,0.001995,0.0083,0.0951,0.025862,0.018,1.600373
3,1902303,49.236,-14.742,1,2021-05-06 02:03:16,2.167505,8,35.539001,12.379,0.001789,...,0.000354,0.0357,27.891998,1.1175,0.001995,0.0083,0.0951,0.034483,0.018,1.600373
4,1902303,49.236,-14.742,1,2021-05-06 02:03:16,2.167505,10,35.539001,12.379,0.00179,...,0.000354,0.0357,27.891998,1.1175,0.001995,0.0083,0.0951,0.043103,0.018,1.600373


In [None]:
# Calendar month (1-12) of each observation, read through the vectorized
# datetime accessor instead of calling a Python lambda once per row.
df['month'] = df['date'].dt.month

In [11]:
# Keep only the columns needed for the split and collapse the frame to one row
# per distinct combination of them.
split_columns = ['float', 'latitude', 'longitude', 'cycleNumber', 'month']
df = df.loc[:, split_columns].drop_duplicates()

In [12]:
# Seasonality lookup.
# Each month maps to a two-element list of season codes; the second code is the
# first one shifted by half a year (Winter <-> Summer, Spring <-> Fall).
# Presumably the pair is [northern-hemisphere season, southern-hemisphere
# season] -- confirm against utils.get_season, which consumes this mapping.
quarter_season_codes = {
    0: (1, 3),  # Dec, Jan, Feb
    1: (2, 4),  # Mar, Apr, May
    2: (3, 1),  # Jun, Jul, Aug
    3: (4, 2),  # Sep, Oct, Nov
}
# `month % 12` folds December onto 0 so that Dec/Jan/Feb share one quarter.
month_seasons_dict = {
    month_number: list(quarter_season_codes[(month_number % 12) // 3])
    for month_number in range(1, 13)
}
# Human-readable names for the season codes (reference only; not used below).
seasons_dict = dict(enumerate(['Winter', 'Spring', 'Summer', 'Fall'], start=1))
# Season for every row, as decided by get_season from the row and the lookup.
df['season'] = df.apply(lambda row: get_season(row, month_seasons_dict), axis=1)

In [13]:
# Spatial strata for the train/test split: 10-degree-wide longitude and
# latitude bins.
# NOTE: pd.cut defaults to right-closed intervals without include_lowest, so a
# value exactly equal to the lowest edge (-180 / -90) is assigned a NaN bin.
lon_edges = np.arange(-180, 190, 10)
lat_edges = np.arange(-90, 100, 10)
df['lon_bin'] = pd.cut(df['longitude'], bins=lon_edges)
df['lat_bin'] = pd.cut(df['latitude'], bins=lat_edges)

In [14]:
# Identifier "<float>_<cycleNumber>" for each row; the split below uses it to
# keep test observations out of the training data.
float_label = df['float'].astype(str)
cycle_label = df['cycleNumber'].astype(str)
df['float_cycle'] = float_label + '_' + cycle_label

In [16]:
# Stratified train/test split over (lon_bin, lat_bin, season).
#   1. Take one observation from every populated stratum so that each stratum
#      is represented in the test set.
#   2. Top the test set up with TEST_FRACTION of the remaining rows per stratum.
# All sampling draws from a single seeded RNG, so a Restart & Run All
# reproduces exactly the same split.
SPLIT_SEED = 42       # arbitrary fixed seed; change it to draw a different split
TEST_FRACTION = 0.18  # share of the remaining rows sampled per stratum in step 2
STRATA = ['lon_bin', 'lat_bin', 'season']


def _exclude_float_cycles(candidates, taken):
    """Return a copy of the rows of `candidates` whose float_cycle is absent from `taken`."""
    already_taken = candidates['float_cycle'].isin(taken['float_cycle'])
    # `~` is the boolean NOT for a mask; `.copy()` makes the result an
    # independent frame so later column assignments do not warn.
    return candidates[~already_taken].copy()


def _report_test_share(test, train):
    """Print the test set's share of all rows as a rounded percentage."""
    print('test size: ',round(len(test)/(len(test)+len(train))*100,0),'%')


split_rng = np.random.RandomState(SPLIT_SEED)

# Step 1: one observation per populated stratum. observed=True restricts the
# categorical bin keys to combinations that actually occur in the data.
test_set = (
    df.groupby(STRATA, observed=True)
    .sample(n=1, random_state=split_rng)
    .reset_index(drop=True)
)
# Remove these observations from the train data
train_set = _exclude_float_cycles(df, test_set)
_report_test_share(test_set, train_set)

# Step 2: sample TEST_FRACTION of the remaining observations in each stratum
test_set_new = (
    train_set.groupby(STRATA, observed=True)
    .sample(frac=TEST_FRACTION, random_state=split_rng)
    .reset_index(drop=True)
)
test_set = pd.concat([test_set, test_set_new], ignore_index=True)
# Remove these observations from the train data
train_set = _exclude_float_cycles(train_set, test_set)
_report_test_share(test_set, train_set)


test size:  2.0 %
test size:  20.0 %


In [19]:
# Combine train and test into one dataframe with a binary indicator column
# (train=1 for training rows, train=0 for test rows).
# `.assign` returns a new frame, so the flag is never written into train_set
# while it is still a boolean-filtered slice of another DataFrame (the case
# that triggers SettingWithCopyWarning).
train_set = train_set.assign(train=1)
test_set = test_set.assign(train=0)
full_df = pd.concat([train_set, test_set], axis=0)
full_df.head()


Unnamed: 0,float,latitude,longitude,cycleNumber,month,season,lon_bin,lat_bin,float_cycle,train
136,1902303,49.096,-14.617,2,5,2,"(-20, -10]","(40, 50]",1902303_2,1
285,1902303,48.914,-14.606,3,5,2,"(-20, -10]","(40, 50]",1902303_3,1
408,1902303,48.794,-14.689,4,5,2,"(-20, -10]","(40, 50]",1902303_4,1
493,1902303,48.719,-14.795,5,5,2,"(-20, -10]","(40, 50]",1902303_5,1
610,1902303,48.638,-14.899,6,5,2,"(-20, -10]","(40, 50]",1902303_6,1


In [20]:
# Write the labelled split to disk; the row index is left out of the file.
split_csv_path = '../../data/float_sat_normalized_depth_train_test_split.csv'
full_df.to_csv(split_csv_path, index=False)