In [5]:
import numpy as np
import os

from functions.parse_data import synth_dataloader
from sklearn.model_selection import train_test_split


In [6]:
# Names of the clear-band columns that are removed from the data
Clear_bands = [
    'Clear_B01', 'Clear_B02', 'Clear_B03', 'Clear_B04', 'Clear_B05',
    'Clear_B06', 'Clear_B07', 'Clear_B08', 'Clear_B08A', 'Clear_B09',
    'Clear_B10', 'Clear_B11', 'Clear_B12',
]
# Load every column, turn the existing row labels into an extra column,
# then discard the clear bands
df = (
    synth_dataloader(path_name='SMHIdata', drop_cols=False)
    .reset_index(drop=False)
    .drop(columns=Clear_bands)
)
# 80% goes to training; the held-out 20% is halved into test and validation
# (same seed for both splits so the partition is reproducible)
train_df, testval_df = train_test_split(df, test_size=0.2, random_state=313)
test_df, val_df = train_test_split(testval_df, test_size=0.5, random_state=313)

# Show the first rows of the test split
test_df.head()

Unnamed: 0,index,Cloud_B01,Cloud_B02,Cloud_B03,Cloud_B04,Cloud_B05,Cloud_B06,Cloud_B07,Cloud_B08,Cloud_B08A,...,Cloud_B12,Sat_Zenith_Angle,Sun_Zenith_Angle,Azimuth_Diff_Angle,COT,Cloud_Type,Profile_ID,GOT,Water_Vapor,Surface_Desc
33041,33041,0.42695,0.39256,0.35196,0.33563,0.3683,0.62516,0.67945,0.57703,0.70829,...,0.19983,12.15,74.17,83.98,9.906,5,5534,0.129,4.52,vegetation-shrub-adenostoma
146492,146492,0.60893,0.59968,0.57348,0.57995,0.6225,0.8565,0.91485,0.80163,0.9515,...,0.09166,7.32,56.26,145.71,11.221,6,6796,0.128,4.04,vegetation-tree-hetrosideros
150424,150424,0.42143,0.38972,0.35145,0.34019,0.41114,0.75842,0.7623,0.74442,0.76608,...,0.19581,10.48,71.29,49.11,5.593,8,9712,0.123,0.21,vegetation-tree-eucalyptus
4073,4073,0.5573,0.54976,0.58008,0.52779,0.71681,0.85288,0.87048,0.82273,0.88328,...,0.4589,7.66,61.54,139.49,14.352,3,375,0.128,0.77,vegetation-shrub-umbellularia
199487,199487,0.93375,0.90982,0.83276,0.88521,0.88713,0.90275,0.91153,0.87603,0.91553,...,0.1101,13.32,63.32,23.86,27.489,7,3967,0.125,0.46,rock-igneous-felsic


In [7]:
#Choose path
main_path = 'cot_train/data/synthetic-cot-data'
#Create the output folder if it is missing: neither DataFrame.to_csv nor
#np.save creates parent directories, so a fresh checkout would fail here
os.makedirs(main_path,exist_ok=True)
#Save dataframes (so we can find indices again)
train_df.to_csv(main_path+'/train_df.csv')
val_df.to_csv(main_path+'/val_df.csv')
test_df.to_csv(main_path+'/test_df.csv')
#Turn to numpy
train_np = train_df.to_numpy()
val_np = val_df.to_numpy()
test_np = test_df.to_numpy()
#Save as np (np.save appends the .npy extension)
#NOTE(review): the frames mix numeric and text columns (e.g. Surface_Desc),
#so the arrays are presumably object dtype and stored via pickle; loading
#them back would then need np.load(..., allow_pickle=True) - confirm
np.save(main_path+'/trainset_smhi',train_np,allow_pickle=True)
np.save(main_path+'/valset_smhi',val_np,allow_pickle=True)
np.save(main_path+'/testset_smhi',test_np,allow_pickle=True)

In [8]:
#10% of data
#Keep the leading rows of each split, dropping int(n * 0.9) rows from the end.
#The end position is written out explicitly instead of using a negative
#slice: `[: -k]` turns into `[:0]` (an empty frame) when k is 0, which
#happens for splits too small to drop a whole row.
train_df_01 = train_df.iloc[: train_df.shape[0] - int(train_df.shape[0] * 0.9), :]
val_df_01 = val_df.iloc[: val_df.shape[0] - int(val_df.shape[0] * 0.9), :]
test_df_01 = test_df.iloc[: test_df.shape[0] - int(test_df.shape[0] * 0.9), :]

#Choose path
main_path = 'cot_train/data/synthetic-cot-data_01'
os.makedirs(main_path,exist_ok=True)
#Turn to numpy
train_np = train_df_01.to_numpy()
val_np = val_df_01.to_numpy()
test_np = test_df_01.to_numpy()
#Save as np (np.save appends the .npy extension)
np.save(main_path+'/trainset_smhi',train_np,allow_pickle=True)
np.save(main_path+'/valset_smhi',val_np,allow_pickle=True)
np.save(main_path+'/testset_smhi',test_np,allow_pickle=True)

In [9]:
#20% of data
#Keep the leading rows of each split, dropping int(n * 0.8) rows from the end.
#The end position is written out explicitly instead of using a negative
#slice: `[: -k]` turns into `[:0]` (an empty frame) when k is 0, which
#happens for splits too small to drop a whole row.
train_df_02 = train_df.iloc[: train_df.shape[0] - int(train_df.shape[0] * 0.8), :]
val_df_02 = val_df.iloc[: val_df.shape[0] - int(val_df.shape[0] * 0.8), :]
test_df_02 = test_df.iloc[: test_df.shape[0] - int(test_df.shape[0] * 0.8), :]

#Choose path
main_path = 'cot_train/data/synthetic-cot-data_02'
os.makedirs(main_path,exist_ok=True)
#Turn to numpy
train_np = train_df_02.to_numpy()
val_np = val_df_02.to_numpy()
test_np = test_df_02.to_numpy()
#Save as np (np.save appends the .npy extension)
np.save(main_path+'/trainset_smhi',train_np,allow_pickle=True)
np.save(main_path+'/valset_smhi',val_np,allow_pickle=True)
np.save(main_path+'/testset_smhi',test_np,allow_pickle=True)

In [10]:
#30% of data
#Keep the leading rows of each split, dropping int(n * 0.7) rows from the end.
#The end position is written out explicitly instead of using a negative
#slice: `[: -k]` turns into `[:0]` (an empty frame) when k is 0, which
#happens for splits too small to drop a whole row.
train_df_03 = train_df.iloc[: train_df.shape[0] - int(train_df.shape[0] * 0.7), :]
val_df_03 = val_df.iloc[: val_df.shape[0] - int(val_df.shape[0] * 0.7), :]
test_df_03 = test_df.iloc[: test_df.shape[0] - int(test_df.shape[0] * 0.7), :]

#Choose path
main_path = 'cot_train/data/synthetic-cot-data_03'
os.makedirs(main_path,exist_ok=True)
#Turn to numpy
train_np = train_df_03.to_numpy()
val_np = val_df_03.to_numpy()
test_np = test_df_03.to_numpy()
#Save as np (np.save appends the .npy extension)
np.save(main_path+'/trainset_smhi',train_np,allow_pickle=True)
np.save(main_path+'/valset_smhi',val_np,allow_pickle=True)
np.save(main_path+'/testset_smhi',test_np,allow_pickle=True)

In [11]:
#40% of data
#Keep the leading rows of each split, dropping int(n * 0.6) rows from the end.
#The end position is written out explicitly instead of using a negative
#slice: `[: -k]` turns into `[:0]` (an empty frame) when k is 0, which
#happens for splits too small to drop a whole row.
train_df_04 = train_df.iloc[: train_df.shape[0] - int(train_df.shape[0] * 0.6), :]
val_df_04 = val_df.iloc[: val_df.shape[0] - int(val_df.shape[0] * 0.6), :]
test_df_04 = test_df.iloc[: test_df.shape[0] - int(test_df.shape[0] * 0.6), :]

#Choose path
main_path = 'cot_train/data/synthetic-cot-data_04'
os.makedirs(main_path,exist_ok=True)
#Turn to numpy
train_np = train_df_04.to_numpy()
val_np = val_df_04.to_numpy()
test_np = test_df_04.to_numpy()
#Save as np (np.save appends the .npy extension)
np.save(main_path+'/trainset_smhi',train_np,allow_pickle=True)
np.save(main_path+'/valset_smhi',val_np,allow_pickle=True)
np.save(main_path+'/testset_smhi',test_np,allow_pickle=True)

In [12]:
#50% of data
#Keep the leading rows of each split, dropping int(n * 0.5) rows from the end.
#The end position is written out explicitly instead of using a negative
#slice: `[: -k]` turns into `[:0]` (an empty frame) when k is 0, which
#happens for splits too small to drop a whole row (e.g. a single-row split).
train_df_05 = train_df.iloc[: train_df.shape[0] - int(train_df.shape[0] * 0.5), :]
val_df_05 = val_df.iloc[: val_df.shape[0] - int(val_df.shape[0] * 0.5), :]
test_df_05 = test_df.iloc[: test_df.shape[0] - int(test_df.shape[0] * 0.5), :]

#Choose path
main_path = 'cot_train/data/synthetic-cot-data_05'
os.makedirs(main_path,exist_ok=True)
#Turn to numpy
train_np = train_df_05.to_numpy()
val_np = val_df_05.to_numpy()
test_np = test_df_05.to_numpy()
#Save as np (np.save appends the .npy extension)
np.save(main_path+'/trainset_smhi',train_np,allow_pickle=True)
np.save(main_path+'/valset_smhi',val_np,allow_pickle=True)
np.save(main_path+'/testset_smhi',test_np,allow_pickle=True)