This script is used for:
- splitting the Urban-LE data into training and testing data
- saving the Urban-LE training and testing data as Parquet files  

Reference: how to split data into training and testing (https://www.geeksforgeeks.org/divide-a-pandas-dataframe-randomly-in-a-given-ratio/)

How to launch it:   
```bash
execcasper -A your_project -l gpu_type=v100 -l walltime=06:00:00 -l select=1:ncpus=18:mpiprocs=36:ngpus=1:mem=300GB
bash aws_urban_env.sh
```

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import xarray as xr
import gc
from tqdm import tqdm

# Input location: prefix for the per-member Urban-LE NetCDF files, which are
# opened as <urban_LE_nc_path><member>_<start_year>_<end_year>.nc.
# The value is used by plain string concatenation, so keep the trailing slash.
urban_LE_nc_path = "/glade/scratch/zhonghua/urban_params/urban_LE/"
# Output location: the split data are written under the "train/" and "test/"
# subdirectories of this path. Also concatenated as a string (trailing slash).
parquet_save_path = "/glade/scratch/zhonghua/urban_params/urban_LE_random_split/"

In [2]:
def save_train_test(member, start_year, end_year, urban_LE_nc_path, parquet_save_path):
    """Randomly split one member's Urban-LE data into train/test sets and save them.

    Reads ``<urban_LE_nc_path><member>_<start_year>_<end_year>.nc``, flattens it
    to a DataFrame, drops the rows whose ``TREFMXAV_U`` is missing, tags every
    row with the member id, and writes two gzip-compressed parquet files:

    * ``<parquet_save_path>train/<member>_<start_year>_<end_year>.parquet.gzip``
    * ``<parquet_save_path>test/<member>_<start_year>_<end_year>.parquet.gzip``

    Parameters
    ----------
    member : str
        Member id as used in the file name (e.g. ``"003"``). It must be
        convertible with ``int()`` because it also seeds the random split.
    start_year, end_year : str
        First and last year of the period, as they appear in the file name.
    urban_LE_nc_path : str
        Prefix of the NetCDF input files (concatenated as-is, so a directory
        must end with a path separator).
    parquet_save_path : str
        Prefix of the output location (concatenated as-is). The ``train/`` and
        ``test/`` subdirectories are created if they do not exist yet.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``member`` cannot be converted to an integer seed.
    """
    import os  # local import: only needed here to create the output directories

    file_stem = member + "_" + start_year + "_" + end_year

    # Open the NetCDF file in a context manager so the file handle is released
    # deterministically (also when the conversion below raises) instead of
    # depending on garbage collection.
    with xr.open_dataset(urban_LE_nc_path + file_stem + ".nc") as ds_raw:
        # convert the time to datetime format
        ds_urban_LE = ds_raw.assign_coords(time=ds_raw.indexes['time'].to_datetimeindex())
        # to_dataframe() materialises the values, so `df` stays usable after
        # the file is closed.
        df = ds_urban_LE.to_dataframe()

    # Drop the xarray objects early; the DataFrame copies below are large.
    del ds_urban_LE, ds_raw
    gc.collect()

    # remove missing value based on urban temperature
    df_final = df[df["TREFMXAV_U"].notna()].reset_index()
    del df  # the unfiltered frame is not needed any more
    df_final["member"] = member

    # split into training and testing: a fraction of 0.1/3 of the rows is
    # sampled for training (seeded with the member id so the split is
    # reproducible); every remaining row goes to the test set.
    df_train = df_final.sample(frac=0.1/3, random_state=int(member))
    df_test = df_final.drop(df_train.index)

    for subset_dir, frame in (("train/", df_train), ("test/", df_test)):
        out_dir = parquet_save_path + subset_dir
        # Make sure the target directory exists before writing.
        os.makedirs(out_dir, exist_ok=True)
        frame.to_parquet(out_dir + file_stem + ".parquet.gzip",
                         compression="gzip", engine="fastparquet")

In [None]:
# Takes about 2.5 minutes per member.
# NOTE(review): this loop starts at member "002", whereas get_merge_member in
# this notebook reads members "003" to "033" only -- confirm this is intended.

for member_id in tqdm(range(2, 34)):
    print("============")
    # zero-pad the member number to three digits, e.g. 2 -> "002"
    member = "{:03d}".format(member_id)
    # first period
    save_train_test(member=member, start_year="2006", end_year="2015",
                    urban_LE_nc_path=urban_LE_nc_path,
                    parquet_save_path=parquet_save_path)
    # second period
    save_train_test(member=member, start_year="2061", end_year="2070",
                    urban_LE_nc_path=urban_LE_nc_path,
                    parquet_save_path=parquet_save_path)
    print("finish" + member)

**Merge the training data** (members "003" to "033")

In [2]:
def get_merge_member(start_year, end_year, parquet_save_path, member_ids=None, subset="train"):
    """Read the per-member parquet files of one period and concatenate them.

    For every member id the file
    ``<parquet_save_path><subset>/<member>_<start_year>_<end_year>.parquet.gzip``
    is read, where ``<member>`` is the id zero-padded to three digits.

    Parameters
    ----------
    start_year, end_year : str
        First and last year of the period, as they appear in the file name.
    parquet_save_path : str
        Prefix of the parquet location (concatenated as-is, so a directory
        must end with a path separator).
    member_ids : iterable of int, optional
        Member numbers to merge. Defaults to ``range(3, 34)``, i.e. members
        "003" to "033".
    subset : str, optional
        Name of the subdirectory to read from. Defaults to ``"train"``.

    Returns
    -------
    pandas.DataFrame
        The frames stacked in member order. The index of each file is kept
        as read (the concatenation does not renumber it).

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``member_ids`` is empty.
    """
    if member_ids is None:
        member_ids = range(3, 34)

    suffix = "_" + start_year + "_" + end_year + ".parquet.gzip"
    subset_dir = parquet_save_path + subset + "/"

    frames = []
    for member_id in tqdm(member_ids):
        member = str(member_id).zfill(3)
        frames.append(pd.read_parquet(subset_dir + member + suffix, engine="fastparquet"))
    return pd.concat(frames)
#     df.to_parquet(parquet_save_path + "train_urban_LE_"
#                   + start_year + "_" + end_year + ".parquet.gzip",
#                   compression="gzip", engine="fastparquet")

In [3]:
# Merge the per-member training files of the 2006-2015 period and show the
# (rows, columns) of the merged frame.
df = get_merge_member(start_year="2006", end_year="2015",
                      parquet_save_path=parquet_save_path)
print(df.shape)

100%|███████████████████████████████████████████| 31/31 [00:08<00:00,  3.57it/s]


(16737830, 13)
