This notebook is used to:
- save the training and testing data for new scenarios

**How to launch this Jupyter notebook**:   
```bash
execcasper -A your_project -l gpu_type=v100 -l walltime=06:00:00 -l select=1:ncpus=18:mpiprocs=36:ngpus=1:mem=300GB
bash aws_urban_env.sh
```

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import xarray as xr
import gc
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import joblib
import statsmodels.api as sm

# First and last year, kept as strings because they are concatenated into
# the input file names.
start_year, end_year = "2061", "2070"

# Directory the per-member NetCDF files are read from.
urban_LE_nc_path = "/glade/scratch/zhonghua/urban_params/urban_LE/"
# Directory the exported parquet files are written to.
parquet_save_path = "/glade/scratch/zhonghua/urban_params/urban_LE_scenarios/"

In [2]:
# Variable names: the label variable and the list of variables stored under "CAM".
# NOTE(review): presumably "CAM" holds the predictor fields -- confirm with downstream use.
fd = dict(
    label="TREFMXAV_U",
    CAM=["FLNS", "FSNS", "PRECT", "PRSN", "QBOT", "TREFHT", "UBOT", "VBOT"],
)


## known gridcell (lat, lon) <-> test gridcell (lat, lon)
## The same key ("1", "2", "3") pairs a known gridcell with its test gridcell.
known_gridcell = {
    "1": dict(lat=32.51309, lon=253.75),
    "2": dict(lat=40.994766, lon=277.5),
    "3": dict(lat=40.994766, lon=247.5),
}

pred_gridcell = {
    "1": dict(lat=31.57068, lon=253.75),
    "2": dict(lat=41.937172, lon=277.5),
    "3": dict(lat=42.87958, lon=247.5),
}

In [3]:
# load all data across ten years for the same location
def get_data(gridcell, p, fn, urban_LE_nc_path, start_year, end_year, parquet_save_path,
             member_ids=None):
    """Export one gridcell's data for every member to a single parquet file.

    For each member id, the file
    ``<urban_LE_nc_path><member>_<start_year>_<end_year>.nc`` is opened, the
    gridcell ``gridcell[p]`` is selected by exact ``lat``/``lon`` match, and the
    result is converted to a DataFrame tagged with a ``member`` column. All
    member frames are concatenated and written to
    ``<parquet_save_path><fn><p>.parquet.gzip``.

    Parameters
    ----------
    gridcell : dict
        Mapping of location key -> ``{"lat": float, "lon": float}``.
    p : str
        Location key to extract; also used in the output file name.
    fn : str
        Output file-name prefix (e.g. ``"train_"`` or ``"test_"``).
    urban_LE_nc_path : str
        Directory prefix of the input NetCDF files (concatenated as-is, so it
        must end with a path separator).
    start_year, end_year : str
        Year strings that appear in the input file names.
    parquet_save_path : str
        Directory prefix of the output file (concatenated as-is).
    member_ids : iterable of int, optional
        Member numbers to load; each is zero-padded to three digits.
        Defaults to ``range(3, 34)`` (members 003..033).

    Returns
    -------
    None
        The result is written to disk only.
    """
    if member_ids is None:
        member_ids = range(3, 34)

    # The target coordinates do not change between members; look them up once.
    lat = gridcell[p]["lat"]
    lon = gridcell[p]["lon"]

    member_frames = []
    for member_id in tqdm(member_ids):
        member = str(member_id).zfill(3)
        nc_file = urban_LE_nc_path + member + "_" + start_year + "_" + end_year + ".nc"
        # The context manager closes the file handle deterministically instead
        # of relying on garbage collection to release it.
        with xr.open_dataset(nc_file) as ds_member:
            ds_cell = ds_member.sel(lat=lat, lon=lon)
            # Replace the time coordinate with the result of to_datetimeindex()
            # (presumably converting a cftime index to a pandas DatetimeIndex -- confirm).
            ds_cell = ds_cell.assign_coords(time=ds_cell.indexes['time'].to_datetimeindex())
            # to_dataframe() must run while the file is still open so the
            # values are read into memory before the handle is closed.
            df_member = ds_cell.to_dataframe()
        df_member["member"] = member
        member_frames.append(df_member)
        del ds_cell, df_member
        gc.collect()

    pd.concat(member_frames).to_parquet(parquet_save_path+fn+p+".parquet.gzip", engine="pyarrow")
    return

# For every location key, write the training file from the known gridcell
# first, then the testing file from the paired prediction gridcell.
for p in ("1", "2", "3"):
    print(p)
    get_data(
        gridcell=known_gridcell,
        p=p,
        fn="train_",
        urban_LE_nc_path=urban_LE_nc_path,
        start_year=start_year,
        end_year=end_year,
        parquet_save_path=parquet_save_path,
    )
    get_data(
        gridcell=pred_gridcell,
        p=p,
        fn="test_",
        urban_LE_nc_path=urban_LE_nc_path,
        start_year=start_year,
        end_year=end_year,
        parquet_save_path=parquet_save_path,
    )

1


100%|██████████████████████████████████████████████████████████████████████| 31/31 [01:15<00:00,  2.43s/it]
100%|██████████████████████████████████████████████████████████████████████| 31/31 [01:20<00:00,  2.61s/it]


2


100%|██████████████████████████████████████████████████████████████████████| 31/31 [01:27<00:00,  2.83s/it]
100%|██████████████████████████████████████████████████████████████████████| 31/31 [01:15<00:00,  2.45s/it]


3


100%|██████████████████████████████████████████████████████████████████████| 31/31 [01:18<00:00,  2.55s/it]
100%|██████████████████████████████████████████████████████████████████████| 31/31 [01:12<00:00,  2.34s/it]


In [4]:
# Read back the training table saved for location "1" and display it.
df = pd.read_parquet(
    f"{parquet_save_path}train_1.parquet.gzip",
    engine="pyarrow",
)
df

Unnamed: 0_level_0,TREFMXAV_U,FLNS,FSNS,PRECT,PRSN,QBOT,TREFHT,UBOT,VBOT,lat,lon,member
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2061-01-01,287.400970,53.877895,120.386955,4.307333e-09,2.842670e-24,0.005592,280.922974,-1.290794,-1.913611,32.513088,253.75,003
2061-01-02,285.612000,25.114357,44.966625,3.100421e-08,5.041787e-15,0.005618,279.850647,-2.800561,1.027094,32.513088,253.75,003
2061-01-03,282.653046,31.471537,76.842049,5.770141e-08,1.129796e-13,0.006334,280.439362,1.879166,-0.150583,32.513088,253.75,003
2061-01-04,284.680389,67.393425,123.532364,7.840738e-10,3.932968e-17,0.005061,278.946350,0.890715,-0.983915,32.513088,253.75,003
2061-01-05,283.559418,84.674614,124.384392,1.164163e-09,3.396835e-15,0.003717,276.666229,-1.714217,-2.017547,32.513088,253.75,003
...,...,...,...,...,...,...,...,...,...,...,...,...
2070-12-27,287.624969,67.000000,76.199219,1.346564e-09,0.000000e+00,0.004242,281.500000,0.578125,-1.343750,32.513088,253.75,033
2070-12-28,289.332153,98.500000,126.078125,2.498476e-16,0.000000e+00,0.003448,280.500000,0.039551,-1.039062,32.513088,253.75,033
2070-12-29,288.519043,77.500000,85.859375,1.050523e-09,0.000000e+00,0.003418,280.375000,-1.609375,1.226562,32.513088,253.75,033
2070-12-30,287.714203,55.250000,64.142578,5.824973e-09,0.000000e+00,0.005310,281.875000,-0.949219,1.414062,32.513088,253.75,033
