This notebook:
- removes the "pred points" (the test gridcells listed in `pred_gridcell`) from the training data

**How to launch this Jupyter notebook**:   
```bash
execcasper -A your_project -l gpu_type=v100 -l walltime=06:00:00 -l select=1:ncpus=18:mpiprocs=36:ngpus=1:mem=300GB
bash aws_urban_env.sh
```

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import xarray as xr
import gc
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import joblib
import statsmodels.api as sm

# Year range, kept as strings because both values are concatenated into the
# per-member parquet file names by get_merge_member.
start_year = "2006"
end_year = "2015"
# NOTE(review): not referenced in the cells visible here -- presumably the
# directory of the source netCDF files; confirm before relying on it.
urban_LE_nc_path = "/glade/scratch/zhonghua/urban_params/urban_LE/"
# Root directory of the per-member parquet files; get_merge_member reads from
# its "train/" subfolder.
parquet_save_path = "/glade/scratch/zhonghua/urban_params/urban_LE_random_split/"
# Parquet file holding the urban surface table that is merged onto the
# training rows by (lat, lon).
urban_surf_path = "/glade/scratch/zhonghua/urban_params/urban_surface.parquet.gzip"

In [2]:
# Column names grouped under four keys: "label", "CAM", "surf" and "loc".
# - "label" and "CAM" name columns that appear in the per-member training
#   frames (see the column list printed by this cell).
# - "surf" presumably lists columns of the urban surface table -- TODO confirm.
# - "loc" holds the (lat, lon) keys that the merge below joins on.
# NOTE(review): `fd` is not used in the cells visible here; presumably it is
# consumed by later cells -- confirm.
fd = {
    "label":"TREFMXAV_U",
    "CAM": ['FLNS','FSNS','PRECT','PRSN','QBOT','TREFHT','UBOT','VBOT'],
    "surf":['CANYON_HWR','EM_IMPROAD','EM_PERROAD','EM_ROOF','EM_WALL', 
            'HT_ROOF','THICK_ROOF','THICK_WALL','T_BUILDING_MAX','T_BUILDING_MIN',
            'WTLUNIT_ROOF','WTROAD_PERV','NLEV_IMPROAD','PCT_URBAN',
            'ALB_IMPROAD','ALB_PERROAD','ALB_ROOF','ALB_WALL',
            'TK_ROOF','TK_WALL','CV_ROOF','CV_WALL',
            'TK_IMPROAD_0','CV_IMPROAD_0','TK_IMPROAD_1','CV_IMPROAD_1'],
    "loc":["lat","lon"]
}

def get_merge_member(start_year, end_year, parquet_save_path):
    """Read the training parquet file of every member and stack them.

    Members 003 through 033 are read from
    ``<parquet_save_path>train/<member>_<start_year>_<end_year>.parquet.gzip``
    with the fastparquet engine, showing a tqdm progress bar.

    Parameters
    ----------
    start_year, end_year : str
        Year strings embedded in each file name.
    parquet_save_path : str
        Directory prefix (including trailing slash) that contains ``train/``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all member frames. Each frame keeps its own
        index, so index labels may repeat in the result.
    """
    train_dir = parquet_save_path + "train/"
    # Same for every member: "_<start>_<end>.parquet.gzip".
    name_tail = "_" + start_year + "_" + end_year + ".parquet.gzip"
    member_frames = [
        pd.read_parquet(train_dir + str(member_no).zfill(3) + name_tail,
                        engine="fastparquet")
        for member_no in tqdm(range(3, 34))
    ]
    return pd.concat(member_frames)

# load data
# urban_LE stacks the training rows of all members; urban_surf is the surface
# table with its index turned back into regular columns.
urban_LE = get_merge_member(start_year, end_year, parquet_save_path)
urban_surf = pd.read_parquet(urban_surf_path, engine="fastparquet").reset_index()


# ========= remove points from training data =========

## known gridcell (lat, lon) <-> test gridcell (lat, lon)
known_gridcell = {
    "1": {"lat":32.51309, "lon":253.75},
    "2": {"lat":40.994766, "lon":277.5},
    "3": {"lat":40.994766, "lon":247.5}
}

pred_gridcell = {
    "1": {"lat":31.57068, "lon":253.75},
    "2": {"lat":41.937172, "lon":277.5},
    "3": {"lat":42.87958, "lon":247.5}
}


def _gridcell_mask(frame, gridcell, tol=0.0001):
    """Return a positional boolean array flagging rows of `frame` on `gridcell`.

    A row matches when both its "lat" and "lon" differ from the gridcell's
    values by less than `tol` (coordinates are floats, so exact equality is
    not used). The result is a plain numpy array, so it is independent of the
    frame's index labels.
    """
    lat_hit = (np.abs(frame["lat"] - gridcell["lat"]) < tol).to_numpy()
    lon_hit = (np.abs(frame["lon"] - gridcell["lon"]) < tol).to_numpy()
    return lat_hit & lon_hit


# Build one mask covering every pred gridcell and filter by position.
# Do NOT use urban_LE.drop(<index labels>): urban_LE is a concatenation of the
# per-member frames, so index labels can repeat across members and a
# label-based drop would also delete unrelated rows that share a label with a
# pred-point row.
pred_mask = np.zeros(len(urban_LE), dtype=bool)
for gridcell in pred_gridcell.values():
    pred_mask |= _gridcell_mask(urban_LE, gridcell)

urban_LE_new = urban_LE[~pred_mask].copy()

# check if we removed points successfully
dd = [urban_LE_new[_gridcell_mask(urban_LE_new, gridcell)]
      for gridcell in pred_gridcell.values()]

print("check if three points are still in the dataframe")
print(dd)

print("number of removed samples:", urban_LE.shape[0] - urban_LE_new.shape[0])

# Attach the surface table to the filtered training rows, joining on the
# gridcell coordinates.
df = urban_LE_new.merge(urban_surf, how="inner", on=["lat", "lon"])
# The join must leave the number of training rows unchanged.
assert len(urban_LE_new) == len(df)

# Free the memory held by the frames that were combined into `df`.
del urban_LE, urban_LE_new, urban_surf
gc.collect()

100%|███████████████████████████████████████████| 31/31 [00:07<00:00,  4.32it/s]


check if three points are still in the dataframe
[Empty DataFrame
Columns: [time, lat, lon, TREFMXAV_U, FLNS, FSNS, PRECT, PRSN, QBOT, TREFHT, UBOT, VBOT, member]
Index: [], Empty DataFrame
Columns: [time, lat, lon, TREFMXAV_U, FLNS, FSNS, PRECT, PRSN, QBOT, TREFHT, UBOT, VBOT, member]
Index: [], Empty DataFrame
Columns: [time, lat, lon, TREFMXAV_U, FLNS, FSNS, PRECT, PRSN, QBOT, TREFHT, UBOT, VBOT, member]
Index: []]
number of removed samples: 11206


0