In [1]:
import geopandas as gpd
import os
import xarray as xr
import numpy as np
import pandas as pd
import copy
import pickle
import gc
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
#import leafmap
import leafmap.foliumap as leafmap
from shapely.geometry import mapping
import pyproj
import folium
from glob import glob
# sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Root directory of the project data; the shapefile and NetCDF paths used in
# this notebook are built from it.
data_dir = "/mnt/CEPH_PROJECTS/sao/openEO_Platform"

In [3]:
# Photovoltaic geometries read from the project shapefile; rows are later
# looked up by farm id (index label) when the rasters are clipped.
pv_shapefile = f"{data_dir}/old_data/data/shapefiles/photovoltaic.shp"
pv_geoms = gpd.read_file(pv_shapefile)

In [4]:
threshold_area = 5000  # m^2, roughly a 50 m x 100 m footprint

# Areas are measured after reprojecting to EPSG:32632 so that `.area` is
# expressed in the unit the threshold uses rather than in degrees.
pv_geoms_32632 = pv_geoms.to_crs(32632)

# Plain boolean indexing keeps the selected rows untouched. The previous
# `.where(mask).dropna()` form also discarded any large farm that merely had a
# missing value in some attribute column, and upcast the remaining dtypes.
is_big = pv_geoms_32632["geometry"].area > threshold_area
big_pv_geoms_32632 = pv_geoms_32632[is_big]

# Back to geographic coordinates for the rest of the notebook.
big_pv_geoms = big_pv_geoms_32632.to_crs(4326)
print(f"Number of selected PV Farms: {len(big_pv_geoms)}")

Number of selected PV Farms: 43


In [5]:
# Index labels of the farms that passed the area filter.
# NOTE(review): not referenced by any cell visible in this section.
pv_index = big_pv_geoms.index

In [6]:
# Band identifiers of interest (B01 is intentionally absent).
# NOTE(review): this list is not referenced by the cells visible in this
# section; the feature columns built below follow the NetCDF variable order,
# so confirm that order matches this list.
bands = [
    'B02', 'B03', 'B04',
    'B05', 'B06', 'B07',
    'B08', 'B8A',
    'B11', 'B12',
]

### Yearly median

In [8]:
from tqdm import tqdm
def temp_s2_data(pv_farms, invert=False, drop_masked=False):
    """Build a per-pixel feature table from the yearly median of each farm's cube.

    For every path in ``pv_farms`` the farm id is parsed from the file name
    (the integer after the last underscore), the cube
    ``{data_dir}/old_data/data/netcdfs/S2_2022_<farm_id>.nc`` is opened,
    cloud-flagged observations are masked out, the median over the ``t``
    dimension is taken and the result is clipped with the farm polygon stored
    in the module-level ``pv_geoms`` frame.

    Parameters
    ----------
    pv_farms : iterable of str
        File paths whose name ends in ``_<farm_id>.nc``. Only the id is taken
        from the path; the file that is opened is always rebuilt from
        ``data_dir``.
    invert : bool, default False
        ``False`` keeps the pixels inside the farm polygon, ``True`` keeps the
        pixels outside of it.
    drop_masked : bool, default False
        The clip is done with ``drop=False``, so the raster keeps its full
        extent and every pixel on the excluded side of the polygon is returned
        as a row of NaN. Set this to ``True`` to remove rows in which every
        column is NaN. The default keeps the historical output unchanged.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per pixel (all farms stacked in input order)
        and one column per data variable left after dropping ``SCL`` and
        ``B01``.

    Raises
    ------
    ValueError
        If ``pv_farms`` yields no paths.
    """
    # Scene-classification (SCL) codes treated as unusable observations:
    # 3 = cloud shadow, 8 = cloud medium probability, 9 = cloud high probability.
    cloud_classes = [3, 8, 9]

    # Use the CRS the polygons were actually read with. The previously
    # hard-coded EPSG:4326 is kept only as a fallback for a frame without CRS.
    geom_crs = pv_geoms.crs if pv_geoms.crs is not None else 4326

    all_pixels_ts = []
    for pv_farm in tqdm(pv_farms):
        farm_id = int(os.path.basename(pv_farm).rsplit('_', 1)[-1].split('.')[0])
        nc_path = f"{data_dir}/old_data/data/netcdfs/S2_2022_{farm_id}.nc"

        # The context manager releases the file handle after each farm instead
        # of leaving one open dataset per iteration behind.
        # NOTE(review): `.rio` is the rioxarray accessor and no visible cell
        # imports rioxarray explicitly; presumably another import registers it.
        # Verify on a fresh kernel.
        with xr.open_dataset(nc_path, decode_coords="all") as data:
            crs = data.rio.crs
            geom = pv_geoms.loc[farm_id]

            # Mask cloudy observations, then keep only the spectral variables.
            cloud_mask = data.SCL.isin(cloud_classes)
            spectral = data.where(~cloud_mask).drop_vars(["SCL", "B01"])

            yearly_median = spectral.median(dim="t").to_dataarray()
            yearly_median = yearly_median.rio.write_crs(crs)

            # Crop the farm (or everything but the farm when invert=True).
            geodf = gpd.GeoDataFrame(geometry=[geom["geometry"]], crs=geom_crs).to_crs(crs)
            clipped = yearly_median.rio.clip(
                geodf.geometry.values, geodf.crs, drop=False, invert=invert
            )

            # Flatten (y, x, variable) into one row per pixel.
            ds_np = clipped.transpose("y", "x", "variable").values.reshape(
                -1, clipped.sizes["variable"]
            )

        all_pixels_ts.append(ds_np)

    if not all_pixels_ts:
        raise ValueError("temp_s2_data: pv_farms is empty, there is nothing to stack")

    samples = np.vstack(all_pixels_ts)

    if drop_masked:
        # Rows that are NaN in every column carry no spectral information.
        samples = samples[~np.isnan(samples).all(axis=1)]

    return samples

In [9]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the row order of the sample arrays reproducible between runs and machines.
pv_farms = sorted(glob(f"{data_dir}/old_data/data/netcdfs/*.nc"))

# Two passes over the same files: pixels inside each farm polygon first,
# then the pixels outside of it.
pv_samples = temp_s2_data(pv_farms)
non_pv_samples = temp_s2_data(pv_farms, invert=True)

100%|███████████████████████████████████████████| 42/42 [00:03<00:00, 11.77it/s]
100%|███████████████████████████████████████████| 42/42 [00:02<00:00, 18.23it/s]


In [10]:
# Ground-truth labels: 1 for rows sampled inside a farm polygon, 0 otherwise.
y1 = np.ones(len(pv_samples))
y0 = np.zeros(len(non_pv_samples))

# Features are stacked PV-first so they line up with the labels below. NaN is
# replaced with the -999999 sentinel.
# NOTE(review): presumably the fill value the classifier was trained with;
# confirm.
te_samples = np.nan_to_num(
    np.concatenate((pv_samples, non_pv_samples), axis=0),
    nan=-999999,
)

y_test = np.concatenate((y1, y0))

In [12]:
import pickle
import urllib.request

# Pre-trained classifier stored as a pickle.
# WARNING: unpickling executes whatever code the file contains, so only load
# models that come from a trusted source.
# NOTE(review): absolute path into a user home directory; consider making it
# configurable. `urllib.request` is not used by any cell visible here.
model_path = "/home/mclaus@eurac.edu/openEO_photovoltaic/models/temporal_models/rf_1_median_depth_15.sav"
with open(model_path, "rb") as model_file:
    clf = pickle.load(model_file)

In [13]:
# Predict on the combined PV / non-PV table and score against the labels.
te_pred = clf.predict(te_samples)

# accuracy_score is documented as (y_true, y_pred). The arguments were swapped
# before; accuracy is symmetric so the number is unchanged, but the correct
# order keeps this call safe to copy for asymmetric metrics.
test_score = accuracy_score(y_test, te_pred)
print(f'Test Accuracy Score: {test_score}')

Test Accuracy Score: 0.7421523992092494


In [14]:
# Score the PV rows on their own. Every label in y1 is 1, so this accuracy is
# the share of PV rows predicted as PV, i.e. the recall of the PV class.
te_pv_samples = np.nan_to_num(pv_samples, nan=-999999)

# Note: this overwrites the full-set predictions held in te_pred.
te_pred = clf.predict(te_pv_samples)

# Arguments follow the documented (y_true, y_pred) order.
test_score = accuracy_score(y1, te_pred)
print(f'Test Accuracy Score for PV pixels: {test_score}')

Test Accuracy Score for PV pixels: 0.8027616366141497
