In [21]:
import geopandas as gpd
import os
import xarray as xr
import numpy as np
import pandas as pd
import copy
import pickle
import gc
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
#import leafmap
import leafmap.foliumap as leafmap
from shapely.geometry import mapping
import pyproj
import folium
from glob import glob
# sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [3]:
from tsai.all import *

In [4]:
# Root of the project data tree. The shared mount stays the default; set the
# OPENEO_PLATFORM_DATA_DIR environment variable to run the notebook against a
# copy of the data stored elsewhere (an empty value falls back to the default).
data_dir = os.environ.get("OPENEO_PLATFORM_DATA_DIR") or '/mnt/CEPH_PROJECTS/sao/openEO_Platform'

In [5]:
# Geometries of the photovoltaic installations, read from the project shapefile.
pv_shapefile = f"{data_dir}/old_data/data/shapefiles/photovoltaic.shp"
pv_geoms = gpd.read_file(pv_shapefile)

In [6]:
# Keep only the installations whose footprint exceeds the area threshold.
threshold_area = 5000 #m^2 ~50mx100m

# Areas are measured after reprojecting to EPSG:32632 so that `.area` is
# expressed in the CRS units rather than in degrees.
pv_geoms_32632 = pv_geoms.to_crs(32632)

# Boolean indexing keeps the matching rows exactly as they are. The previous
# `.where(cond).dropna()` blanked the non-matching rows and then dropped every
# row containing *any* NaN, so a large farm with a single missing attribute
# value was silently discarded too (and column dtypes were upcast on the way).
is_big_farm = pv_geoms_32632["geometry"].area > threshold_area
big_pv_geoms_32632 = pv_geoms_32632[is_big_farm]

# Same selection in geographic coordinates (EPSG:4326).
big_pv_geoms = big_pv_geoms_32632.to_crs(4326)
print(f"Number of selected PV Farms: {len(big_pv_geoms)}")

Number of selected PV Farms: 43


In [12]:
# Index labels of the area-filtered farms.
# NOTE(review): `pv_index` is not referenced again in the visible cells — confirm it is still needed.
pv_index = big_pv_geoms.index

In [13]:
# Names of the band variables pulled out of every farm dataset:
# B01..B08 followed by B8A, B11 and B12 (this order fixes the feature layout).
bands = [f'B{number:02d}' for number in range(1, 9)] + ['B8A', 'B11', 'B12']

In [22]:
# One NetCDF per farm; the extraction functions expect names of the form
# S2_2022_<farm_id>.nc and read the farm id from the end of the file name.
# glob() returns matches in file-system order, so the list is sorted to give
# the saved sample arrays the same row order on every run and machine.
pv_farms = sorted(glob(f"{data_dir}/old_data/data/netcdfs/*.nc"))

### Monthly aggregation

In [66]:
def temp_s2_data(stat_name, pv_farms, start_date='2022-01-01', end_date='2022-12-31', train=True, invert=False):
    """Extract per-pixel monthly time series of every band for a list of farms.

    For each farm the dataset is cloud-masked, gap-filled with the per-timestep
    spatial mean, restricted to ``[start_date, end_date]``, aggregated per month
    with ``statistics_functions[stat_name]``, linearly interpolated along ``t``
    and finally flattened to one row per pixel.

    Relies on the notebook-level names ``data_dir``, ``bands``,
    ``big_pv_geoms_32632``, ``statistics_functions`` and ``tqdm``.

    Parameters
    ----------
    stat_name : str
        Key into ``statistics_functions`` selecting the monthly aggregation.
    pv_farms : iterable of str
        Paths whose file name ends in ``_<farm_id>.nc``. Only the farm id is
        taken from each path; the dataset itself is always read from
        ``{data_dir}/old_data/data/netcdfs/S2_2022_<farm_id>.nc``.
    start_date, end_date : str
        Bounds of the time slice applied before the monthly resampling.
    train : bool
        If True, pixels come from the dataset clipped with the farm polygon and
        rows that are NaN everywhere are dropped. If False, every pixel of the
        unclipped dataset is kept.
    invert : bool
        Passed to the clip (True keeps what lies outside the polygon). When
        True, a fixed-seed random 25 % of the pixel rows is returned.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_pixels, len(bands), 12)``. Farms whose table does
        not have exactly 12 monthly steps, or has no pixel rows, are skipped.

    Raises
    ------
    ValueError
        If no farm contributes any sample.
    """
    all_pixels_ts = []
    for pv_farm in tqdm(pv_farms):
        farm_id = int(pv_farm.split('/')[-1].split('_')[-1].split('.')[0])
        geom = big_pv_geoms_32632.loc[farm_id]
        geodf = gpd.GeoDataFrame(geometry=[geom["geometry"]], crs="EPSG:32632")

        # The context manager closes the NetCDF handle once the farm has been
        # tabulated; everything derived from it is in memory by then.
        with xr.open_dataset(f"{data_dir}/old_data/data/netcdfs/S2_2022_{farm_id}.nc", decode_coords="all") as raw:
            # Mask the pixels flagged as cloud (SCL values 8 and 9)
            cloud_mask = np.bitwise_or((raw.SCL == 8), (raw.SCL == 9))
            data = raw.where(~cloud_mask)

            # Fill the gaps (NaN values) spatially with the mean of each time step
            data = data.groupby('t').apply(lambda x: x.fillna(x.mean(dim=('x', 'y'))))
            # NOTE(review): the dataset is tagged EPSG:32633 while the polygon is
            # handed to the clip as EPSG:32632 — confirm 32633 is the files' real CRS.
            data.rio.write_crs("epsg:32633", inplace=True)

            # Restrict the series to [start_date, end_date]
            filtered_ds = data.sel(t=slice(start_date, end_date))

            # Monthly stats
            ds_monthly = filtered_ds.resample({"t": "M"}).apply(statistics_functions[stat_name])

            # Linear interpolation along time for the months that are still missing
            ds_monthly_interp = ds_monthly.interpolate_na(dim='t', method='linear')

            if train:
                # Crop the farm (or its surroundings when invert=True); the clip
                # is only needed on this branch, so it is not computed otherwise.
                clipped = ds_monthly_interp.rio.clip(geodf.geometry.values, geodf.crs, drop=False, invert=invert)
                df = clipped.to_dataframe().unstack(level='t')
            else:
                df = ds_monthly_interp.to_dataframe().unstack(level='t')

        # Choose the pixel rows ONCE for all bands, so that every band is
        # guaranteed to hold the same pixels in the same order before stacking.
        band_table = df[bands]
        if train:
            band_table = band_table.dropna(how='all')
        if invert:
            # Randomly select 25% of the pixels
            rows_to_filter = int(0.25 * len(band_table))
            band_table = band_table.sample(n=rows_to_filter, random_state=42)

        # Only farms with a full year (12 monthly steps) and at least one pixel are kept
        n_steps = band_table[bands[0]].shape[1]
        if band_table.shape[0] > 0 and n_steps == 12:
            all_pixels_ts.append(np.stack([np.array(band_table[b]) for b in bands], axis=1))

    if not all_pixels_ts:
        raise ValueError("No farm yielded pixels with a complete 12-step monthly series.")

    samples = np.vstack(all_pixels_ts)

    return samples

In [67]:
# Define a list of statistics functions
statistics_functions = {
    "mean": lambda x: x.mean(dim="t",skipna=True),
    "median": lambda x: x.median(dim="t",skipna=True),
    "std": lambda x: x.std(dim="t",skipna=True),
}


# Define the list of statistics names
statistics_names = list(statistics_functions.keys())

In [68]:
# Initialize a dictionary to store the results
results = {}

# Loop through each statistic
for stat_name in statistics_names:
    ds_monthly = temp_s2_data(stat_name, pv_farms) 
    # Store the result in the dictionary
    results[stat_name] = ds_monthly 
    
pv_samples= np.concatenate(list(results.values()), axis=-1)
np.save(f'{data_dir}/s2/germany/extracted_temporal_data/Austria_pv_samples_monthly.npy', pv_samples)

100%|███████████████████████████████████████████| 42/42 [00:30<00:00,  1.38it/s]
100%|███████████████████████████████████████████| 42/42 [00:32<00:00,  1.28it/s]
100%|███████████████████████████████████████████| 42/42 [00:30<00:00,  1.36it/s]


In [69]:
# Initialize a dictionary to store the results
results = {}

# Loop through each statistic
for stat_name in statistics_names:
    # Resample the dataset to monthly means using the current statistic function
    ds_monthly = temp_s2_data(stat_name, pv_farms, invert=True) 
    # Store the result in the dictionary
    results[stat_name] = ds_monthly 
    
te_samples= np.concatenate(list(results.values()), axis=-1)
np.save(f'{data_dir}/s2/germany/extracted_temporal_data/Austria_non_pv_samples_monthly.npy', te_samples)

100%|███████████████████████████████████████████| 42/42 [00:30<00:00,  1.36it/s]
100%|███████████████████████████████████████████| 42/42 [00:32<00:00,  1.28it/s]
100%|███████████████████████████████████████████| 42/42 [00:31<00:00,  1.34it/s]


In [70]:
# load the model
# Load the monthly model. The context manager closes the file handle that the
# bare `open()` call used to leak.
# NOTE: unpickling runs code stored in the file — only load model files from a trusted source.
with open('../models/temporal_models/rf_monthly_temporal_1.sav', 'rb') as model_file:
    clf = pickle.load(model_file)

In [71]:
# Monthly sample arrays written by the extraction cells
te_pv_samples = np.load(f'{data_dir}/s2/germany/extracted_temporal_data/Austria_pv_samples_monthly.npy')
te_non_pv_samples = np.load(f'{data_dir}/s2/germany/extracted_temporal_data/Austria_non_pv_samples_monthly.npy')

# Targets: 1 for every PV pixel, 0 for every non-PV pixel, in stacking order
y1 = np.ones(len(te_pv_samples))
y0 = np.zeros(len(te_non_pv_samples))
y_test = np.concatenate([y1, y0])

# Features: PV rows first, then non-PV rows; the two trailing axes of each
# pixel are flattened into one vector and NaNs are replaced by -999999
stacked_samples = np.concatenate([te_pv_samples, te_non_pv_samples], axis=0)
flat_shape = (stacked_samples.shape[0], stacked_samples.shape[1] * stacked_samples.shape[2])
te_samples = np.nan_to_num(stacked_samples.reshape(flat_shape), nan=-999999)

In [72]:
# Predict on the stacked PV + non-PV samples
te_pred = clf.predict(te_samples)

# accuracy_score takes (y_true, y_pred). Accuracy itself is symmetric, so the
# number is unchanged, but the correct order keeps the call safe if it is ever
# swapped for an asymmetric metric such as precision or recall.
test_score = accuracy_score(y_test, te_pred)
print(f'Test Accuracy Score: {test_score}')

Test Accuracy Score: 0.6292644135188867


In [73]:
# Accuracy restricted to the PV pixels (i.e. the share of PV pixels the model finds)
te_pv_samples = np.load(f'{data_dir}/s2/germany/extracted_temporal_data/Austria_pv_samples_monthly.npy')
te_pv_samples = te_pv_samples.reshape(te_pv_samples.shape[0], te_pv_samples.shape[1]*te_pv_samples.shape[2])
te_pv_samples = np.nan_to_num(te_pv_samples, nan=-999999)

te_pred = clf.predict(te_pv_samples)

# Build the all-ones target from the array just loaded instead of relying on a
# `y1` left behind by another cell, which may be stale or belong to a different
# aggregation when cells are run out of order.
y_pv = np.ones(te_pv_samples.shape[0])

# accuracy_score takes (y_true, y_pred)
test_score = accuracy_score(y_pv, te_pred)
print(f'Test Accuracy Score for PV pixels: {test_score}')

Test Accuracy Score for PV pixels: 0.44471673634135794


### Weekly aggregation

In [83]:
def temp_s2_data(stat_name, pv_farms, start_date='2022-01-01', end_date='2022-12-31', train=True, invert=False):
    """Extract per-pixel weekly time series of every band for a list of farms.

    NOTE: this redefines the monthly ``temp_s2_data`` declared earlier in the
    notebook; every call made after this cell runs uses the weekly version.

    For each farm the dataset is cloud-masked, gap-filled with the per-timestep
    spatial mean, restricted to ``[start_date, end_date]``, aggregated per week
    with ``statistics_functions[stat_name]``, linearly interpolated along ``t``
    and finally flattened to one row per pixel.

    Relies on the notebook-level names ``data_dir``, ``bands``,
    ``big_pv_geoms_32632``, ``statistics_functions`` and ``tqdm``.

    Parameters
    ----------
    stat_name : str
        Key into ``statistics_functions`` selecting the weekly aggregation.
    pv_farms : iterable of str
        Paths whose file name ends in ``_<farm_id>.nc``. Only the farm id is
        taken from each path; the dataset itself is always read from
        ``{data_dir}/old_data/data/netcdfs/S2_2022_<farm_id>.nc``.
    start_date, end_date : str
        Bounds of the time slice applied before the weekly resampling.
    train : bool
        If True, pixels come from the dataset clipped with the farm polygon and
        rows that are NaN everywhere are dropped. If False, every pixel of the
        unclipped dataset is kept.
    invert : bool
        Passed to the clip (True keeps what lies outside the polygon). When
        True, a fixed-seed random 25 % of the pixel rows is returned.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_pixels, len(bands), 46)``: weekly columns 4..49 of
        each farm. Farms with fewer than 50 weekly steps are skipped.

    Raises
    ------
    ValueError
        If no farm contributes any sample.
    """
    all_pixels_ts = []
    for pv_farm in tqdm(pv_farms):
        farm_id = int(pv_farm.split('/')[-1].split('_')[-1].split('.')[0])
        geom = big_pv_geoms_32632.loc[farm_id]
        geodf = gpd.GeoDataFrame(geometry=[geom["geometry"]], crs="EPSG:32632")

        # The context manager closes the NetCDF handle once the farm has been
        # tabulated; everything derived from it is in memory by then.
        with xr.open_dataset(f"{data_dir}/old_data/data/netcdfs/S2_2022_{farm_id}.nc", decode_coords="all") as raw:
            # Mask the pixels flagged as cloud (SCL values 8 and 9)
            cloud_mask = np.bitwise_or((raw.SCL == 8), (raw.SCL == 9))
            data = raw.where(~cloud_mask)

            # Fill the gaps (NaN values) spatially with the mean of each time step
            data = data.groupby('t').apply(lambda x: x.fillna(x.mean(dim=('x', 'y'))))
            # NOTE(review): the dataset is tagged EPSG:32633 while the polygon is
            # handed to the clip as EPSG:32632 — confirm 32633 is the files' real CRS.
            data.rio.write_crs("epsg:32633", inplace=True)

            # Restrict the series to [start_date, end_date]
            filtered_ds = data.sel(t=slice(start_date, end_date))

            # Weekly stats (the CRS is written again on the resampled dataset)
            ds_weekly = filtered_ds.resample({"t": "W"}).apply(statistics_functions[stat_name])
            ds_weekly.rio.write_crs("epsg:32633", inplace=True)

            # Linear interpolation along time for the weeks that are still missing
            ds_weekly_interp = ds_weekly.interpolate_na(dim='t', method='linear')

            if train:
                # Crop the farm (or its surroundings when invert=True); the clip
                # is only needed on this branch, so it is not computed otherwise.
                clipped = ds_weekly_interp.rio.clip(geodf.geometry.values, geodf.crs, drop=False, invert=invert)
                df = clipped.to_dataframe().unstack(level='t')
            else:
                df = ds_weekly_interp.to_dataframe().unstack(level='t')

        # Choose the pixel rows ONCE for all bands, so that every band is
        # guaranteed to hold the same pixels in the same order before stacking.
        band_table = df[bands]
        if train:
            band_table = band_table.dropna(how='all')
        if invert:
            # Randomly select 25% of the pixels
            rows_to_filter = int(0.25 * len(band_table))
            band_table = band_table.sample(n=rows_to_filter, random_state=42)

        # Keep weekly columns 4..49 (46 steps); the original note describes this
        # window as [Feb, Nov].
        # NOTE(review): the positional slice assumes the first weekly bin of each
        # farm is the first week of the year — confirm for farms whose first
        # observation comes later.
        n_steps = band_table[bands[0]].shape[1]
        if n_steps >= 50:
            all_pixels_ts.append(np.stack([np.array(band_table[b])[:, 4:50] for b in bands], axis=1))

    if not all_pixels_ts:
        raise ValueError("No farm yielded a weekly series with at least 50 steps.")

    samples = np.vstack(all_pixels_ts)

    return samples

In [84]:
# Initialize a dictionary to store the results
results = {}

# Loop through each statistic
for stat_name in statistics_names:
    # Resample the dataset to monthly means using the current statistic function
    ds_weekly = temp_s2_data(stat_name, pv_farms)
    
    # Store the result in the dictionary
    results[stat_name] = ds_weekly 
    

te_samples= np.concatenate(list(results.values()), axis=-1)
np.save(f'{data_dir}/s2/germany/extracted_temporal_data/Austria_te_pv_samples_weekly.npy', te_samples)

100%|███████████████████████████████████████████| 42/42 [00:37<00:00,  1.12it/s]
100%|███████████████████████████████████████████| 42/42 [00:43<00:00,  1.04s/it]
100%|███████████████████████████████████████████| 42/42 [00:38<00:00,  1.10it/s]


In [85]:
# Initialize a dictionary to store the results
results = {}

# Loop through each statistic
for stat_name in statistics_names:
    # Resample the dataset to monthly means using the current statistic function
    ds_weekly = temp_s2_data(stat_name, pv_farms, invert=True) 
    # Store the result in the dictionary
    results[stat_name] = ds_weekly  
    
non_pv_samples= np.concatenate(list(results.values()), axis=-1)
np.save(f'{data_dir}/s2/germany/extracted_temporal_data/Austria_non_pv_samples_weekly.npy', non_pv_samples)

100%|███████████████████████████████████████████| 42/42 [00:37<00:00,  1.12it/s]
100%|███████████████████████████████████████████| 42/42 [00:43<00:00,  1.04s/it]
100%|███████████████████████████████████████████| 42/42 [00:38<00:00,  1.10it/s]


In [86]:
# load the model from disk
# Load the weekly model from disk. The context manager closes the file handle
# that the bare `open()` call used to leak.
# NOTE: unpickling runs code stored in the file — only load model files from a trusted source.
with open('../models/temporal_models/rf_weekly_1.sav', 'rb') as model_file:
    clf = pickle.load(model_file)

# Weekly sample arrays written by the extraction cells
te_pv_samples = np.load(f'{data_dir}/s2/germany/extracted_temporal_data/Austria_te_pv_samples_weekly.npy')
te_non_pv_samples = np.load(f'{data_dir}/s2/germany/extracted_temporal_data/Austria_non_pv_samples_weekly.npy')

# Targets: 1 for every PV pixel, 0 for every non-PV pixel
y1 = np.ones(te_pv_samples.shape[0])
y0 = np.zeros(te_non_pv_samples.shape[0])

# Features: PV rows first, then non-PV rows; the two trailing axes of each
# pixel are flattened into one vector and NaNs are replaced by -999999
te_samples = np.vstack([te_pv_samples, te_non_pv_samples])
te_samples = te_samples.reshape(te_samples.shape[0], te_samples.shape[1]*te_samples.shape[2])
te_samples = np.nan_to_num(te_samples, nan=-999999)

# Targets in the same order as the stacked feature rows
y_test = np.hstack([y1, y0])

In [87]:
# Predict on the stacked weekly PV + non-PV samples
te_pred = clf.predict(te_samples)

# accuracy_score takes (y_true, y_pred). Accuracy itself is symmetric, so the
# number is unchanged, but the correct order keeps the call safe if it is ever
# swapped for an asymmetric metric such as precision or recall.
test_score = accuracy_score(y_test, te_pred)
print(f'Test Accuracy Score: {test_score}')

Test Accuracy Score: 0.6844520940906483


In [88]:
# Accuracy restricted to the PV pixels (i.e. the share of PV pixels the model finds)
te_pv_samples = np.load(f'{data_dir}/s2/germany/extracted_temporal_data/Austria_te_pv_samples_weekly.npy')
te_pv_samples = te_pv_samples.reshape(te_pv_samples.shape[0], te_pv_samples.shape[1]*te_pv_samples.shape[2])
te_pv_samples = np.nan_to_num(te_pv_samples, nan=-999999)


te_pred = clf.predict(te_pv_samples)

# Build the all-ones target from the array just loaded instead of relying on a
# `y1` left behind by another cell, which may be stale or belong to a different
# aggregation when cells are run out of order.
y_pv = np.ones(te_pv_samples.shape[0])

# accuracy_score takes (y_true, y_pred)
test_score = accuracy_score(y_pv, te_pred)
print(f'Test Accuracy Score for PV pixels: {test_score}')

Test Accuracy Score for PV pixels: 0.64


### Daily 

In [92]:
def s2_data(pv_farms, start_date='2022-01-01', end_date='2022-12-31',train=True, invert=False):
    """Extract per-pixel daily time series of every band for a list of farms.

    For each farm the dataset is cloud-masked, gap-filled with the per-timestep
    spatial mean, re-indexed onto a daily calendar spanning
    ``[start_date, end_date]``, linearly interpolated along ``t`` and finally
    flattened to one row per pixel.

    Relies on the notebook-level names ``data_dir``, ``bands`` and
    ``big_pv_geoms_32632``.

    Parameters
    ----------
    pv_farms : iterable of str
        Paths whose file name ends in ``_<farm_id>.nc``. Only the farm id is
        taken from each path; the dataset itself is always read from
        ``{data_dir}/old_data/data/netcdfs/S2_2022_<farm_id>.nc``.
    start_date, end_date : str
        First and last day of the daily calendar.
    train : bool
        If True, pixels come from the dataset clipped with the farm polygon and
        rows that are NaN everywhere are dropped. If False, every pixel of the
        unclipped dataset is kept.
    invert : bool
        Passed to the clip (True keeps what lies outside the polygon). When
        True, a fixed-seed random 25 % of the pixel rows is returned.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_pixels, len(bands), n_days)``.

    Raises
    ------
    ValueError
        If ``pv_farms`` is empty, so there is nothing to stack.
    """
    # The daily calendar does not depend on the farm, so it is built once
    daily_date_range = pd.date_range(start=pd.to_datetime(start_date),
                                     end=pd.to_datetime(end_date),
                                     freq='D')

    all_pixels_ts = []
    for pv_farm in pv_farms:
        farm_id = int(pv_farm.split('/')[-1].split('_')[-1].split('.')[0])
        geom = big_pv_geoms_32632.loc[farm_id]
        geodf = gpd.GeoDataFrame(geometry=[geom["geometry"]], crs="EPSG:32632")

        # The context manager closes the NetCDF handle once the farm has been
        # tabulated; everything derived from it is in memory by then.
        with xr.open_dataset(f"{data_dir}/old_data/data/netcdfs/S2_2022_{farm_id}.nc", decode_coords="all") as raw:
            ## Mask the pixels flagged as cloud (SCL values 8 and 9)
            cloud_mask = np.bitwise_or((raw.SCL == 8), (raw.SCL == 9))
            data = raw.where(~cloud_mask)

            ## Fill the gaps (NaN values) spatially with the mean of each time step
            data = data.groupby('t').apply(lambda x: x.fillna(x.mean(dim=('x', 'y'))))
            # NOTE(review): the dataset is tagged EPSG:32633 while the polygon is
            # handed to the clip as EPSG:32632 — confirm 32633 is the files' real CRS.
            data.rio.write_crs("epsg:32633", inplace=True)

            ## Linear interpolation to have a full daily time series.
            # NOTE(review): reindex matches labels exactly, so acquisitions are only
            # retained if their `t` values coincide with the daily labels — confirm
            # the files store dates without a time-of-day component.
            ds_daily = data.reindex(t=daily_date_range)
            ds_daily_interp = ds_daily.interpolate_na(dim='t', method='linear')

            if train:
                ## Clip the farm polygon (or its surroundings when invert=True); the
                ## clip is only needed on this branch, so it is not computed otherwise.
                clipped = ds_daily_interp.rio.clip(geodf.geometry.values, geodf.crs, drop=False, invert=invert)
                df = clipped.to_dataframe().unstack(level='t')
            else:
                df = ds_daily_interp.to_dataframe().unstack(level='t')

        # Choose the pixel rows ONCE for all bands, so that every band is
        # guaranteed to hold the same pixels in the same order before stacking.
        band_table = df[bands]
        if train:
            band_table = band_table.dropna(how='all')
        if invert:
            # Randomly select 25% of the pixels
            rows_to_filter = int(0.25 * len(band_table))
            band_table = band_table.sample(n=rows_to_filter, random_state=42)

        all_pixels_ts.append(np.stack([np.array(band_table[b]) for b in bands], axis=1))

    if not all_pixels_ts:
        raise ValueError("No farm files were given; there are no samples to stack.")

    samples = np.vstack(all_pixels_ts)

    return samples

In [93]:
# Daily series for the pixels inside the farm polygons, then for the
# (sub-sampled) pixels outside of them
te_pv_samples = s2_data(pv_farms, invert=False)
te_non_pv_samples = s2_data(pv_farms, invert=True)

# Persist both arrays next to each other
daily_outputs = (
    (f'{data_dir}/s2/germany/extracted_non_temporal/Austria_daily_te_pv_samples.npy', te_pv_samples),
    (f'{data_dir}/s2/germany/extracted_non_temporal/Austria_daily_te_non_pv_samples.npy', te_non_pv_samples),
)
for out_path, sample_array in daily_outputs:
    np.save(out_path, sample_array)

In [94]:
# Stack the samples [PV, non-PV]
te_samples = np.vstack([te_pv_samples, te_non_pv_samples])
te_samples = te_samples.reshape(te_samples.shape[0], te_samples.shape[1]*te_samples.shape[2])
te_samples = np.nan_to_num(te_samples, nan=-999999)

# Stach the targets
y1 = np.ones(te_pv_samples.shape[0])
y0 = np.zeros(te_non_pv_samples.shape[0])
y_test = np.hstack([y1, y0])

In [95]:
# Load the daily (non-temporal) model. The context manager closes the file
# handle that the bare `open()` call used to leak.
# NOTE: unpickling runs code stored in the file — only load model files from a trusted source.
with open('../models/non_temporal_models/rf_non_temporal_1.sav', 'rb') as model_file:
    clf = pickle.load(model_file)
te_pred = clf.predict(te_samples)

# accuracy_score takes (y_true, y_pred). Accuracy itself is symmetric, so the
# number is unchanged, but the correct order keeps the call safe if it is ever
# swapped for an asymmetric metric such as precision or recall.
test_score = accuracy_score(y_test, te_pred)
print(f'Test Accuracy Score: {test_score}')

Test Accuracy Score: 0.583614979045559


In [97]:
# Accuracy restricted to the PV pixels (i.e. the share of PV pixels the model finds)
te_pv_samples = np.load(f'{data_dir}/s2/germany/extracted_non_temporal/Austria_daily_te_pv_samples.npy')

te_pv_samples = te_pv_samples.reshape(te_pv_samples.shape[0], te_pv_samples.shape[1]*te_pv_samples.shape[2])
te_pv_samples = np.nan_to_num(te_pv_samples, nan=-999999)

te_pred = clf.predict(te_pv_samples)

# Build the all-ones target from the array just loaded instead of relying on a
# `y1` left behind by another cell, which may be stale or belong to a different
# aggregation when cells are run out of order.
y_pv = np.ones(te_pv_samples.shape[0])

# accuracy_score takes (y_true, y_pred)
test_score = accuracy_score(y_pv, te_pred)
print(f'Test Accuracy Score for PV pixels: {test_score}')

Test Accuracy Score for PV pixels: 0.3814827706230421
