In [2]:
import xarray as xr
import numpy as np
import pandas as pd
import copy
from glob import glob
from shapely.geometry import box
import geopandas as gpd
from tqdm import tqdm
import pickle
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

#sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [7]:
data_dir = '/mnt/CEPH_PROJECTS/sao/openEO_Platform'

In [8]:
big_pv_geoms = gpd.read_file(f"{data_dir}/germany_photovoltaic.shp")
print(f"Number of selected PV Farms: {len(big_pv_geoms)}")

Number of selected PV Farms: 4458


In [9]:
bands = ['B02','B03','B04',
         'B05','B06','B07','B08',
         'B8A', 'B11', 'B12']

In [10]:
def temp_s2_data(pv_farms, invert=False):

    all_pixels_ts = []
    for pv_farm in tqdm(pv_farms):
        farm_id = int(pv_farm.split('/')[-1].split('_')[-1].split('.')[0])
        data = xr.open_dataset(f"{data_dir}/s2/germany/s2_2022_{farm_id}.nc",decode_coords="all")
        crs = data.rio.crs
        geom = big_pv_geoms.iloc[farm_id]

         # Filter clouds pixels
        cloud_mask = np.bitwise_or(np.bitwise_or((data.SCL == 8),(data.SCL == 9)),(data.SCL == 3))
        data = data.where(~cloud_mask)  
        data = data.drop_vars(["SCL","B01","B09"])
        # Weekly stats
        ds_weekly = data.median(dim="t").to_dataarray()

        # Crop the farm
        geodf = gpd.GeoDataFrame(geometry=[geom["geometry"]],crs=4326).to_crs(crs)
        ds_weekly.rio.write_crs(crs, inplace=True)
        clipped = ds_weekly.rio.clip(geodf.geometry.values, geodf.crs, drop=False, invert=invert)    

        ds_np = clipped.transpose("y","x","variable").values.reshape(len(clipped["y"])*len(clipped["x"]),len(clipped["variable"]))
        
        all_pixels_ts.append(ds_np)
    
    samples = np.vstack(all_pixels_ts)
    
    return samples

In [11]:
# Extract S2 data for the photvoltaic farms - pixels inside the polygons "labels as 1"
pv_farms = glob(f"{data_dir}/s2/germany/*.nc")
pv_samples = temp_s2_data(pv_farms)
np.save(f'{data_dir}/s2/germany/extracted_temporal_data/pv_samples_t_median.npy', pv_samples)

100%|███████████████████████████████████████| 3687/3687 [09:01<00:00,  6.80it/s]


In [12]:
non_pv_samples = temp_s2_data(pv_farms,invert=True) 
np.save(f'{data_dir}/s2/germany/extracted_temporal_data/non_pv_samples_t_median.npy', non_pv_samples)

100%|███████████████████████████████████████| 3687/3687 [06:24<00:00,  9.58it/s]


#### Combine both PV and Non-PV togehter 

In [10]:
pv_samples = np.load(f'{data_dir}/s2/germany/extracted_temporal_data/pv_samples_t_median.npy')
non_pv_samples = np.load(f'{data_dir}/s2/germany/extracted_temporal_data/non_pv_samples_t_median.npy')

In [11]:
pv_samples.shape

(4445865, 10)

In [15]:
y1 = np.ones(pv_samples.shape[0])
y0 = np.zeros(non_pv_samples.shape[0])

In [16]:
tr_data = np.vstack([pv_samples, non_pv_samples])
y = np.hstack([y1, y0])

In [17]:
print(tr_data.shape)
print(y.shape)

(8891730, 10)
(8891730,)


### RF training

In [18]:
tr_data = np.nan_to_num(tr_data, nan=-999999)

In [26]:
skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=2024) # for cross validation

# Creating loop for the stratified k fold
scores = []
i = 0
for train, val in skf.split(tr_data, y):
    print(f'########### Fold number {i+1} ')

    # spliting the data
    x_train, x_val = tr_data[train], tr_data[val]
    y_train, y_val = y[train], y[val]
    
    clf = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=2024)
    clf.fit(x_train, y_train)
    
    
    # Validation
    y_pred = clf.predict(x_val)
    score = accuracy_score(y_pred, y_val)
    scores.append(score)
    print(f'Accuracy Score: {score}')
    
    filename = f'../models/temporal_models/rf_{i+1}_median_depth_15.sav'
    pickle.dump(clf, open(filename, 'wb'))

    i+=1
    break
print(f'Average F1 Score: {np.mean(scores)}')

########### Fold number 1 
Accuracy Score: 0.7634566051825685
Average F1 Score: 0.7634566051825685
