In [1]:
# geospatial
import geopandas as gpd
import rasterio
from rasterio import features

# base
import pandas as pd
import numpy as np
import os, itertools, time
#from multiprocessing import Pool, cpu_count

# iteration prints 
from IPython.display import clear_output

# plotting
import matplotlib.pyplot as plt

# stats
from scipy import stats

In [2]:
# folder locations
# scratch: where the intermediate mask rasters (p-/np- .tif files) are written
scratch = os.path.join("H:\\", "scratch")
# shapefiles: folder holding the polygon layer read by a later cell
shapefiles = os.path.join("..", "data", "shapefiles")

# merged: root folder of the merged raster products
merged = os.path.join("H:\\", "Merged")
# single-band raster that is multiplied into every rasterized polygon mask
h_mask_loc = os.path.join(merged, "harvest_mask", "harvest_mask.tif")
# load_data looks for <variable>/masked-<variable>-<year>.tif below this folder
masked_structure_locs = os.path.join(merged, "masked_structure")

# variables
# NOTE(review): os.listdir returns every entry in arbitrary order, so any stray
# file in this folder would also be treated as a structure variable
struct_vars = os.listdir(masked_structure_locs)
# structure variables plus the two layers that load_data resolves by name
all_vars  = struct_vars + ["vlce", "Change_Attribution"]

In [3]:
# keys
# lookup table for the structure variables; get_data reads its "variable" and
# "divide_by" columns to rescale the raw raster values
structure_key = pd.read_csv(os.path.join("..", "keys", "structure.csv"))

In [4]:
# polygon layer used for all sampling; the cleaning cell below relies on its
# "NAME_E", "zone" and "subzone" attribute columns
ppa_bec = gpd.read_file(os.path.join(shapefiles, "bc_ppa_bec_hres.shp"))

In [5]:
# some generic cleaning of the gdf
# calculate area for each polygon (units follow the layer's CRS)
ppa_bec["Shap_Ar"] = ppa_bec.area

# assign a protected column to the df
# a polygon counts as protected when it carries a non-empty NAME_E value.
# The null check is explicit: a missing name may be read in as NaN, and NaN is
# truthy in Python, so a plain truthiness test would flag it as protected.
# NOTE(review): NAME_E is presumably the protected-area name - confirm
name_e = ppa_bec["NAME_E"]
ppa_bec = ppa_bec.assign(protected = name_e.notna() & (name_e != ""))

# create a unique value for zone/subzones
ppa_bec["szs"] = ppa_bec["zone"] + "_" + ppa_bec["subzone"]

# filter out any without bec zones (generally sliver polygons)
ppa_bec = ppa_bec[ppa_bec["szs"].notnull()]

# keep only the columns used downstream; copy so the column added below is
# written to an independent frame rather than to a slice of the filtered one
ppa_bec = ppa_bec[["szs", "geometry", "zone", "subzone", "protected", "Shap_Ar"]].copy()

# constant burn value picked up by rasterize_polygon_mask
ppa_bec["raster"] = 1

In [15]:
# alphabetical list of the distinct zone_subzone labels
subzones = sorted(set(ppa_bec["szs"]))

# this subzone is excluded from every loop below
subzones.remove("SBPS_mk")

len(subzones)

127

# Open question: sample an equal number of pixels and THEN remove non-forest, or remove non-forest first and then sample equally?

In [16]:
def rasterize_polygon_mask(gpd, out_loc, meta):
    """Burn polygons into a new raster, apply the harvest mask, and save it.

    Parameters
    ----------
    gpd : frame exposing ``geometry`` and ``raster`` attributes; they are
        paired element-wise as (geometry, burn value). Note that this
        parameter name hides the geopandas alias inside the function.
    out_loc : path of the raster file to create.
    meta : keyword arguments forwarded to ``rasterio.open``.

    Returns
    -------
    The burned array after multiplication by the module-level
    ``h_mask_data``; the same array is written to band 1 of ``out_loc``.
    """
    with rasterio.open(out_loc, "w+", **meta) as dst:
        # use the band of the freshly created file as the burn target
        canvas = dst.read(1)

        geometry_value_pairs = zip(gpd.geometry, gpd.raster)

        rasterized = features.rasterize(
            shapes = geometry_value_pairs,
            fill = 0,
            out = canvas,
            transform = dst.transform,
        )

        # removes pixels impacted by harvest
        # (presumably h_mask_data is 0 at harvested pixels - confirm)
        masked = rasterized * h_mask_data

        dst.write_band(1, masked)

    return masked

def sample_raster(mask, save_loc, big, small):
    """Randomly keep ``small`` of the pixels that equal 1 in ``mask``.

    Parameters
    ----------
    mask : 2-D array; pixels equal to 1 are the sampling candidates.
    save_loc : path of the unsampled raster. The result is written next to
        it with "-sampled" inserted before the file extension.
    big : candidate count supplied by the caller. Kept for backward
        compatibility only: the count is taken from ``mask`` itself, so a
        mismatching value can no longer cause out-of-range indexes or
        restrict the draw to the first pixels in row-major order.
    small : number of pixels to keep.

    Returns
    -------
    uint8 array shaped like ``mask`` with 1 at the selected pixels and 0
    elsewhere. The same array is written to band 1 of the "-sampled" file,
    which is opened with the module-level ``meta``.

    Raises
    ------
    ValueError (from ``np.random.choice``) when ``small`` exceeds the number
    of candidate pixels.
    """
    root, ext = os.path.splitext(save_loc)
    sampled_loc = root + "-sampled" + ext

    # coordinates of every candidate pixel
    indexes = np.where(mask == 1)
    rows = indexes[0]
    cols = indexes[1]

    # draw positions into the candidate list without replacement; the
    # population is the number of candidates actually present in the mask
    chosen = np.random.choice(rows.size, size = small, replace = False)

    # start from a blank raster and switch on the chosen pixels
    blank = np.zeros(mask.shape, dtype = "uint8")
    blank[rows[chosen], cols[chosen]] = 1

    # save with the sampled suffix, so it can be used without
    # reprocessing
    with rasterio.open(sampled_loc, "w+", **meta) as out:
        out.write_band(1, blank)

    return blank

def equal_sample_p_np(soi):
    """Build equally sized protected / unprotected pixel masks for a subzone.

    Both polygon groups of subzone ``soi`` are rasterized; the group with
    more pixels is randomly thinned to the size of the other one. Every
    raster is written to ``scratch``: ``p-<soi>.tif`` / ``np-<soi>.tif`` for
    the full masks and the same names with a ``-sampled`` suffix for the
    balanced ones.

    Reads the module-level ``ppa_bec``, ``scratch`` and ``meta``, and resets
    numpy's global random seed on every call so the draw is repeatable.

    Returns
    -------
    (p_sampled, np_sampled) : the balanced protected and unprotected masks.
    """
    np.random.seed(69420)

    # polygons belonging to the subzone of interest
    in_subzone = ppa_bec[ppa_bec["szs"] == soi]

    def _dissolved_polygons(protected_flag):
        # one protection class, restricted to areal geometry types (this is a
        # type filter, not a validity check), merged into a single feature
        subset = in_subzone[in_subzone["protected"] == protected_flag]
        is_areal = subset.geometry.type.isin(["Polygon", "MultiPolygon"])
        return subset[is_areal].dissolve(by = "szs")

    p_sub_clean = _dissolved_polygons(True)
    np_sub_clean = _dissolved_polygons(False)

    # save locations
    p_raster_loc = os.path.join(scratch, "p-" + soi + ".tif")
    np_raster_loc = os.path.join(scratch, "np-" + soi + ".tif")

    # generate masks from the polygons
    p_mask = rasterize_polygon_mask(p_sub_clean, p_raster_loc, meta)
    np_mask = rasterize_polygon_mask(np_sub_clean, np_raster_loc, meta)

    # this is where the forest mask would come in
    # unless i do it in R?

    p_pixels = np.sum(p_mask)
    np_pixels = np.sum(np_mask)

    # thin the larger class; the other one is kept whole
    if p_pixels < np_pixels:
        np_sampled = sample_raster(np_mask, np_raster_loc, np_pixels, p_pixels)
        p_sampled = p_mask
        kept_whole, kept_whole_loc = p_sampled, p_raster_loc
    else:
        p_sampled = sample_raster(p_mask, p_raster_loc, p_pixels, np_pixels)
        np_sampled = np_mask
        kept_whole, kept_whole_loc = np_sampled, np_raster_loc

    # the untouched mask is also stored under the "-sampled" name so both
    # files can be found the same way later
    unsampled_loc = kept_whole_loc[:-4] + "-sampled" + kept_whole_loc[-4:]

    with rasterio.open(unsampled_loc, "w+", **meta) as out:
        out.write_band(1, kept_whole)

    return p_sampled, np_sampled

def load_data(variable, year):
    """Read band 1 of the raster that belongs to ``variable`` (and ``year``).

    Structure variables (members of ``struct_vars``) come from
    ``masked_structure_locs``; "Change_Attribution" and "vlce" come from
    ``merged``. The "Change_Attribution" file name does not include the year.

    Returns the band as an array, or None (after printing a message) when
    ``variable`` is not recognised.
    """
    year_txt = str(year)

    if variable in struct_vars:
        path_parts = (masked_structure_locs, variable, "masked-" + variable + "-" + year_txt + ".tif")
    elif variable == "Change_Attribution":
        path_parts = (merged, variable, "BC-" + variable + ".tif")
    elif variable == "vlce":
        path_parts = (merged, variable, "BC-" + variable + "-" + year_txt + ".tif")
    else:
        print("Not a valid variable")
        return

    with rasterio.open(os.path.join(*path_parts)) as src:
        return src.read(1)

def get_data(mask, variable, year):
    """Pull the values of the currently loaded raster at the masked pixels.

    The raster is NOT passed in: the module-level ``rst_data`` is read, so
    the caller must have loaded the right variable beforehand. ``year`` is
    accepted but not used here.

    For structure variables (members of ``struct_vars``) zero values are
    dropped and the rest is divided by the variable's "divide_by" entry in
    ``structure_key``. Other variables are returned as extracted.
    """
    is_structure = variable in struct_vars

    if is_structure:
        key_rows = structure_key[structure_key["variable"] == variable]
        divisor = key_rows.iloc[0]["divide_by"]

    # row / column positions of the selected pixels
    selected = np.where(mask == 1)
    values = rst_data[selected[0], selected[1]]

    if not is_structure:
        return values

    nonzero_values = values[values != 0]
    return nonzero_values / divisor
    
def generate_df(soi, variable, year):
    """Assemble the protected vs. unprotected table for one subzone/variable.

    The two "-sampled" masks in ``scratch`` are reused when both exist;
    otherwise they are produced by ``equal_sample_p_np``. Structure variables
    yield one "value" row per pixel; "vlce" and "Change_Attribution" yield
    one row per class with columns "class_val" and "cells". The columns
    "protected", "subzone", "year" and "variable" are appended in both cases.

    Returns the combined DataFrame, or None (after printing a message) when
    ``variable`` is not recognised.
    """
    # protected mask first, unprotected second
    mask_locs = [
        os.path.join(scratch, prefix + "-" + soi + "-sampled" + ".tif")
        for prefix in ("p", "np")
    ]

    if all(os.path.isfile(loc) for loc in mask_locs):
        # both masks are already on disk, so don't generate new ones
        masks = []
        for loc in mask_locs:
            with rasterio.open(loc) as rst:
                masks.append(rst.read(1))
    else:
        masks = equal_sample_p_np(soi)

    p_data = get_data(masks[0], variable, year)
    np_data = get_data(masks[1], variable, year)

    def _class_table(values):
        # one row per distinct value together with its pixel count
        pairs = np.transpose(np.unique(values, return_counts = True))
        return pd.DataFrame(pairs, columns = ["class_val", "cells"])

    if variable in struct_vars:
        protected_df = pd.DataFrame(p_data, columns = ["value"])
        unprotected_df = pd.DataFrame(np_data, columns = ["value"])
    elif variable in ["vlce", "Change_Attribution"]:
        protected_df = _class_table(p_data)
        unprotected_df = _class_table(np_data)
    else:
        print("Not a valid variable")
        return

    protected_df["protected"] = True
    unprotected_df["protected"] = False

    df = pd.concat([protected_df, unprotected_df])

    df["subzone"] = soi
    df["year"] = year
    df["variable"] = variable

    return df

1 min per variable-year-soi
5 variables
128 soi
1 year
10 hours

35 years
15 days


Should I be including topographic variables?

In [None]:
year = 2015

# Read the harvest mask once. rasterize_polygon_mask, sample_raster and
# equal_sample_p_np pick up h_mask_data / meta as module-level names.
# The context manager closes the dataset as soon as the band and metadata are
# in memory, so no file handle is left open if the long loop below is
# interrupted.
with rasterio.open(h_mask_loc) as h_mask:
    h_mask_data = h_mask.read(1)
    meta = h_mask.meta.copy()
meta["nodata"] = 0

# per-subzone tables of the two categorical layers, written out at the end
vlce_dfs = []
change_dfs = []

num_total = len(subzones) * len(all_vars)
num_done = 0
for variable in all_vars:

    # get_data reads this module-level array, so each raster is loaded only
    # once per variable rather than once per subzone
    rst_data = load_data(variable, year)

    for soi in subzones:

        save_name = soi + "-" + variable + "-" + str(year) + ".csv"
        save_loc = os.path.join("..", "data", "structure", save_name)

        start_time = time.time()

        print(soi, variable, year)
        print(num_done + 1, "/", num_total)

        # an existing csv means this combination was already processed
        if not os.path.isfile(save_loc):

            df = generate_df(soi, variable, year)

            if variable in struct_vars:

                df.to_csv(save_loc, index = False)

            elif variable == "vlce":
                vlce_dfs.append(df)

            elif variable == "Change_Attribution":
                change_dfs.append(df)
            else:
                print("not a valid variable")

        num_done += 1

        end_time = time.time()

        print(end_time - start_time)

# free the mask array; the dataset itself is already closed
del h_mask
del h_mask_data

# NOTE(review): unlike the per-subzone csv files these are written with the
# default index, so the (repeating) row index becomes the first column
pd.concat(vlce_dfs).to_csv(os.path.join("..", "data", "vlce.csv"))
pd.concat(change_dfs).to_csv(os.path.join("..", "data", "change_attribution.csv"))

BAFA_un loreys_height 2015
1 / 635
47.26367807388306
BAFA_unp loreys_height 2015
2 / 635
34.97203493118286
BG_xh loreys_height 2015
3 / 635
0.0009999275207519531
BG_xw loreys_height 2015
4 / 635
0.0009987354278564453
BWBS_dk loreys_height 2015
5 / 635
0.002998828887939453
BWBS_mk loreys_height 2015
6 / 635
0.001998424530029297
BWBS_mw loreys_height 2015
7 / 635
0.0019979476928710938
BWBS_vk loreys_height 2015
8 / 635
0.001997232437133789
BWBS_wk loreys_height 2015
9 / 635
0.002001047134399414
CDF_mm loreys_height 2015
10 / 635
0.0009984970092773438
CMA_un loreys_height 2015
11 / 635
0.001998424530029297
CMA_unp loreys_height 2015
12 / 635
0.0009984970092773438
CMA_wh loreys_height 2015
13 / 635
0.0009989738464355469
CWH_dm loreys_height 2015
14 / 635
0.0009989738464355469
CWH_ds loreys_height 2015
15 / 635
0.0010004043579101562
CWH_mm loreys_height 2015
16 / 635
0.0019989013671875
CWH_ms loreys_height 2015
17 / 635
0.0009989738464355469
CWH_vh loreys_height 2015
18 / 635
0.001000165939

In [None]:
# Pre-generate the balanced protected / unprotected masks for every subzone.
# The harvest mask is read inside a context manager so the dataset is closed
# right away instead of staying open for the whole loop (or forever, if the
# loop is interrupted). h_mask_data and meta are read as module-level names by
# the rasterizing / sampling functions.
with rasterio.open(h_mask_loc) as h_mask:
    h_mask_data = h_mask.read(1)
    meta = h_mask.meta.copy()
meta["nodata"] = 0

for i, soi in enumerate(subzones):
    clear_output(wait = True)
    if i > 0:
        # timing of the previous iteration, shown after the output is cleared
        print("time for " + subzones[i - 1], end_time - start_time)
    print(soi)
    print(i + 1, "/", len(subzones))

    start_time = time.time()
    equal_sample_p_np(soi)
    end_time = time.time()

# free the mask array; the dataset itself is already closed
del h_mask
del h_mask_data