In [2]:
# geospatial
import geopandas as gpd
import rasterio
from rasterio import features

# base
import pandas as pd
import numpy as np
import os, itertools, time
#from multiprocessing import Pool, cpu_count

# iteration prints
from IPython.display import clear_output

# plotting
import matplotlib.pyplot as plt

# stats
from scipy import stats

In [3]:
# folder locations
scratch = os.path.join("H:\\", "scratch")
shapefiles = os.path.join("..", "data", "shapefiles")

merged = os.path.join("H:\\", "Merged")
h_mask_loc = os.path.join(merged, "harvest_mask", "harvest_mask.tif")
masked_structure_locs = os.path.join(merged, "masked_structure")
bec_mask_loc = os.path.join(merged, "bec_masks")

# variables
# one sub-folder per structure variable. os.listdir gives no ordering guarantee,
# so sort the names: all_vars drives the processing order and the column order
# of the exported CSVs, and both should be identical on every run
structure_vars = sorted(os.listdir(masked_structure_locs))
elev_vars = ["elev", "slope"]
all_vars  = elev_vars + structure_vars + ["vlce", "Change_Attribution"]

In [4]:
all_vars

['elev',
 'slope',
 'elev_cv',
 'loreys_height',
 'percentage_first_returns_above_2m',
 'total_biomass',
 'vlce',
 'Change_Attribution']

In [5]:
# lookup table for the continuous structure variables; get_data reads its
# "variable" and "divide_by" columns to rescale the raster values
structure_key = pd.read_csv(os.path.join("..", "keys", "continuous.csv"))

In [6]:
# polygon layer whose NAME_E, zone, subzone and geometry fields are used by the cells below
ppa_bec = gpd.read_file(os.path.join(shapefiles, "bc_ppa_bec_hres.shp"))

In [7]:
# some generic cleaning of the gdf
# calculate area for each polygon
ppa_bec["Shap_Ar"] = ppa_bec.area

# flag polygons that carry a protected-area name (NAME_E).
# a missing name may come through as None or as NaN; NaN is truthy in Python,
# so a plain truthiness test would mark those polygons as protected. test for
# missing and empty values explicitly instead
ppa_bec = ppa_bec.assign(protected = ppa_bec["NAME_E"].notna() & (ppa_bec["NAME_E"] != ""))

# create a unique value for zone/subzones
ppa_bec["szs"] = ppa_bec["zone"] + "_" + ppa_bec["subzone"]

# filter out any without bec zones (generally sliver polygons)
ppa_bec = ppa_bec[ppa_bec["szs"].notnull()]

# keep only the columns used downstream; copy so that adding the "raster"
# column below writes to an independent frame rather than a slice
ppa_bec = ppa_bec[["szs", "geometry", "zone", "subzone", "protected", "Shap_Ar"]].copy()

# constant burn value read by rasterize_polygon_mask
ppa_bec["raster"] = 1

In [8]:
# alphabetical list of the distinct zone_subzone labels
subzones = sorted(set(ppa_bec["szs"]))

# SBPS_mk has no protected area, so it cannot be compared; it is still useful
# for some other outputs and is therefore processed individually later
subzones.remove("SBPS_mk")

In [12]:
def rasterize_polygon_mask(gpd, out_loc, meta):
    """Burn the polygons of a GeoDataFrame into a new raster file.

    Parameters
    ----------
    gpd : GeoDataFrame with a ``geometry`` column and a ``raster`` column; the
        ``raster`` value of each row is the value burned for its geometry.
        (Inside this function the name hides the module alias ``gpd``.)
    out_loc : path of the raster that is opened in "w+" mode and written to.
    meta : keyword arguments passed to ``rasterio.open`` for the new file.

    Returns
    -------
    The burned array, which is also written to band 1 of ``out_loc``.
    """
    with rasterio.open(out_loc, "w+", **meta) as dst:
        # band 1 of the freshly opened file is the array that gets burned into
        canvas = dst.read(1)

        # (geometry, burn value) pairs, produced lazily
        geom_value_pairs = zip(gpd.geometry, gpd.raster)

        mask = features.rasterize(
            shapes = geom_value_pairs,
            fill = 0,
            out = canvas,
            transform = dst.transform,
        )

        # harvest masking is currently disabled; multiplying the mask by
        # h_mask_data at this point would mask out harvested pixels
        dst.write_band(1, mask)

    return mask

def sample_raster(mask, save_loc, big, small):
    """Randomly keep `small` of the cells of `mask` that equal 1.

    Parameters
    ----------
    mask : 2-D array; cells equal to 1 are the candidates.
    save_loc : path of the unsampled mask. Only its file name is used: the
        result is saved in `bec_mask_loc` with "-sampled" inserted before the
        4-character extension.
    big : upper bound (exclusive) of the candidate positions drawn from; the
        caller passes the number of candidate cells.
    small : number of cells to keep.

    Returns
    -------
    uint8 array shaped like `mask`, 1 at the chosen cells and 0 elsewhere.
    The same array is written to band 1 of the "-sampled" file, which is
    created with the module-level `meta`.
    """
    # "<folder>/<name>.tif" -> "<bec_mask_loc>/<name>-sampled.tif"
    stem, extension = save_loc[:-4], save_loc[-4:]
    sampled_name = os.path.split(stem + "-sampled" + extension)[1]
    sampled_loc = os.path.join(bec_mask_loc, sampled_name)

    # positions of every candidate cell, then a draw without replacement of
    # `small` positions among the first `big` of them
    hits = np.where(mask == 1)
    keep = np.random.choice(np.arange(0, big), size = small, replace = False)

    # start from an empty raster and switch on only the chosen cells
    blank = np.zeros(mask.shape, dtype = "uint8")
    blank[hits[0][keep], hits[1][keep]] = 1

    # stored with the sampled suffix so it can be reused without reprocessing
    with rasterio.open(sampled_loc, "w+", **meta) as out:
        out.write_band(1, blank)

    return blank

def equal_sample_p_np(soi):
    """Build protected / unprotected masks of equal cell count for one subzone.

    The polygons of subzone `soi` are split on the "protected" column,
    rasterized into `scratch`, and the mask with more cells is randomly
    thinned to the size of the other one. Both final masks are written to
    `bec_mask_loc` with a "-sampled" suffix. Reads the module-level `ppa_bec`
    and `meta`, and seeds NumPy's global random generator with a fixed value.

    Returns
    -------
    (p_sampled, np_sampled) : the protected and unprotected mask arrays.
    """
    np.random.seed(69420)

    # polygons belonging to the subzone of interest
    in_subzone = ppa_bec[ppa_bec["szs"] == soi]

    def _dissolved(is_protected):
        # rows with the requested protection status, restricted to Polygon /
        # MultiPolygon geometries and merged into a single feature
        part = in_subzone[in_subzone["protected"] == is_protected]
        is_areal = (part.geometry.type == "Polygon") | (part.geometry.type == "MultiPolygon")
        return part[is_areal].dissolve(by = "szs")

    p_sub_clean = _dissolved(True)
    np_sub_clean = _dissolved(False)

    # full (unsampled) masks go to the scratch folder
    p_raster_loc = os.path.join(scratch, "p-" + soi + ".tif")
    np_raster_loc = os.path.join(scratch, "np-" + soi + ".tif")

    p_mask = rasterize_polygon_mask(p_sub_clean, p_raster_loc, meta)
    np_mask = rasterize_polygon_mask(np_sub_clean, np_raster_loc, meta)

    # a forest mask could be applied at this point (possibly done in R instead)

    p_pixels = np.sum(p_mask)
    np_pixels = np.sum(np_mask)

    # thin the larger mask; the smaller one is used unchanged
    if p_pixels < np_pixels:
        np_sampled = sample_raster(np_mask, np_raster_loc, np_pixels, p_pixels)
        p_sampled = p_mask
        untouched_prefix, untouched_mask = "p-", p_sampled
    else:
        p_sampled = sample_raster(p_mask, p_raster_loc, p_pixels, np_pixels)
        np_sampled = np_mask
        untouched_prefix, untouched_mask = "np-", np_sampled

    # the unchanged mask is stored under the "-sampled" name as well, so both
    # files of the pair exist side by side
    unsampled_loc = os.path.join(bec_mask_loc, untouched_prefix + soi + "-sampled.tif")
    with rasterio.open(unsampled_loc, "w+", **meta) as out:
        out.write_band(1, untouched_mask)

    return p_sampled, np_sampled

def load_data(variable, year):
    """Read band 1 of the raster that stores `variable`.

    The file location depends on the kind of variable:
    structure variables live under `masked_structure_locs` and are
    year-specific; "Change_Attribution" / "Greatest_Change_Year" have a single
    file; "vlce" is year-specific; elevation variables share the "elevation"
    folder. For any other name a message is printed and None is returned.
    """
    year_suffix = "-" + str(year) + ".tif"

    if variable in structure_vars:
        path_parts = (masked_structure_locs, variable, "masked-" + variable + year_suffix)
    elif variable in ("Change_Attribution", "Greatest_Change_Year"):
        path_parts = (merged, variable, "BC-" + variable + ".tif")
    elif variable == "vlce":
        path_parts = (merged, variable, "BC-" + variable + year_suffix)
    elif variable in elev_vars:
        path_parts = (merged, "elevation", "BC-" + variable + ".tif")
    else:
        print("Not a valid variable")
        return

    with rasterio.open(os.path.join(*path_parts)) as src:
        return src.read(1)

def get_data(mask, variable, year):
    """Pull the values of the currently loaded raster at the cells where mask == 1.

    Works on module-level state: `rst_data` is the raster being sampled and,
    for "Change_Attribution", `change_year` is sampled at the same cells.
    `year` is accepted but not used.

    Returns
    -------
    structure variable : the values divided by the variable's "divide_by"
        entry in `structure_key`.
    "Change_Attribution" : a tuple (values, change years).
    anything else : the raw values.
    """
    is_structure = variable in structure_vars

    if is_structure:
        # scaling factor registered for this variable
        divisor = structure_key[structure_key["variable"] == variable].iloc[0]["divide_by"]

    # row / column positions of the selected cells
    hits = np.where(mask == 1)
    selected = rst_data[hits[0], hits[1]]

    if is_structure:
        return selected / divisor

    if variable == "Change_Attribution":
        return selected, change_year[hits[0], hits[1]]

    return selected
    
def generate_df(soi, variable, year):
    """Assemble the protected / unprotected table of one variable for one subzone.

    The two "-sampled" masks of `soi` are read from `bec_mask_loc` when both
    files exist; otherwise they are produced by `equal_sample_p_np`. The
    values returned by `get_data` for each mask are then shaped according to
    the kind of variable:

    * structure / elevation variables: one row per cell, column "value"
    * "vlce": one row per class, columns "class_val" and "cells"
    * "Change_Attribution": count "n" per (class_val, year_disturbed) pair

    Columns "protected", "subzone" and "variable" are added, plus "year" for
    everything except "Change_Attribution". For an unknown variable a message
    is printed and None is returned.
    """
    mask_locs = [
        os.path.join(bec_mask_loc, prefix + "-" + soi + "-sampled" + ".tif")
        for prefix in ("p", "np")
    ]

    # reuse the stored masks only when the pair is complete
    if all(os.path.isfile(loc) for loc in mask_locs):
        masks = []
        for loc in mask_locs:
            with rasterio.open(loc) as rst:
                masks.append(rst.read(1))
    else:
        masks = equal_sample_p_np(soi)

    p_data = get_data(masks[0], variable, year)
    np_data = get_data(masks[1], variable, year)

    def _values_frame(data):
        # one row per sampled cell
        return pd.DataFrame(data, columns = ["value"])

    def _class_counts(data):
        # number of cells found for each class value
        counts = np.transpose(np.unique(data, return_counts = True))
        return pd.DataFrame(counts, columns = ["class_val", "cells"])

    def _disturbance_counts(data):
        # number of cells for each (class, year) combination
        frame = pd.DataFrame(np.transpose(data), columns = ["class_val", "year_disturbed"])
        frame["n"] = 1
        return frame.groupby(["class_val", "year_disturbed"]).count()[["n"]]

    if variable in structure_vars or variable in elev_vars:
        to_frame = _values_frame
    elif variable == "vlce":
        to_frame = _class_counts
    elif variable == "Change_Attribution":
        to_frame = _disturbance_counts
    else:
        print("Not a valid variable")
        return

    df1 = to_frame(p_data)
    df2 = to_frame(np_data)

    df1["protected"] = True
    df2["protected"] = False

    df = pd.concat([df1, df2])
    df["subzone"] = soi
    df["variable"] = variable

    # the disturbance table already carries its own year column
    if variable != "Change_Attribution":
        df["year"] = year

    return df

Estimated runtime: roughly 1 minute per variable-year-soi (subzone) combination.

- 5 variables x 128 soi x 1 year: about 10 hours
- all 35 years: about 15 days


TODO: should topographic variables be included?

In [9]:
year = 2015

# the harvest mask supplies the raster profile (meta) used to create every new
# mask file. open it in a context manager so the file handle is released as
# soon as the data and profile have been read
with rasterio.open(h_mask_loc) as h_mask:
    # kept in memory for the optional harvest masking in rasterize_polygon_mask
    h_mask_data = h_mask.read(1)
    meta = h_mask.meta.copy()
meta["nodata"] = 0

start_time = time.time()

iter_subzones = subzones

num_total = len(iter_subzones) * len(all_vars)
num_done = 0

for variable in all_vars:
    clear_output(wait = True)
    print("loading", variable)
    load_start = time.time()

    # get_data reads rst_data (and change_year) as module-level state, so one
    # raster is loaded per variable and reused for every subzone
    rst_data = load_data(variable, year)

    if variable == "Change_Attribution":
        change_year = load_data("Greatest_Change_Year", year)

    load_end = time.time()

    print("loading", variable, "took", load_end - load_start)

    for soi in iter_subzones:

        # output folder and file name depend on the kind of variable; files
        # that are not year-specific carry no year in their name
        if variable in structure_vars:
            save_name = soi + "-" + variable + "-" + str(year) + ".csv"
            save_loc = os.path.join("..", "data", "structure", save_name)

        elif variable == "Change_Attribution":
            save_name = soi + "-" + variable + ".csv"
            save_loc = os.path.join("..", "data", "disturbance", save_name)

        elif variable == "vlce":
            save_name = soi + "-" + variable + "-" + str(year) + ".csv"
            save_loc = os.path.join("..", "data", "vlce", save_name)

        elif variable in elev_vars:
            save_name = soi + "-" + variable + ".csv"
            save_loc = os.path.join("..", "data", "elev", save_name)

        else:
            # without this guard the save_loc of the previous iteration would
            # be reused silently
            raise ValueError("no output location defined for variable: " + str(variable))

        start_time = time.time()

        print(soi, variable, year)
        print(num_done + 1, "/", num_total)

        # existing files are never recomputed, so the loop can be resumed
        if not os.path.isfile(save_loc):

            df = generate_df(soi, variable, year)

            if variable == "Change_Attribution":
                # the (class_val, year_disturbed) grouping lives in the index
                df.to_csv(save_loc)

            else:
                df.to_csv(save_loc, index = False)

        num_done += 1

        end_time = time.time()

        # seconds spent on this subzone
        print(end_time - start_time)

    if 'change_year' in globals():
        del change_year

del h_mask
del h_mask_data

loading Change_Attribution
loading Change_Attribution took 55.01481485366821
BAFA_un Change_Attribution 2015
763 / 889
0.0010094642639160156
BAFA_unp Change_Attribution 2015
764 / 889
0.0020046234130859375
BG_xh Change_Attribution 2015
765 / 889
0.0009834766387939453
BG_xw Change_Attribution 2015
766 / 889
0.0009989738464355469
BWBS_dk Change_Attribution 2015
767 / 889
0.0009989738464355469
BWBS_mk Change_Attribution 2015
768 / 889
0.0019991397857666016
BWBS_mw Change_Attribution 2015
769 / 889
0.00099945068359375
BWBS_vk Change_Attribution 2015
770 / 889
0.0010004043579101562
BWBS_wk Change_Attribution 2015
771 / 889
0.00099945068359375
CDF_mm Change_Attribution 2015
772 / 889
0.00099945068359375
CMA_un Change_Attribution 2015
773 / 889
0.0009999275207519531
CMA_unp Change_Attribution 2015
774 / 889
0.000997781753540039
CMA_wh Change_Attribution 2015
775 / 889
0.0009996891021728516
CWH_dm Change_Attribution 2015
776 / 889
0.0009992122650146484
CWH_ds Change_Attribution 2015
777 / 889


# New df method

In [17]:
def load_data(variable, year):
    """Read band 1 of the "BC-" raster that stores `variable`.

    This definition replaces the earlier `load_data`: structure variables are
    read from `merged/<variable>/` rather than from the masked-structure
    folder. Year-specific layers (structure variables and "vlce") carry the
    year in the file name; "Change_Attribution", "Greatest_Change_Year" and
    the elevation variables do not. The resolved path is printed before the
    file is opened. For any other name a message is printed and None is
    returned.
    """
    yearly_name = "BC-" + variable + "-" + str(year) + ".tif"
    static_name = "BC-" + variable + ".tif"

    if variable in structure_vars:
        folder, file_name = variable, yearly_name
    elif variable in ("Change_Attribution", "Greatest_Change_Year"):
        folder, file_name = variable, static_name
    elif variable == "vlce":
        folder, file_name = variable, yearly_name
    elif variable in elev_vars:
        folder, file_name = "elevation", static_name
    else:
        print("Not a valid variable")
        return

    file_loc = os.path.join(merged, folder, file_name)
    print(file_loc)

    with rasterio.open(file_loc) as src:
        return src.read(1)

def get_data(mask, in_data, year, var_name = None):
    """Return the values of `in_data` at the cells where `mask` equals 1.

    Parameters
    ----------
    mask : 2-D array; cells equal to 1 are extracted.
    in_data : 2-D array indexed with the same row / column positions as `mask`.
    year : accepted for call compatibility; not used.
    var_name : name of the variable held in `in_data`. When it is one of
        `structure_vars`, the values are divided by that variable's
        "divide_by" entry in `structure_key`. If omitted, the module-level
        name `variable` is used, which is what existing callers set before
        calling; passing the name explicitly avoids depending on that
        hidden state.

    Returns
    -------
    1-D array of the extracted (and, for structure variables, rescaled) values.
    """
    if var_name is None:
        # backward compatible path: fall back to the global set by the caller
        var_name = variable

    indexes = np.where(mask == 1)

    data_arr = in_data[indexes[0], indexes[1]]

    if var_name in structure_vars:
        divisor = structure_key[structure_key["variable"] == var_name].iloc[0]["divide_by"]
        data_arr = data_arr / divisor

    return data_arr

This is where all the data is loaded. The arrays work like a raster stack that is later transformed into a DataFrame.

In [15]:
year = 2015

# read every raster once and keep it in memory, keyed by variable name
arrays = {}
start = time.time()
for variable in all_vars:
    mid = time.time()
    arrays[variable] = load_data(variable, year)
    # seconds needed to read this variable
    print(variable, time.time() - mid)

# the year-of-change raster is not part of all_vars and is loaded separately
change_year = load_data("Greatest_Change_Year", year)

# added last, so the dict order is all_vars followed by "change_year";
# the later cells rely on that order when naming DataFrame columns
arrays["change_year"] = change_year
del change_year
end = time.time()
# total loading time in seconds
print(end - start)

elev 55.374565839767456
slope 66.64747047424316
elev_cv 26.845725536346436
loreys_height 47.69898772239685
percentage_first_returns_above_2m 33.38028168678284
total_biomass 53.72409772872925
vlce 14.576386451721191
Change_Attribution 13.53196096420288
340.7567434310913


In [18]:
# for the single subzone with no protected area. needs to be included in the BEC proportions figure
# only one subzone is affected, otherwise this would be handled inside the functions

soi = "SBPS_mk"
year = 2015

# only the raster profile of the harvest mask is needed to create the new mask
# file; the pixel values are not used in this cell, so they are not read. the
# context manager closes the file handle immediately
with rasterio.open(h_mask_loc) as h_mask:
    meta = h_mask.meta.copy()
meta["nodata"] = 0
del h_mask

start = time.time()
print(soi, year)
csv_save = soi + "-" + str(year) + ".csv"
save_location = os.path.join("..", "data", "all_vars", csv_save)

# skip the work when the csv already exists
if not os.path.isfile(save_location):

    np.random.seed(69420)

    # filter ppa for the subzone of interest
    subzone_filter = ppa_bec[ppa_bec["szs"] == soi]

    # select unprotected shapes (there are no protected ones in this subzone)
    np_sub = subzone_filter[subzone_filter["protected"] == False]
    # keep only Polygon / MultiPolygon geometries, then merge them
    np_sub_clean = np_sub[(np_sub.geometry.type == "Polygon") | (np_sub.geometry.type == "MultiPolygon")]
    np_sub_clean = np_sub_clean.dissolve(by = "szs")

    # save locations
    np_raster_loc = os.path.join(scratch, "np-" + soi + ".tif")

    np_mask = rasterize_polygon_mask(np_sub_clean, np_raster_loc, meta)

    print("making data arrays")
    unprotected = []
    for key in arrays:
        # get_data reads the module-level name `variable` to decide whether
        # the values need rescaling, so it has to be set before each call
        variable = key
        unprotected.append(get_data(np_mask, arrays[key], year))

    # one column per raster, in the insertion order of `arrays`
    df_np = pd.DataFrame(np.transpose(np.vstack(unprotected)), columns = all_vars + ["change_year"])
    df_np["protected"] = "unprotected"

    print("merging and saving")

    df = df_np
    df["subzone"] = soi
    df["year"] = year

    df.to_csv(save_location, index = False)
    print(time.time() - start)

# `arrays` is deliberately kept: the per-subzone export cell that follows still
# reads it and deletes it once it has finished

SBPS_mk 2015
making data arrays
merging and saving
152.57558822631836


In [10]:
# using the whole sample from the masks, so not an equal sample when put into a csv
# Writes one csv per subzone holding every variable in `arrays` for all cells of
# the protected and unprotected masks. Relies on `year`, `subzones` and `arrays`
# defined by earlier cells; subzones whose csv already exists are skipped.

for soi in subzones:
    start = time.time()
    print(soi, year)
    csv_save = soi + "-" + str(year) + ".csv"
    save_location = os.path.join("..", "data", "all_vars", csv_save)
    
    if not os.path.isfile(save_location):

        print("loading masks")
        # unsampled masks written to the scratch folder, protected first
        mask_locs = [os.path.join(scratch, protected + "-" + soi + ".tif") for protected in ["p", "np"]]
        mask_sum = sum([os.path.isfile(loc) for loc in mask_locs])

        # both files must exist to reuse them
        if mask_sum == 2:
            masks = []

            for i in range(len(mask_locs)):
                with rasterio.open(mask_locs[i]) as rst:
                    masks.append(rst.read(1))
                    
        else:
            print("loading failed")
            print("generating new masks")
            # NOTE(review): equal_sample_p_np returns the equal-size *sampled*
            # masks, not the full masks the comment at the top describes - confirm
            # this fallback is intended
            masks = equal_sample_p_np(soi)
        
        print("making data arrays")
        protected = []
        unprotected = []
        for key in arrays:
            # get_data reads the module-level name `variable`, so it is set
            # before each call
            variable = key
            protected.append(get_data(masks[0], arrays[key], year))
            unprotected.append(get_data(masks[1], arrays[key], year))

        # one column per raster; the column names assume `arrays` was filled in
        # the order all_vars followed by "change_year"
        df_p = pd.DataFrame(np.transpose(np.vstack(protected)), columns = all_vars + ["change_year"])
        df_p["protected"] = "protected"

        df_np = pd.DataFrame(np.transpose(np.vstack(unprotected)), columns = all_vars + ["change_year"])
        df_np["protected"] = "unprotected"
        
        print("merging and saving")

        df = pd.concat([df_p, df_np])
        df["subzone"] = soi
        df["year"] = year

        df.to_csv(save_location, index = False)
        # seconds spent on this subzone
        print(time.time() - start)
        
# release the in-memory rasters once every subzone has been exported
del arrays

BAFA_un 2015
loading masks
making data arrays
merging and saving
974.8207929134369
BAFA_unp 2015
loading masks
making data arrays
merging and saving
214.36320424079895
BG_xh 2015
loading masks
making data arrays
merging and saving
214.95569825172424
BG_xw 2015
loading masks
making data arrays
merging and saving
201.5443480014801
BWBS_dk 2015
loading masks
making data arrays
merging and saving
530.8681397438049
BWBS_mk 2015
loading masks
making data arrays
merging and saving
1305.3336462974548
BWBS_mw 2015
loading masks
making data arrays
merging and saving
532.1236691474915
BWBS_vk 2015
loading masks
making data arrays
merging and saving
193.88935112953186
BWBS_wk 2015
loading masks
making data arrays
merging and saving
285.0574655532837
CDF_mm 2015
loading masks
making data arrays
merging and saving
219.58830499649048
CMA_un 2015
loading masks
making data arrays
merging and saving
374.97145438194275
CMA_unp 2015
loading masks
making data arrays
merging and saving
448.14917397499084
CM