In [1]:
# geospatial
import geopandas as gpd
import rasterio
from rasterio import features

# base
import pandas as pd
import numpy as np
import os, itertools, time
#from multiprocessing import Pool, cpu_count

# iteration prints
from IPython.display import clear_output

In [2]:
# folder locations
scratch = os.path.join("D:\\", "scratch")
shapefiles = os.path.join("..", "data", "shapefiles")

merged = os.path.join("D:\\", "Merged")
h_mask_loc = os.path.join(merged, "harvest_mask", "harvest_mask.tif")
masked_structure_locs = os.path.join(merged, "masked_structure")
bec_mask_loc = os.path.join(merged, "bec_masks")

# variables
structure_vars = os.listdir(masked_structure_locs)
elev_vars = ["elev", "slope"]
all_vars  = elev_vars + structure_vars + ["vlce", "Change_Attribution"]
all_vars

['elev',
 'slope',
 'elev_cv',
 'loreys_height',
 'percentage_first_returns_above_2m',
 'total_biomass',
 'vlce',
 'Change_Attribution']

In [3]:
structure_key = pd.read_csv(os.path.join("..", "keys", "continuous.csv"))

In [4]:
ppa_bec = gpd.read_file(os.path.join(shapefiles, "bc_ppa_bec_hres.shp"))

In [5]:
# some generic cleaning of the gdf
# calculate area for each polygon
ppa_bec["Shap_Ar"] = ppa_bec.area

# assign a protected column to the df
# this lambda thing is like a local function apparently? im not entirely sure whats going on
# but it works
ppa_bec = ppa_bec.assign(protected = lambda dataframe: dataframe["NAME_E"].map(lambda NAME_E: True if NAME_E else False))

# create a unique value for zone/subzones
ppa_bec["szs"] = ppa_bec["zone"] + "_" + ppa_bec["subzone"]

# filter out any without bec zones (generally sliver polygons)
ppa_bec = ppa_bec[ppa_bec["szs"].notnull()]

ppa_bec = ppa_bec[["szs", "geometry", "zone", "subzone", "protected", "Shap_Ar"]]
ppa_bec["raster"] = 1

In [6]:
subzones = sorted(list(set(ppa_bec["szs"])))
subzones.remove("SBPS_mk") # doesnt have a protected area, can't compare
# but useful for some other stuff so processed indiviudally later

In [7]:
def rasterize_polygon_mask(gpd, out_loc, meta):
    # rasterizes a polygon. requires a field in gpd named 'raster'
    # masks out harvested pixels. requires
    
    with rasterio.open(out_loc, "w+", **meta) as out:
        out_arr = out.read(1)

        shapes = ((geom,value) for geom, value in zip(gpd.geometry, gpd.raster))

        burned = features.rasterize(shapes = shapes, fill = 0, out = out_arr, transform = out.transform)

        # remove this to prevent harvest from being masked out
        # burned = burned * h_mask_data
        
        out.write_band(1, burned)
        
        return burned

def sample_raster(mask, save_loc, big, small):
    
    sampled_loc = save_loc[:-4] + "-sampled" + save_loc[-4:]
    
    sampled_loc = os.path.split(sampled_loc)[1]
    
    sampled_loc = os.path.join(bec_mask_loc, sampled_loc)
    
    
    # find valid indexes using mask, then randomly select from them until
    # an equal number to the smaller (p/np) proportion of the bec zone
    # is found
    indexes = np.where(mask == 1)
    good_indexes = np.random.choice(np.arange(0, big), size = small, replace = False)
    
    x = indexes[0][good_indexes]
    y = indexes[1][good_indexes]

    # generate blank raster, and make new mask based on this by taking
    # xy coords and changing them to 1
    blank = np.zeros(mask.shape, dtype = "uint8")
    blank[x, y] = 1

    # save with the sampled suffix, so it can be used without 
    # reprocessing
    with rasterio.open(sampled_loc, "w+", **meta) as out:

        out.write_band(1, blank)
    
    return blank

def equal_sample_p_np(soi):
    np.random.seed(69420)
    
    # filter ppa for the subzone of interest
    subzone_filter = ppa_bec[ppa_bec["szs"] == soi]
    
    # split into protected/non protected shapefiles and save them (not sure its necessary to save)
    # selected protected shapes
    # removes invalid geometries
    p_sub = subzone_filter[subzone_filter["protected"] == True]
    p_sub_clean = p_sub[(p_sub.geometry.type == "Polygon") | (p_sub.geometry.type == "MultiPolygon")]
    p_sub_clean = p_sub_clean.dissolve(by = "szs")

    # select unprotected shapes
    np_sub = subzone_filter[subzone_filter["protected"] == False]
    # removes invalid geometries
    np_sub_clean = np_sub[(np_sub.geometry.type == "Polygon") | (np_sub.geometry.type == "MultiPolygon")]
    np_sub_clean = np_sub_clean.dissolve(by = "szs")
    
    # save locations
    p_raster_loc = os.path.join(scratch, "p-" + soi + ".tif")
    np_raster_loc = os.path.join(scratch, "np-" + soi + ".tif")
    
    # generate masks from the polygons
    p_mask = rasterize_polygon_mask(p_sub_clean, p_raster_loc, meta)
    np_mask = rasterize_polygon_mask(np_sub_clean, np_raster_loc, meta)
    
    # this is where the forest mask would come in
    # unless i do it in R?

    p_pixels = np.sum(p_mask)
    np_pixels = np.sum(np_mask) 
    
    if p_pixels < np_pixels:
        np_sampled = sample_raster(np_mask, np_raster_loc, np_pixels, p_pixels)
        p_sampled = p_mask
        
        unsampled_loc = os.path.join(bec_mask_loc, "p-" + soi + "-sampled.tif")
        
        with rasterio.open(unsampled_loc, "w+", **meta) as out:
            out.write_band(1, p_sampled)
    
    else:
        p_sampled = sample_raster(p_mask, p_raster_loc, p_pixels, np_pixels)
        np_sampled = np_mask
        
        unsampled_loc = os.path.join(bec_mask_loc, "np-" + soi + "-sampled.tif")
        
        with rasterio.open(unsampled_loc, "w+", **meta) as out:
            out.write_band(1, np_sampled)
            
    return p_sampled, np_sampled

def load_data(variable, year):
    if variable in structure_vars:
        file_loc = os.path.join(merged, variable, "BC-" + variable + "-" + str(year) + ".tif")
        
    elif variable == "Change_Attribution" or variable == "Greatest_Change_Year":
        file_loc = os.path.join(merged, variable, "BC-" + variable + ".tif") 
        
    elif variable == "vlce":
        file_loc = os.path.join(merged, variable, "BC-" + variable + "-" + str(year) + ".tif")
        
    elif variable in elev_vars:
        file_loc = os.path.join(merged, "elevation", "BC-" + variable + ".tif")
        
    else:
        print("Not a valid variable")
        return
    
    #print(file_loc)
    
    with rasterio.open(file_loc) as rst:
        rst_data = rst.read(1)
        
    return rst_data

def get_data(mask, in_data, year):
        
    indexes = np.where(mask == 1)
    
    data_arr = in_data[indexes[0], indexes[1]]
    
    if variable in structure_vars:
        divisor = structure_key[structure_key["variable"] == variable].iloc[0]["divide_by"]
        #print(divisor)
        data_arr = data_arr / divisor
    
    return data_arr

1 min per variable-year-soi
5 variables
128 soi
1 year
10 hours

35 years
15 days


should i be including topographic variables?

# New df method

This is where all the data is loaded in, it kind of works as a raster stack that is then transformed into a DF

In [8]:
# generate latitude raster
# get x,y dimensions, ymin (bottom), and pixel size
# make a raster of equal size
# where each cell is equal to its row.
# then multiply by pixel size and add ymin
start_time = time.time()
raster_loc = os.path.join("D:\\", "Merged", "Change_Attribution", "BC-Change_Attribution.tif")

# load data and get bounds
with rasterio.open(raster_loc) as rst:
    data = rst.read(1)
    bounds = rst.bounds
    
lat_min = bounds[1]
x, y = data.shape
an_array = np.array(np.flip(np.arange(0, x)))

repeats_array = np.transpose([an_array] * y)

latitude = repeats_array
#latitude = (repeats_array * 30) + lat_min
# going to do this in R (post)
# i think the world isnt a fan of floats

# clean up
del data
del bounds
del repeats_array
del an_array
del lat_min
print("latitude loaded", time.time() - start_time)

latitude loaded 19.10013437271118


In [9]:
year = 2015

arrays = {}
start = time.time()
for variable in all_vars:
    mid = time.time()
    arrays[variable] = load_data(variable, year)
    print(variable, time.time() - mid)

change_year = load_data("Greatest_Change_Year", year)

arrays["change_year"] = change_year
arrays["latitude"] = latitude
del change_year
del latitude
end = time.time()
print(end - start)

elev 46.30174803733826
slope 57.76206064224243
elev_cv 29.236483097076416
loreys_height 31.074626207351685
percentage_first_returns_above_2m 32.356269121170044
total_biomass 33.50341033935547
vlce 17.804847478866577
Change_Attribution 4.043978691101074
281.74160289764404


In [10]:
for keys in arrays:
    print(keys)
    print(arrays[keys].shape)

elev
(45599, 53242)
slope
(45599, 53242)
elev_cv
(45598, 53241)
loreys_height
(45598, 53241)
percentage_first_returns_above_2m
(45598, 53241)
total_biomass
(45598, 53241)
vlce
(45598, 53241)
Change_Attribution
(45598, 53241)
change_year
(45598, 53241)
latitude
(45598, 53241)


In [11]:
# for the single subzone with no protected area. needs to be included in the BEC proportions figure
# only one or i would solve in functions

soi = "SBPS_mk"
year = 2015

start = time.time()
print(soi, year)

h_mask = rasterio.open(h_mask_loc)
h_mask_data = h_mask.read(1)
meta = h_mask.meta.copy()
meta["nodata"] = 0
del h_mask
del h_mask_data


csv_save = soi + "-" + str(year) + ".csv"
save_location = os.path.join("..", "data", "all_vars", csv_save)

if not os.path.isfile(save_location):
    
    np.random.seed(69420)
    
    # filter ppa for the subzone of interest
    subzone_filter = ppa_bec[ppa_bec["szs"] == soi]
    
    # select unprotected shapes
    np_sub = subzone_filter[subzone_filter["protected"] == False]
    # removes invalid geometries
    np_sub_clean = np_sub[(np_sub.geometry.type == "Polygon") | (np_sub.geometry.type == "MultiPolygon")]
    np_sub_clean = np_sub_clean.dissolve(by = "szs")
    
    # save locations
    np_raster_loc = os.path.join(scratch, "np-" + soi + ".tif")
    
    np_mask = rasterize_polygon_mask(np_sub_clean, np_raster_loc, meta)

    print("making data arrays")
    #protected = []
    unprotected = []
    for key in arrays:
        variable = key
        #protected.append(get_data(masks[0], arrays[key], year))
        unprotected.append(get_data(np_mask, arrays[key], year))

    #df_p = pd.DataFrame(np.transpose(np.vstack(protected)), columns = all_vars + ["change_year"])
    #df_p["protected"] = "protected"

    df_np = pd.DataFrame(np.transpose(np.vstack(unprotected)), columns = list(arrays.keys()))
    df_np["protected"] = "unprotected"

    print("merging and saving")

    df = df_np
    df["subzone"] = soi
    df["year"] = year

    df.to_csv(save_location, index = False)
print(time.time() - start)
        
#del arrays

SBPS_mk 2015
making data arrays
merging and saving
144.36425971984863


In [None]:
# using the whole sample from the masks, so not an equal sample when put into a csv

for soi in subzones:
    start = time.time()
    print(soi, year)
    csv_save = soi + "-" + str(year) + ".csv"
    save_location = os.path.join("..", "data", "all_vars", csv_save)
    
    if not os.path.isfile(save_location):

        print("loading masks")
        mask_locs = [os.path.join(scratch, protected + "-" + soi + ".tif") for protected in ["p", "np"]]
        mask_sum = sum([os.path.isfile(loc) for loc in mask_locs])

        if mask_sum == 2:
            masks = []

            for i in range(len(mask_locs)):
                with rasterio.open(mask_locs[i]) as rst:
                    masks.append(rst.read(1))
                    
        else:
            print("loading failed")
            print("generating new masks")
            masks = equal_sample_p_np(soi)
        
        print("making data arrays")
        protected = []
        unprotected = []
        for key in arrays:
            
            key_time = time.time()
            variable = key
            print(variable)
            protected.append(get_data(masks[0], arrays[key], year))
            unprotected.append(get_data(masks[1], arrays[key], year))
            print(time.time() - key_time)
            
        df_p = pd.DataFrame(np.transpose(np.vstack(protected)), columns = list(arrays.keys()))
        df_p["protected"] = "protected"

        df_np = pd.DataFrame(np.transpose(np.vstack(unprotected)), columns = list(arrays.keys()))
        df_np["protected"] = "unprotected"
        
        print("making dfs took", time.time() - start)
        print("merging and saving")

        df = pd.concat([df_p, df_np])
        df["subzone"] = soi
        df["year"] = year
        
        df[["vlce", "Change_Attribution", "change_year", "latitude"]] = df[["vlce", "Change_Attribution", "change_year", "latitude"]].apply(pd.to_numeric, downcast = "integer")
        
        save_time = time.time()
        df.to_csv(save_location, index = False)
        
        print("save time", time.time() - save_time)
        
    print(time.time() - start)
    print()
    #clear_output(wait = True)
        
del arrays

BAFA_un 2015
loading masks
making data arrays
elev
18.518768787384033
slope
18.4835364818573
elev_cv
19.096954107284546
loreys_height
19.73732352256775
percentage_first_returns_above_2m
20.465655088424683
total_biomass
20.890056848526
vlce
19.866174936294556
Change_Attribution
19.887165069580078
change_year
19.238274812698364
latitude
20.16164469718933
making dfs took 223.45403742790222
merging and saving
save time 520.6137571334839
806.0054888725281

BAFA_unp 2015
loading masks
making data arrays
elev
15.140332460403442
slope
15.03517484664917
elev_cv
14.824721813201904
loreys_height
14.724886894226074
percentage_first_returns_above_2m
14.660131216049194
total_biomass
14.686352014541626
vlce
14.744583368301392
Change_Attribution
15.07864785194397
change_year
14.757972478866577
latitude
15.15526795387268
making dfs took 171.08597373962402
merging and saving
save time 12.62341570854187
185.77631855010986

BG_xh 2015
loading masks
making data arrays
elev
14.818664073944092
slope
14.70908