# Land Habitat Classification features

Land Habitat information around buffer areas for each site is gathered. Each site is then labelled according to density-based clustering.

Landcover (Living England), Living England Habitat Map (Phase 4) | Natural England Open Data Geoportal (arcgis.com)
https://naturalengland-defra.opendata.arcgis.com/datasets/Defra::living-england-habitat-map-phase-4/about



In [None]:
%load_ext autoreload
%autoreload 2

# All the variables are defined in the Config file
from model_config import *
from model_packages import *
from model_utils import *

In [None]:
# Move the working directory up one level so that the relative paths used
# later in the notebook (e.g. './outputs/...') resolve from the parent folder.
# NOTE(review): every re-run of this cell moves up one more level.
os.chdir('..')

# Habitat Classification Maps

The open dataset has been manually downloaded from the portal above.

The habitat classification map uses a machine learning approach to image classification, developed under the Defra Living Maps project (SD1705 – Kilcoyne et al., 2017). The method first clusters homogeneous areas of habitat into segments, then assigns each segment to a defined list of habitat classes using Random Forest (a machine learning algorithm). The habitat probability map displays modelled likely broad habitat classifications, trained on field surveys and earth observation data from 2021 as well as historic data layers. This map is an output from Phase IV of the Living England project, with future work in Phase V (2022-23) intending to standardise the methodology and Phase VI (2023-24) to implement the agreed standardised methods.

In [None]:
# These are the steps to save a local file to be used for analysis

# data_loc=data_folder+'NE_LivingEnglandHabitatMapPhase4_SHP_Full/'

# file_loc_shp=[x for x in os.listdir(data_loc+'data') if x.split('.')[1]=='shp']
# read_shp=[gpd.read_file(data_loc+'data/'+x) for x in file_loc_shp]
# pd.concat(read_shp).reset_index(drop=True).to_pickle('data/ne_living_habitat.pkl')

In [None]:
# Load the previously pickled Living England habitat layer.
df_ne_habitat = pd.read_pickle(data_loc_ne_habitat)

# Keep only the rows whose probability for the first predicted habitat type
# ('A_prob') is greater than or equal to the mean 'A_prob' of the whole layer.
mean_a_prob = df_ne_habitat['A_prob'].mean()
is_high_prob = df_ne_habitat['A_prob'] >= mean_a_prob
high_prob_rows = df_ne_habitat[is_high_prob]

# Only the prediction, its probability and the geometry are needed downstream.
df_ne_habitat_hgh_prob = high_prob_rows[['A_pred', 'A_prob', 'geometry']]
df_ne_habitat_hgh_prob = df_ne_habitat_hgh_prob.reset_index(drop=True)

In [None]:
# Build the 5 km buffer geometry around each counter site and clip it to the
# United Kingdom boundary.

# Country outlines; keep only the United Kingdom record.
world = gpd.read_file(world_boundaries)
uk = world[world['name'] == 'U.K. of Great Britain and Northern Ireland']

# Site geometries; keep only the rows describing the 5 km buffer.
sites_df = gpd.read_file(data_folder + 'accessibility.shp')
is_5km_buffer = sites_df['geom_type'] == '5km buffer'
sites_df = sites_df[is_5km_buffer].reset_index(drop=True)

# Untouched copy of the buffers (still carries the 'area' column).
sites_df_all = sites_df.copy()

# Counters present before clipping to the UK outline.
lst_a = sites_df['counter'].unique()

# Drop the 'area' column, then intersect buffers and UK outline in crs_mtr
# (presumably a metre-based projected CRS -- confirm in model_config).
uk_mtr = uk.to_crs(crs_mtr)
sites_df = sites_df.drop(columns=['area']).to_crs(crs_mtr)
sites_df = sites_df.overlay(uk_mtr, how='intersection')

# Counters that still have geometry after clipping.
lst_b = sites_df['counter'].unique()

# Quick visual check: UK outline in light red with the clipped buffers on top.
ax = uk.to_crs(crs_deg).plot(color='r', alpha=0.1)
sites_df.to_crs(crs_deg).plot(ax=ax)

In [None]:
# Intersect the high-probability habitat predictions with the buffer zone
# around each people counter location, both reprojected to crs_mtr.
buffers_mtr = sites_df.to_crs(crs_mtr)
habitat_mtr = df_ne_habitat_hgh_prob.to_crs(crs_mtr)
sites_df_habitat = buffers_mtr.overlay(habitat_mtr, how='intersection')

# Quick visual check of the intersected pieces.
sites_df_habitat.plot()

# Area of every intersected piece, scaled by 10**6.
# NOTE(review): this is sq km only if crs_mtr uses metres -- confirm.
piece_area = sites_df_habitat.geometry.area
sites_df_habitat['area_habitat_sq_km'] = piece_area / 1_000_000

# Save the intersection so it can be reused without recomputing the overlay.
sites_df_habitat.to_pickle(data_folder + 'habitat_cover_area.pkl')

In [None]:
# Collapse the detailed habitat classes into the reduced set in habitat_dict.
# NOTE(review): 'A_pred' is overwritten in place, so re-running this cell maps
# the already-mapped values again; with a plain dict, any class missing from
# habitat_dict becomes NaN.
reduced_habitat = sites_df_habitat['A_pred'].map(habitat_dict)
sites_df_habitat['A_pred'] = reduced_habitat

# Total intersected area for every (counter, habitat type) pair.
grouped_area = sites_df_habitat.groupby(['counter', 'A_pred'])['area_habitat_sq_km']
sites_df_habitat_cover = grouped_area.sum().reset_index()

# The habitat column is called 'primary_habitat' from here on.
sites_df_habitat_cover = sites_df_habitat_cover.rename(
    columns={'A_pred': 'primary_habitat'}
)

In [None]:
# Wide table of the habitat make-up of each buffer zone: one row per counter,
# one column per habitat type, values are areas.
sites_df_habitat_cover_pv = sites_df_habitat_cover.pivot_table(
    values='area_habitat_sq_km',
    index=['counter'],
    columns='primary_habitat',
)

# A habitat type that does not occur in a buffer gets zero area; also drop the
# name of the column axis left behind by the pivot.
sites_df_habitat_cover_pv = (
    sites_df_habitat_cover_pv
    .fillna(0)
    .rename_axis(None, axis=1)
)

# Turn each habitat name into a column label: remove commas, trim, replace
# spaces with underscores and prefix with 'primary_habitat_'.
renamed_columns = []
for habitat_name in sites_df_habitat_cover_pv.columns:
    cleaned_name = habitat_name.replace(",", "").strip().replace(" ", "_")
    renamed_columns.append('primary_habitat_' + cleaned_name)
sites_df_habitat_cover_pv.columns = renamed_columns

# Bring 'counter' back as a column and attach the buffer geometry to each row.
sites_df_habitat_cover_pv = sites_df_habitat_cover_pv.reset_index()
site_geometries = sites_df[['counter', 'geometry']]
sites_df_habitat_cover_pv = site_geometries.merge(
    sites_df_habitat_cover_pv, on=['counter']
)

# Density Based Clustering

In [None]:
# Feature matrix for clustering: all numeric (area) columns of the pivoted
# table. A 'labels' column left over from a previous run of the notebook is
# removed first, so that old cluster ids are never used as a clustering
# feature (the feature-name list built later excludes 'labels' too).
# NOTE(review): assumes 'counter' is not numeric; if it is, it would be picked
# up as a feature here -- confirm.
feature_table = sites_df_habitat_cover_pv.drop(columns=['labels'], errors='ignore')
coordinates = feature_table.select_dtypes(include=np.number).values

# Clustering: we choose cluster_size based on a few experiments
# to reduce the number of sites which are classed as noise (-1).
# Determining cluster size needs to be explored further.

np.random.seed(8)
labels = HDBSCAN(min_cluster_size=9).fit(coordinates).labels_

# Number of sites per cluster label.
counter_labels = collections.Counter(labels)

print(counter_labels)

# Draw convex hulls around sites belonging to a same cluster
hulls = sites_df_habitat_cover_pv[["geometry"]].to_crs(crs_deg).dissolve(by=labels).convex_hull

In [None]:
# Visualise the sites, coloured by the cluster they belong to.

# Reproject once and reuse for both the plot and the basemap CRS.
sites_deg = sites_df_habitat_cover_pv.to_crs(crs_deg)

# Legend entries: the cluster ids in sorted order, taken straight from the
# HDBSCAN output. This does not rely on a 'labels' column already existing on
# the table, and it does not assume a fixed number of clusters.
legend_labels = [str(cluster_id) for cluster_id in np.unique(labels)]

# Set up figure and axis
f, ax = plt.subplots(1, figsize=(9, 9))
# Plot the site geometries
sites_deg.plot(
    # Colour by cluster label
    column=labels,
    # Consider label as categorical
    categorical=True,
    # Almost opaque
    alpha=0.95,
    # Include legend
    legend=True,
    # Draw on axis `ax`
    ax=ax,
    # Use circle as marker
    marker="o",
    # Position legend outside the map
    legend_kwds={"bbox_to_anchor": (1, 1), "labels": legend_labels},
)
# Plot convex hull polygons for each cluster label
# except that for -1 (observations classified as noise)
# hulls[hulls.index != -1].boundary.plot(color="k", ax=ax)
# Add basemap
contextily.add_basemap(
    ax,
    crs=sites_deg.crs.to_string(),
    source=contextily.providers.CartoDB.Positron,
)
# Remove axes
ax.set_axis_off()

plt.savefig("./outputs/habitat_class_map.png", format='png', dpi=300, bbox_inches='tight')

In [None]:
# Attach the cluster label of each site to the pivoted table.
sites_df_habitat_cover_pv['labels'] = labels

# Names of the habitat-area feature columns: every numeric column except the
# cluster label itself.
numeric_columns = sites_df_habitat_cover_pv.select_dtypes(include=np.number).columns
ftrs_names_habitat = [name for name in numeric_columns if name != 'labels']

# Visualisation

In [None]:
# Habitat area make-up of each people counter location.

# Sum the habitat areas per counter, then reshape to long format with one row
# per (habitat, counter) pair.
area_by_counter = sites_df_habitat_cover_pv.groupby(['counter'])[ftrs_names_habitat].sum()
df = area_by_counter.unstack().reset_index().sort_values(by='counter')
df = df.rename(columns={'level_0': 'habitat', 0: 'area_sq_km'})

# Back to wide format: counters as rows, habitats as columns.
df2 = (
    df.groupby(["habitat", "counter"])
    .mean()
    .unstack("habitat")
    .fillna(0)
)

# Stacked horizontal bars for a random sample of 25 counters.
df2['area_sq_km'].sample(25).plot.barh(
    stacked=True,
    colormap='Paired',
    figsize=(15, 10),
)

plt.xlabel('area_sq_km')

In [None]:
# Habitat area make-up of each cluster: this helps to identify a
# 'physical label' for each cluster.

# Mean habitat area per cluster, reshaped to long format with one row per
# (habitat, cluster) pair.
mean_area_by_cluster = sites_df_habitat_cover_pv.groupby(['labels'])[ftrs_names_habitat].mean()
df = mean_area_by_cluster.unstack().reset_index().sort_values(by='labels')
df = df.rename(columns={'level_0': 'habitat', 0: 'area_sq_km'})

# Wide format: clusters as rows, habitats as columns.
df2 = (
    df.groupby(["habitat", "labels"])
    .mean()
    .unstack("habitat")
    .fillna(0)
)

# Side-by-side horizontal bars, one group per cluster.
df2['area_sq_km'].plot.barh(
    stacked=False,
    colormap='Paired',
    figsize=(15, 5),
)

plt.xlabel('area_sq_km')

# Box plot of the per-habitat mean areas, one box per HDBSCAN cluster.
ftrs = "area_sq_km"
f = plt.figure(figsize=(10, 5))
# Draw on the current axis (inside `f`); outliers are left visible.
ax = df.boxplot(column=ftrs, by='labels', ax=plt.gca())

# Axis labels
ax.set_xlabel("HDBSCAN cluster (labels)")
ax.set_ylabel(ftrs)

# Remove the default figure title and the default axis title
plt.gcf().suptitle(None)
ax.set_title(None)

In [None]:
# Descriptive name for each HDBSCAN label (-1 is the noise label).
# NOTE(review): any label other than -1, 0 or 1 becomes NaN after mapping,
# and re-running this cell maps the already-named labels to NaN as well.
habt_clstr_map = {
    -1: 'Grassland_woodland_wetland',
    0: 'Grassland_woodland_coastal',
    1: 'Grassland_woodland_bareground',
}

named_labels = sites_df_habitat_cover_pv['labels'].map(habt_clstr_map)
sites_df_habitat_cover_pv['labels'] = named_labels

In [None]:
# Visualisation

# Habitat area make-up of each cluster, now shown with the 'physical label'
# assigned to each cluster.

# Mean habitat area per named cluster, reshaped to long format.
mean_area_by_cluster = sites_df_habitat_cover_pv.groupby(['labels'])[ftrs_names_habitat].mean()
df = mean_area_by_cluster.unstack().reset_index().sort_values(by='labels')
df = df.rename(columns={'level_0': 'habitat', 0: 'area_sq_km'})

# Wide format: clusters as rows, habitats as columns.
df2 = (
    df.groupby(["habitat", "labels"])
    .mean()
    .unstack("habitat")
    .fillna(0)
)

# Stacked horizontal bars, one bar per named cluster.
df2['area_sq_km'].plot.barh(
    stacked=True,
    colormap='Paired',
    figsize=(10, 5),
)

# Axis labels, tick size, fixed x-range and legend.
plt.ylabel('Land Habitat Classification', fontsize=16)
plt.yticks(fontsize=14)
plt.xlabel('Area in Km$^2$', fontsize=16)
plt.xlim(0, 75)
plt.legend(
    fontsize=14,
    title='Habitat Type',
    title_fontsize='large',
    loc='upper right',
)

plt.savefig("./outputs/habitat_class_.png", format='png', dpi=300, bbox_inches='tight')


In [None]:
# Box plot of 'area_sq_km' from `df`, one box per named cluster.
ftrs = "area_sq_km"
f = plt.figure(figsize=(10, 5))

# Draw on the current axis (inside `f`); outliers are left visible.
ax = df.boxplot(column=ftrs, by='labels', ax=plt.gca())

# Axis labels
ax.set_xlabel("HDBSCAN cluster (labels)")
ax.set_ylabel(ftrs)

# Remove the default figure title and the default axis title
plt.gcf().suptitle(None)
ax.set_title(None)

In [None]:
# Save the per-site table (geometry, habitat areas and cluster label) to a
# pickle file in data_folder.
sites_df_habitat_cover_pv.to_pickle(data_folder+'land_habitat_clusters.pkl')