# Land Type Classification


The 2011 rural-urban classification provides a rural/urban view of datasets at output area (OA), super output area (SOA) and ward level. For each counter site, we compute the area covered by each rural-urban classification label within a 5 km buffer zone around the site. We then use density-based clustering on these areas to assign each site to a cluster.

https://www.ons.gov.uk/methodology/geography/geographicalproducts/ruralurbanclassifications/2011ruralurbanclassification

In [None]:
%load_ext autoreload
%autoreload 2

# All the variables are defined in the Config file
from model_config import *
from model_packages import *
from model_utils import *

In [None]:
# Move the working directory up one level, so the relative paths used further
# down (e.g. ./outputs/...) are resolved against the parent folder.
# NOTE(review): re-running this cell climbs one more level each time.
os.chdir('..')

In [None]:
# Load the dataset prepared in the Prepare_Census_features notebook.
# Each counter site has multiple Output Areas falling within its buffer zone.
dataset = pd.read_pickle(census_locn_file)

num_ftrs = 'area_sq_km'

# Total area per (counter, rural-urban class) pair
dataset = dataset.groupby(['counter', 'urban_rural'])[num_ftrs].sum().reset_index()

# Geometries of the sites. os.path.join works whether or not data_folder
# ends with a path separator (plain string concatenation only works if it does).
locations_buffer = gpd.read_file(os.path.join(data_folder, 'accessibility.shp'))

# Keep only the 5km buffer geometries and reproject them to crs_deg
locations_buffer = locations_buffer[
    locations_buffer['geom_type'] == '5km buffer'
].reset_index(drop=True)
locations_buffer = locations_buffer.to_crs(crs_deg)

# Attach the buffer geometry to every (counter, class) row; the inner join
# keeps only counters present in both tables. merge() does not modify
# `dataset`, so no defensive copy is needed.
sites_profile = locations_buffer[['counter', 'geometry']].merge(
    dataset, on=['counter'], how='inner'
)

# Represent each site by the centroid of its buffer.
# NOTE(review): if crs_deg is a geographic (degree-based) CRS, geopandas warns
# that centroids may be inaccurate — confirm this is acceptable here.
sites_profile['geometry'] = sites_profile['geometry'].centroid

sites_profile = sites_profile[['counter', 'geometry', 'urban_rural', 'area_sq_km']]

# Preview a few rows; the sample size is capped so small datasets do not raise
print(sites_profile.sample(min(5, len(sites_profile))))

In [None]:
# Reshape to wide format: one row per counter, one column per rural-urban
# class, holding the area in km^2 (pivot_table aggregates with its default
# function when a cell has several values).
sites_profile_pv = sites_profile.pivot_table(
    values='area_sq_km',
    index=['counter'],
    columns='urban_rural',
)

# Bring 'counter' back as an ordinary column
sites_profile_pv = sites_profile_pv.reset_index(drop=False)

colm_nams = sites_profile_pv.columns

# A class that does not occur around a site has no entry -> treat as 0 area
sites_profile_pv = sites_profile_pv.reindex(columns=colm_nams).fillna(0)

# Remove the name the pivot leaves on the column axis
sites_profile_pv = sites_profile_pv.rename_axis(None, axis=1)

print(sites_profile_pv.sample(5))

# Tag every row of the wide table with the site's geometry, then reduce
# that geometry to its centroid
sites_profile = locations_buffer[['counter', 'geometry']].merge(
    sites_profile_pv.copy(),
    on=['counter'],
    how='inner',
)
sites_profile['geometry'] = sites_profile['geometry'].centroid

In [None]:
# Feature matrix for clustering: every numeric column of sites_profile.
# NOTE(review): this also picks up `counter` if it is stored as a number —
# confirm that only the area columns end up in the matrix.
coordinates = sites_profile.select_dtypes(include=np.number).to_numpy()

# Density based clustering on the area features gives one label per site.
# min_cluster_size was picked after a few experiments to keep the number of
# sites classed as noise (-1) low; the choice needs further exploration.
np.random.seed(8)
clusterer = HDBSCAN(min_cluster_size=7)
clusterer.fit(coordinates)
labels = clusterer.labels_

# Number of sites per cluster label
counter = collections.Counter(labels)
print(counter)

# Convex hull around the sites that share a cluster label
site_points = sites_profile[["geometry"]].to_crs(crs_deg)
hulls = site_points.dissolve(by=labels).convex_hull

In [None]:
# First cluster id in order of appearance. Read from the `labels` array
# because sites_profile has no 'labels' column yet at this point of a
# top-to-bottom run (the column is assigned in a later cell).
pd.unique(labels)[0]

In [None]:
# Map of the counter sites, colour coded by HDBSCAN cluster

# Set up figure and axis
f, ax = plt.subplots(1, figsize=(9, 9))

# Reproject once and reuse for both the points and the basemap
sites_deg = sites_profile.to_crs(crs_deg)

# Legend text, one entry per cluster in sorted cluster-id order, which is the
# order geopandas uses for the entries of a categorical legend. Building it
# from the `labels` array avoids reading sites_profile['labels'], which does
# not exist yet when the notebook is run top to bottom, and avoids relying on
# the order in which clusters happen to appear in the data.
# The names mirror the mapping applied to the labels later in this notebook;
# ids without a name fall back to the id itself.
cluster_names = {
    -1: 'rural_mixed_settings',
    0: 'urban_settings',
    1: 'rural_settings',
}
legend_labels = [
    cluster_names.get(cluster_id, str(cluster_id))
    for cluster_id in np.unique(labels)
]

# Plot the individual counter sites
sites_deg.plot(
    # Colour by cluster label
    column=labels,
    # Treat the label as categorical
    categorical=True,
    # Nearly opaque markers
    alpha=0.95,
    # Include legend
    legend=True,
    # Draw on axis `ax`
    ax=ax,
    # Use circle as marker
    marker="o",
    # Position legend outside the map
    legend_kwds={"bbox_to_anchor": (1, 1), 'labels': legend_labels},
)
# Optional: outline each cluster (except noise, -1) with its convex hull
# hulls[hulls.index != -1].boundary.plot(color="k", ax=ax)

# Add basemap
contextily.add_basemap(
    ax,
    crs=sites_deg.crs.to_string(),
    source=contextily.providers.CartoDB.Positron,
)
# Remove axes
ax.set_axis_off()

f.savefig("./outputs/ru_class_map.png", format='png', dpi=300, bbox_inches='tight')

In [None]:
# Attach the cluster label found by HDBSCAN to every site
sites_profile['labels'] = labels

# Numeric columns other than the cluster label itself
numeric_cols = sites_profile.select_dtypes(include=np.number).columns
ftrs_names_ur_rural = [col for col in numeric_cols if col != 'labels']

# Mean area of each rural-urban class within each cluster, in long format:
# one row per (class, cluster) pair
cluster_means = sites_profile.groupby(['labels'])[ftrs_names_ur_rural].mean()
df = cluster_means.unstack().reset_index().sort_values(by='labels')
df = df.rename(columns={'level_0': 'urban_rural', 0: 'area_sq_km'})

# Back to wide format (clusters as rows, classes as columns) for plotting
df2 = (
    df.groupby(["urban_rural", "labels"])
    .sum()
    .unstack("urban_rural")
    .fillna(0)
)

# One stacked horizontal bar per cluster
df2['area_sq_km'].plot.barh(stacked=True, colormap='Paired', figsize=(15, 10))

plt.xlabel('area_sq_km')



In [None]:
# Box plot of the per-class mean areas held in `df`, one box per cluster
ftrs = "area_sq_km"
f = plt.figure(figsize=(10, 5))

ax = df.boxplot(
    # Values whose distribution is drawn
    column=ftrs,
    # One box per cluster label
    by='labels',
    # Draw on the axis of the figure created above
    ax=f.gca(),
)

# Axis captions
ax.set_xlabel("HDBSCAN cluster (labels)")
ax.set_ylabel(ftrs)

# Drop the titles pandas adds by default (figure level and axis level)
f.suptitle(None)
ax.set_title(None)

In [None]:
# Name each cluster after the area type with the largest area in it.
# NOTE(review): an earlier comment here said "cluster 1 is mostly urban", but
# the mapping below names cluster 0 urban and cluster 1 rural — confirm the
# mapping against the bar chart of the current clustering run.
urbn_url_clstr_map = {
    -1: 'rural_mixed_settings',
    0: 'urban_settings',
    1: 'rural_settings',
}

# Values without an entry in the mapping are kept as they are. This makes the
# cell safe to re-run (already renamed labels stay unchanged instead of
# becoming NaN) and keeps unexpected extra cluster ids visible.
sites_profile['labels'] = sites_profile['labels'].map(
    lambda cluster_id: urbn_url_clstr_map.get(cluster_id, cluster_id)
)

# os.path.join works whether or not data_folder ends with a path separator
sites_profile.to_pickle(os.path.join(data_folder, 'rural_urban_clusters.pkl'))

In [None]:
# Area make up around each people counter site (within a buffer zone of 5km)

# Area per rural-urban class for every counter, in long format
site_totals = sites_profile.groupby(['counter'])[ftrs_names_ur_rural].sum()
df = site_totals.unstack().reset_index().sort_values(by='counter')

df.rename(columns={'level_0': 'land_type', 0: 'area_sq_km'}, inplace=True)

# Wide format for plotting: counters as rows, land types as columns
df2 = df.groupby(["land_type", "counter"]).sum().unstack("land_type").fillna(0)

# Plot a random sample of at most 25 sites; the cap avoids the ValueError
# that sample(25) raises when fewer than 25 sites are available
area_by_site = df2['area_sq_km']
n_sample = min(25, len(area_by_site))
area_by_site.sample(n_sample).plot.barh(
    stacked=True, colormap='Paired', figsize=(15, 10)
)

plt.xlabel('area_sq_km')

In [None]:
# Mean area of each rural-urban class within each cluster, now that the
# clusters carry their interpretable names

# Long format: one row per (land type, cluster) pair
cluster_means = sites_profile.groupby(['labels'])[ftrs_names_ur_rural].mean()
df = cluster_means.unstack().reset_index().sort_values(by='labels')
df = df.rename(columns={'level_0': 'land_type', 0: 'area_sq_km'})

# Wide format for plotting: clusters as rows, land types as columns
df2 = (
    df.groupby(["land_type", "labels"])
    .sum()
    .unstack("land_type")
    .fillna(0)
)

# One stacked horizontal bar per cluster
df2['area_sq_km'].plot.barh(stacked=True, colormap='Paired', figsize=(15, 10))

# Axis captions, tick size and fixed horizontal range
plt.xlabel('Area in Km$^2$', fontsize=16)
plt.ylabel('Rural-Urban Classification', fontsize=16)
plt.yticks(fontsize=16)
plt.xlim(0, 175)
plt.legend(fontsize=16, title='Land Type', title_fontsize='large', loc='upper right')

plt.savefig(f"./outputs/ru_class.png", format='png', dpi=300, bbox_inches='tight')

In [None]:
# Box plot of the per-land-type mean areas held in `df`, one box per cluster
ftrs = "area_sq_km"
f = plt.figure(figsize=(8, 3))

ax = df.boxplot(
    # Values whose distribution is drawn
    column=ftrs,
    # One box per cluster label
    by='labels',
    # Draw on the axis of the figure created above
    ax=f.gca(),
)

# Axis captions
ax.set_xlabel("HDBSCAN cluster")
ax.set_ylabel(ftrs)

# Drop the titles pandas adds by default (figure level and axis level)
f.suptitle(None)
ax.set_title(None)