This notebook retrieves relevant features from the 2011 Census at Output Area level. The 5km buffer zones surrounding each people counter location are then intersected with the census data to create a data set relevant to each location. Finally, these data sets are compiled into a single data set containing all census features.
All the data files have been downloaded from Nomis (https://www.nomisweb.co.uk).

In [None]:
%load_ext autoreload
%autoreload 2

# All the variables are defined in the Config file
from model_config import *
from model_utils import *

In [None]:
# Move the working directory up one level (presumably so the paths from the
# config resolve from the project root - confirm).
# NOTE: the move is relative to the current directory, so re-running this
# cell climbs one more level each time.
os.chdir(os.pardir)

# Retrieve Output Area shapefiles

In [None]:
# Output Area boundary shapefile.
# The path is assembled with os.path.join so that it resolves to the same
# location whether or not census_locn_file_data ends with a path separator
# (every other read in this notebook appends the file name directly).
df_oa_shap_files = gpd.read_file(
    os.path.join(census_locn_file_data, 'infuse_oa_lyr_2011')
)

# Keep only the Output Areas whose code starts with 'e' (case-insensitive);
# presumably these are the English areas - confirm against the code scheme.
is_e_code = df_oa_shap_files['geo_code'].str.lower().str.startswith('e')
df_oa_shap_files = df_oa_shap_files[is_e_code].reset_index(drop=True)

# Only the area code and its geometry are needed. The code column is renamed
# to '2011 output area', the key shared with the census tables. rename()
# is assigned back rather than applied in place on the column subset, which
# avoids modifying a potential copy.
df_oa_shap_files = df_oa_shap_files[['geo_code', 'geometry']].rename(
    columns={'geo_code': '2011 output area'}
)

# Retrieve Urban-Rural Classification of each Output Area

In [None]:
# Urban-rural classification lookup at Output Area level.
# Rows are restricted to codes starting with 'e' (case-insensitive), only the
# code and classification columns are kept, and both are renamed to the names
# used in the rest of the notebook.
ruc_lookup_path = census_locn_file_data+'RUC11_OA11_EW.csv'

df_urbn_rurl_oa = (
    pd.read_csv(ruc_lookup_path)
    .loc[
        lambda tbl: tbl['OA11CD'].str.lower().str.startswith('e'),
        ['OA11CD', 'RUC11'],
    ]
    .reset_index(drop=True)
    .rename(columns={'OA11CD': '2011 output area', 'RUC11': 'urban_rural'})
)

In [None]:
# Attach the urban/rural class to every Output Area geometry. The inner join
# keeps only the areas that appear in both tables.
df_oa_shap_files = df_oa_shap_files.merge(
    df_urbn_rurl_oa,
    how='inner',
    on='2011 output area',
)

# Persist the classified boundaries to disk.
urban_rural_pkl = data_folder+'urban_rural_oa.pkl'
df_oa_shap_files.to_pickle(urban_rural_pkl)

# Retrieve census features

## 1. Household occupancy 

['1 person in household', '2 people in household', '3 people in household', '4 people in household', '5 people in household', '6 people in household', '7 people in household', '8 or more people in household']

In [None]:
# Household occupancy table, indexed by Output Area code.
df_hh_oa = (
    pd.read_csv(census_locn_file_data+'household_occupancy.csv')
    .set_index('2011 output area')
)

# Every remaining column is treated as a household-occupancy feature.
hh_ftrs = df_hh_oa.columns.tolist()
print(hh_ftrs)

## 2. Age groups

['Age 0 to 4', 'Age 5 to 7', 'Age 8 to 9', 'Age 10 to 14', 'Age 15',
       'Age 16 to 17', 'Age 18 to 19', 'Age 20 to 24', 'Age 25 to 29',
       'Age 30 to 44', 'Age 45 to 59', 'Age 60 to 64', 'Age 65 to 74',
       'Age 75 to 84', 'Age 85 to 89', 'Age 90 and over']

In [None]:
# Age-group table, indexed by Output Area code.
age_groups_csv = census_locn_file_data+'age_groups.csv'
df_age_oa = pd.read_csv(age_groups_csv).set_index('2011 output area')

# The feature names are kept as a pandas Index here (not converted to a list).
age_ftrs = df_age_oa.columns
print(age_ftrs)

## 3. Deprivation by households

['Household is not deprived in any dimension', 'Household is deprived in 1 dimension', 'Household is deprived in 2 dimensions', 'Household is deprived in 3 dimensions', 'Household is deprived in 4 dimensions']

In [None]:
# Household deprivation table, indexed by Output Area code. All columns left
# after indexing are features, so the whole frame is cast to float in one go.
df_dep_oa = (
    pd.read_csv(census_locn_file_data+'deprivation_dimension.csv')
    .set_index('2011 output area')
    .astype(float)
)

dep_ftrs = df_dep_oa.columns.tolist()
print(dep_ftrs)

## 4. Population density

['All usual residents', 'Schoolchild or full-time student aged 4 and over at their non term-time address', 'Area (Hectares)', 'Density (number of persons per hectare)']

In [None]:
# Population density table, indexed by Output Area code.
population_density_csv = census_locn_file_data+'population_density.csv'
df_res_oa = pd.read_csv(population_density_csv).set_index('2011 output area')

# Cast every feature column to float via an explicit column -> dtype mapping.
res_ftrs = df_res_oa.columns.tolist()
df_res_oa = df_res_oa.astype(dict.fromkeys(res_ftrs, float))

print(res_ftrs)

## 5. Working population

['Economically active', 'Economically Inactive',
       'Unemployed: Age 16 to 24', 'Unemployed: Age 50 to 74',
       'Unemployed: Never worked', 'Long-term unemployed']

In [None]:
# Working population table, indexed by Output Area code.
df_wrk_oa = (
    pd.read_csv(census_locn_file_data+'working_population.csv')
    .set_index('2011 output area')
)

# The feature names are kept as a pandas Index here (not converted to a list).
wrk_ftrs = df_wrk_oa.columns
print(wrk_ftrs)

## 6. Population Health

['Very good health', 'Good health', 'Fair health', 'Bad health', 'Very bad health']

In [None]:
# Population health table, indexed by Output Area code and stored as float.
# Every non-index column is a feature, so the cast covers the whole frame.
population_health_csv = census_locn_file_data+'population_health.csv'
df_hlth_oa = (
    pd.read_csv(population_health_csv)
    .set_index('2011 output area')
    .astype('float64')
)

hlth_ftrs = df_hlth_oa.columns.tolist()
print(hlth_ftrs)

## 7. Ethnicity

['White', 'Mixed/multiple ethnic groups', 'Asian/Asian British', 'Black/African/Caribbean/Black British', 'Other ethnic group']

In [None]:
# Ethnicity table, indexed by Output Area code.
df_ethnc_oa = pd.read_csv(census_locn_file_data+'ethnicity.csv')
df_ethnc_oa = df_ethnc_oa.set_index('2011 output area')

ethnc_ftrs = df_ethnc_oa.columns.tolist()

# Convert the feature columns to float one at a time.
for ftr in ethnc_ftrs:
    df_ethnc_oa[ftr] = df_ethnc_oa[ftr].astype(float)

print(ethnc_ftrs)

## 8. Cars per household

['No cars or vans in household', '1 car or van in household',
       '2 cars or vans in household', '3 cars or vans in household',
       '4 or more cars or vans in household']

In [None]:
# Cars-per-household table, indexed by Output Area code.
df_cars_oa = (
    pd.read_csv(census_locn_file_data+'cars.csv')
    .set_index('2011 output area')
)

# Only the numeric columns count as vehicle features; the result is a pandas
# Index of column names.
numeric_cars = df_cars_oa.select_dtypes(include='number')
vehcl_ftrs = numeric_cars.columns

print(vehcl_ftrs)

## Merge all Census features

In [None]:
# Place the eight census tables side by side. Each one is indexed by
# '2011 output area'; reset_index() turns that index back into a column.
census_tables = [
    df_hh_oa,
    df_age_oa,
    df_dep_oa,
    df_res_oa,
    df_wrk_oa,
    df_hlth_oa,
    df_ethnc_oa,
    df_cars_oa,
]
df_census_oa = pd.concat(census_tables, axis=1).reset_index()

# Inner-join with the Output Area boundaries on '2011 output area' to pick up
# the geometries, then drop every row that still contains a missing value.
df_census_oa = (
    df_census_oa
    .merge(df_oa_shap_files, on='2011 output area', how='inner')
    .dropna()
    .reset_index(drop=True)
)

# Persist the combined census + geometry table.
census_pkl = data_folder+'census_oa_shapefiles.pkl'
df_census_oa.to_pickle(census_pkl)

# Obtain census features for 5km buffer zone around each people counter location

In [None]:
# Load the people-monitoring site geometries and keep only the rows flagged
# as '5km buffer', reduced to the counter identifier and its geometry.
df_sites = (
    gpd.read_file(data_folder+'accessibility.shp')
    .loc[
        lambda sites: sites['geom_type'] == '5km buffer',
        ['counter', 'geometry'],
    ]
    .reset_index(drop=True)
)

print(df_sites.shape)

In [None]:
# Intersect the 5km buffer around each people counter site with the census
# data at Output Area level. Both layers are first projected to crs_mtr
# (presumably a metre-based CRS - confirm in the config).
sites_mtr = df_sites.to_crs(crs_mtr)
census_mtr = gpd.GeoDataFrame(df_census_oa).to_crs(crs_mtr)

df_sites_oa_intersection = sites_mtr.overlay(census_mtr, how='intersection')

In [None]:
# Visualise the Output Areas that fall inside each monitoring site's buffer.

# Reproject the sites once and reuse the result for the plot and the basemap.
sites_deg = df_sites.to_crs(crs_deg)

# Buffers as the base layer, intersected Output Areas in translucent red on top.
ax = sites_deg.plot(alpha=1)
df_sites_oa_intersection.to_crs(crs_deg).plot(ax=ax, alpha=0.25, color='r')

# Add a basemap drawn in the same CRS as the plotted layers.
contextily.add_basemap(
    ax,
    crs=sites_deg.crs.to_string(),
    source=contextily.providers.CartoDB.VoyagerNoLabels,
)

# Hide the axes so only the map is shown.
ax.set_axis_off()
    
   

In [None]:
# Area of every intersected piece, measured in crs_mtr and divided by 10^6
# (square metres -> square kilometres, assuming crs_mtr is in metres - confirm).
piece_area = df_sites_oa_intersection.to_crs(crs_mtr).area
df_sites_oa_intersection['area_sq_km'] = piece_area / 1_000_000

In [None]:
# Visualisation: urban/rural make-up of each monitoring site's buffer zone.

# Classes to stack, in order of first appearance in the data.
ruc_classes = df_sites_oa_intersection['urban_rural'].unique().tolist()

# Total intersected area per counter and class, one column per class.
area_by_class = (
    df_sites_oa_intersection
    .groupby(['counter', 'urban_rural'])['area_sq_km']
    .sum()
    .unstack()
    .reset_index()
    .sort_values(by='counter')
)

area_by_class.plot.barh(
    x='counter',
    y=ruc_classes,
    stacked=True,
    figsize=(15, 10),
)

In [None]:
# Histogram of the total intersected area per counter (one value per buffer zone).
buffer_area = df_sites_oa_intersection.groupby('counter')['area_sq_km'].sum()
buffer_area.hist()

# Save census feature data for each buffer zone.

In [None]:
# Write the per-buffer census features (the site / Output Area intersection) to disk.
socio_economic_pkl = data_folder+'census_oa_socio_economic_ftrs.pkl'
df_sites_oa_intersection.to_pickle(socio_economic_pkl)