# Build LSOA data for England and Wales

## Step 1: Fetch the source datasets from their respective URLs

In [None]:
# Download the three source tables; each is saved in the working directory
# under the fixed local name that the later cells read.
# IMD 2019 Index of Multiple Deprivation workbook (File 1) from gov.uk.
! wget -O imd.xlsx https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/833970/File_1_-_IMD2019_Index_of_Multiple_Deprivation.xlsx
# Table saved as rural_urban.csv, fetched from the ONS open-data geoportal
# (presumably the 2011 rural/urban classification -- confirm against the dataset page).
! wget -O rural_urban.csv http://geoportal1-ons.opendata.arcgis.com/datasets/276d973d30134c339eaecfc3c49770b3_0.csv
# Townsend deprivation scores file ("Scores- 2011 UK LSOA.csv").
! wget -O townsend.csv http://s3-eu-west-1.amazonaws.com/statistics.digitalresources.jisc.ac.uk/dkan/files/Townsend_Deprivation_Scores/Scores/Scores-%202011%20UK%20LSOA.csv

## Step 2: Set the path and name of the output ExeTera dataset

In [None]:
# Path of the dataset file written in Step 3. The file is opened in 'w' mode
# there, so point this at a location that is safe to create or overwrite.
lsoa_dataset_name = 'lsoa11.hdf5'  # change to the resulting dataset name

## Step 3: Create the dataset from the source CSV and Excel files

In [None]:
import numpy as np
import pandas as pd
from exetera.core.session import Session
from exetera.core.utils import Timer

with Timer("Creating joined imd, townsend and rural_urban dataset", new_line=True):
    # Load the three downloaded tables. sheet_name=1 selects the workbook's
    # second sheet (sheet positions are zero-based).
    imd_df = pd.read_excel('./imd.xlsx', sheet_name=1)
    tsend_df = pd.read_csv('./townsend.csv')
    ruc_df = pd.read_csv('./rural_urban.csv')

    # Source column name -> output column name, one map per table. Columns
    # that do not end up with one of the output names are discarded.
    imd_names = {
        "LSOA code (2011)": "lsoa11cd",
        "LSOA name (2011)": "lsoa11nm",
        "Index of Multiple Deprivation (IMD) Rank": "imd_rank",
        "Index of Multiple Deprivation (IMD) Decile": "imd_decile",
    }
    tsend_names = {
        "GEO_CODE": "lsoa11cd",
        "TDS": "townsend_score",
        "quintile": "townsend_quintile",
    }
    ruc_names = {
        "LSOA11CD": "lsoa11cd",
        "RUC11CD": "ruc11cd",
        "RUC11": "ruc11desc",
    }

    # A boolean column mask keeps the surviving columns in their original order.
    imd_df = imd_df.rename(columns=imd_names)
    imd_df = imd_df.loc[:, imd_df.columns.isin(list(imd_names.values()))]
    print(imd_df)

    tsend_df = tsend_df.rename(columns=tsend_names)
    tsend_df = tsend_df.loc[:, tsend_df.columns.isin(list(tsend_names.values()))]

    ruc_df = ruc_df.rename(columns=ruc_names)
    ruc_df = ruc_df.loc[:, ruc_df.columns.isin(list(ruc_names.values()))]

    # Outer joins keep every LSOA code that appears in any of the three tables.
    imd_tsend_df = imd_df.merge(tsend_df, how="outer", on="lsoa11cd")
    imd_tsend_ruc_df = imd_tsend_df.merge(ruc_df, how="outer", on="lsoa11cd")

    # Per-row flags: True where the corresponding source table supplied a value.
    imd_filter = imd_tsend_ruc_df["imd_rank"].notnull().to_numpy()
    townsend_filter = imd_tsend_ruc_df["townsend_score"].notnull().to_numpy()
    ruc_filter = imd_tsend_ruc_df["ruc11cd"].notnull().to_numpy()

with Session() as s:
    # Write every column of the joined table into a new 'lsoa11' group of the
    # output dataset, then sort the group by LSOA code.
    with Timer("Importing data to '{}'".format(lsoa_dataset_name), new_line=True):
        lsoa = s.open_dataset(lsoa_dataset_name, 'w', 'lsoa')
        lsoa = lsoa.create_group('lsoa11')

        # The fixed-string field is sized to the longest code present.
        lsoa11cd_len = imd_tsend_ruc_df['lsoa11cd'].map(len).max()
        s.create_fixed_string(lsoa, 'lsoa11cd', lsoa11cd_len).data.write(
            imd_tsend_ruc_df['lsoa11cd'].to_numpy('S{}'.format(lsoa11cd_len)))

        # Rows missing from a source table (outer join) get sentinel values:
        # '' for names, -1 for integer fields and inf for the Townsend score.
        s.create_indexed_string(lsoa, 'lsoa11nm').data.write(
            imd_tsend_ruc_df['lsoa11nm'].fillna('').to_list())

        s.create_numeric(lsoa, 'imd_rank', 'int32').data.write(
            imd_tsend_ruc_df['imd_rank'].fillna(-1))

        s.create_numeric(lsoa, 'imd_decile', 'int8').data.write(
            imd_tsend_ruc_df['imd_decile'].fillna(-1))

        s.create_numeric(lsoa, 'townsend_score', 'float32').data.write(
            imd_tsend_ruc_df['townsend_score'].fillna(np.inf))

        s.create_numeric(lsoa, 'townsend_quintile', 'int8').data.write(
            imd_tsend_ruc_df['townsend_quintile'].fillna(-1))

        # Rural/urban code as a categorical; '' (key 0) marks a missing value.
        # A code absent from the map raises KeyError rather than being
        # silently miscoded.
        ruc11cd = imd_tsend_ruc_df['ruc11cd'].fillna('')
        ruc_map = {'': 0, 'A1': 1, 'B1': 2, 'C1': 3, 'C2': 4, 'D1': 5, 'D2': 6, 'E1': 7, 'E2': 8}
        ruc11cd_cat = np.array([ruc_map[code] for code in ruc11cd], dtype=np.int8)
        s.create_categorical(lsoa, 'ruc11cd', 'int8', ruc_map).data.write(ruc11cd_cat)

        # Rural/urban description, encoded the same way as the code above.
        ruc11desc = imd_tsend_ruc_df['ruc11desc'].fillna('')
        rucdesc_map = {'': 0, 'Urban major conurbation': 1, 'Urban minor conurbation': 2,
                       'Urban city and town': 3, 'Urban city and town in a sparse setting': 4,
                       'Rural town and fringe': 5, 'Rural town and fringe in a sparse setting': 6,
                       'Rural village and dispersed': 7,  'Rural village and dispersed in a sparse setting': 8}
        ruc11desc_cat = np.array([rucdesc_map[desc] for desc in ruc11desc], dtype=np.int8)
        s.create_categorical(lsoa, 'ruc11desc', 'int8', rucdesc_map).data.write(ruc11desc_cat)

        # Flags recording which rows had real (non-sentinel) data per source.
        s.create_numeric(lsoa, 'has_imd_data', 'bool').data.write(imd_filter)
        print("imd data for {} of {} entries".format(imd_filter.sum(), len(imd_filter)))
        s.create_numeric(lsoa, 'has_townsend_data', 'bool').data.write(townsend_filter)
        print("townsend data for {} of {} entries".format(townsend_filter.sum(), len(townsend_filter)))
        s.create_numeric(lsoa, 'has_rural_urban_data', 'bool').data.write(ruc_filter)
        print("ruc data for {} of {} entries".format(ruc_filter.sum(), len(ruc_filter)))

    with Timer("sorting data by lsoa11cd", new_line=True):
        # Source and destination are the same group.
        s.sort_on(lsoa, lsoa, ('lsoa11cd',))