In [1]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns

print(f'pandas: {pd.__version__}')
print(f'numpy: {np.__version__}')
print(f'matplotlib: {matplotlib.__version__}')
print(f'seaborn: {sns.__version__}')

pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', str)

pandas: 2.1.1
numpy: 1.26.0
matplotlib: 3.8.0
seaborn: 0.13.0


# Load Index of Multiple Deprivation (IMD) Data

Index of Multiple Deprivation (IMD) is a measure of relative deprivation for small areas in England. It is used to identify areas of deprivation and target public services to meet local needs. The Index of Multiple Deprivation 2019 (IMD 2019) is the most up-to-date measure of relative deprivation for small areas in England. It available at lower layer super output area (LSOA) level and is based on a range of data sources, including the 2011 Census, the Index of Child Deprivation 2019 (ICD 2019) and the English Indices of Deprivation 2019 (EID 2019). The dataset can be downloaded from the [Office for National Statistics (ONS)](https://www.gov.uk/government/statistics/english-indices-of-deprivation-2019).

In [2]:
def load_imd_data():
    df = pd.read_excel('./data/input/File_2_-_IoD2019_Domains_of_Deprivation.xlsx', header=1, sheet_name=1)

    # Only keep the columns we need
    df = df.iloc[:, [0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]]
    df.columns = ['LSOA', 'LSOAName', 'IMD', 'IMDDecile', 'Inc', 'IncDecile', 'Emp', 'EmpDecile', 'Edu', 'EduDecile', 'Crm', 'CrmDecile', 'HouseBar', 'HouseBarDecile', 'Env', 'EnvDecile']

    df.set_index('LSOA', inplace=True)
    df = df.add_prefix('IMD_')
    return df

df = load_imd_data()
df.head()

Unnamed: 0_level_0,IMD_LSOAName,IMD_IMD,IMD_IMDDecile,IMD_Inc,IMD_IncDecile,IMD_Emp,IMD_EmpDecile,IMD_Edu,IMD_EduDecile,IMD_Crm,IMD_CrmDecile,IMD_HouseBar,IMD_HouseBarDecile,IMD_Env,IMD_EnvDecile
LSOA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
E01000002,City of London 001B,30379,10,29901,10,31190,10,32832,10,29705,10,32789,10,11707,4
E01000003,City of London 001C,14915,5,18510,6,15103,5,26386,9,17600,6,29363,9,2157,1
E01000005,City of London 001E,8678,3,6029,2,7833,3,12370,4,17907,6,31059,10,2217,1
E01000006,Barking and Dagenham 016A,14486,5,14023,5,21692,7,17511,6,21581,7,18848,6,1033,1
E01000007,Barking and Dagenham 015A,7256,3,6261,2,11487,4,20536,7,16414,5,4925,2,274,1


In [3]:
from modules.utils import load_saved_data

ppd_df = load_saved_data('1_ppd_epc_data')

Loading saved data from ./data/saved/1_ppd_epc_data.parquet...


In [4]:
def enrich_with_imd_data(ppd_df, imd_df):
    ppd_df = ppd_df.reset_index()[['PPD_ID', 'ONSUD_LSOA']]

    ppd_df = ppd_df.merge(imd_df, how='left', left_on='ONSUD_LSOA', right_index=True)
    ppd_df.drop(columns=['ONSUD_LSOA'], inplace=True) 
    ppd_df.set_index('PPD_ID', inplace=True)

    return ppd_df

enriched_ppd_df = enrich_with_imd_data(ppd_df, df)

In [5]:
from modules.utils import save_data

# Save it for later
save_data(enriched_ppd_df, '6_ppd_imd_data')

Saving data to ./data/saved/6_ppd_imd_data.parquet...


In [6]:
# enriched_ppd_df.info()

enriched_ppd_df['IMD_CrmDecile'].describe()

count            348910.0
mean     7.05891490642286
std     2.293903585135633
min                   1.0
25%                   5.0
50%                   7.0
75%                   9.0
max                  10.0
Name: IMD_CrmDecile, dtype: float64