In [1]:
import pandas
import datetime

### Output

This defines a dataframe `covid`:
* One row per date and ltla (Lower Tier Local Authority); the dates are consecutive and cover the same date range for every ltla (from the earliest date in the source data up to the day before the latest one)
* Columns for cumulative number of cases (by "specimen date", not by reporting date), and cumulative deaths (by death date, not by reporting date). I expect the last few days are undercounted, because of late reporting
* Extra columns containing the population for each ltla, and also its utla and region

### Definitions
In England, the structure is msoa < ltla < utla ("upper tier local authority") < region. Cambridge is an example of an ltla. In the rest of the UK, we only have ltla, and I've set utla to be equal to ltla. (The msoa file is used only to work out the ltla < utla < region structure for England.)

### Input data
* `ltla`: COVID data from [coronavirus.data.gov.uk](https://coronavirus.data.gov.uk/details/download)
* `msoa`: more COVID data from the same source, at the granularity of neighbourhoods, but only for England
* `popn`: population for each LTLA from [ons.gov.uk](https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/populationestimatesforukenglandandwalesscotlandandnorthernireland); using the "MYE1 Persons" sheet of the downloaded spreadsheet

In [2]:
# Base URL of the data mirror; each file name below is appended to it.
PREFIX = 'https://www.cl.cam.ac.uk/teaching/2021/DataSci/data/'

# Download the three input tables, in this order:
#   ltla -- COVID data per Lower Tier Local Authority
#   msoa -- COVID data at neighbourhood granularity (England only)
#   popn -- mid-year population estimates
ltla, msoa, popn = (
    pandas.read_csv(PREFIX + csv_name)
    for csv_name in (
        'ltla_2021-01-22.csv',
        'msoa_2021-01-22.csv',
        'ukmidyearestimates20192019ladcodes.csv',
    )
)

In [13]:
# Build the geographical hierarchy: one row per ltla with its (ltla, utla, region) codes.
#   * England: utla and region are read off the msoa file.
#   * Scotland/Wales/NI: utla is set equal to ltla, and the region is the
#     country-level code whose first letter matches the ltla code's first letter.

# England -- distinct (ltla, utla, region) triples found in the msoa data.
area_eng = (
    msoa
    .rename(columns={'LtlaCode': 'ltla', 'UtlaCode': 'utla', 'regionCode': 'region'})
    .loc[:, ['ltla', 'utla', 'region']]
    .drop_duplicates()
)

# Lookup table: first letter of a code -> country-level code from the population file.
country_codes = popn.loc[popn.Geography == 'Country', 'Code'].drop_duplicates()
regions = pandas.DataFrame({'region': country_codes, 'letter': country_codes.str[:1]})

# Everything that is not English (codes not starting with 'E'), tagged with its country code.
area_other = (
    ltla
    .rename(columns={'areaCode': 'ltla'})[['ltla']]
    .drop_duplicates()
    .assign(letter=lambda codes: codes.ltla.str[:1])
    .loc[lambda codes: codes.letter != 'E']
    .merge(regions, on='letter', how='left')
)
# Outside England there is no separate upper tier, so utla is a copy of ltla.
area_other = area_other.assign(utla=area_other.ltla)[['ltla', 'utla', 'region']]

# Stack both parts into a single lookup table.
area = pandas.concat([area_other, area_eng], ignore_index=True)

# Attach population and human-readable names to the hierarchy.
# The population file doubles as a code -> name dictionary, so it is joined
# three times: once per level of the hierarchy (ltla, utla, region).
ltla_lookup = popn[['Code', 'Name', 'All ages']].rename(
    columns={'Code': 'ltla', 'Name': 'ltlaName', 'All ages': 'popn'})
utla_lookup = popn[['Code', 'Name']].rename(
    columns={'Code': 'utla', 'Name': 'utlaName'})
region_lookup = popn[['Code', 'Name']].rename(
    columns={'Code': 'region', 'Name': 'regionName'})

area = (
    area
    .merge(ltla_lookup, on='ltla', how='left')
    .merge(utla_lookup, on='utla', how='left')
    .merge(region_lookup, on='region', how='left')
)

# Every code must have been found in the population file.
for required in ('popn', 'ltlaName', 'utlaName', 'regionName'):
    assert area[required].notna().all()

# Using the ltla COVID dataset, ensure we have one record for every ltla at every date
ltla['date'] = pandas.to_datetime(ltla.date)

# First, create a DataFrame listing every possible combination of ltla and date.
# The vectorised .min()/.max() avoid iterating the column in Python (the
# builtins are also order-dependent if a NaT is present).
# NOTE(review): the range deliberately stops one day before the latest date in
# the data, so the final day is dropped -- presumably because it is incomplete;
# confirm this is still wanted.
all_dates = pandas.date_range(
    start=ltla.date.min(),
    end=ltla.date.max() - datetime.timedelta(days=1),
)
all_ltla = ltla.areaCode.unique()
# Cartesian product (date-major, ltla-minor) without a throwaway join key.
df = pandas.MultiIndex.from_product(
    [all_dates, all_ltla], names=['date', 'ltla']
).to_frame(index=False)
# Merge the (date, ltla) grid with the ltla data (with renamed columns).
# The right join keeps exactly the rows of the grid `df`: combinations missing
# from the source data appear with NA counts, and source rows outside the grid
# are discarded.
covid = ltla.rename(columns={'areaCode':'ltla', 'cumCasesBySpecimenDate':'cum_cases', 'cumDeaths28DaysByDeathDate':'cum_deaths'})
covid = covid[['ltla','date','cum_cases','cum_deaths']].merge(df, on=['ltla','date'], how='right')

# If there are any NA, fill them in from the preceding non-NA within the same ltla.
# If there is no preceding non-NA, fill with 0.
# Rows must be in date order inside each ltla for the forward fill to be meaningful.
covid = covid.sort_values(['ltla','date'])
for count_col in ('cum_cases', 'cum_deaths'):
    # GroupBy.ffill() forward-fills within each group directly; it replaces
    # transform(lambda x: x.fillna(method='ffill')), whose `method` argument
    # is deprecated in newer pandas releases.
    carried_forward = covid.groupby('ltla')[count_col].ffill()
    covid[count_col] = carried_forward.fillna(value=0).astype(int)

# Attach the area attributes (utla, region, names, population) to every row;
# 'ltla' is the only column the two frames share.
covid = covid.merge(area, on='ltla', how='left')

# Sanity checks: every ltla was found in the area table, and no count is missing.
for required in ('popn', 'cum_cases'):
    assert covid[required].notna().all()

In [14]:
# Look at the last 10 days for the entire UK, i.e. sum over all ltla.
# Use the groupby's own .sum() rather than .apply(sum): passing the Python
# builtin only works because pandas substitutes np.sum for it, a substitution
# that newer pandas releases deprecate.
covid.groupby('date')[['cum_cases','cum_deaths']].sum().reset_index().tail(10)

Unnamed: 0,date,cum_cases,cum_deaths
348,2021-01-12,3241235,82455
349,2021-01-13,3286083,83516
350,2021-01-14,3328026,84563
351,2021-01-15,3367970,85558
352,2021-01-16,3398058,86612
353,2021-01-17,3426459,87632
354,2021-01-18,3470223,88660
355,2021-01-19,3507473,89647
356,2021-01-20,3534355,90411
357,2021-01-21,3538995,90726
