In [1]:
import io
import json
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import urllib3
from scipy.spatial.distance import jensenshannon
from tqdm import tqdm

sns.set_style("whitegrid")

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Get data from nomis

In [27]:
# Borough name -> numeric geography code that is substituted into the
# `geography` query parameter of the Nomis request made further down.
# Iterating a dict follows insertion order, so the order of the entries here
# is also the column order of the table built from the downloaded values.
boroughs = {
    'Camden': 1946157246,
    'City of London': 1946157247,
    'Hackney': 1946157248,
    'Haringey': 1946157250,
    'Islington': 1946157251,
    'Kensington and Chelsea': 1946157252,
    'Lambeth': 1946157253,
    'Lewisham': 1946157254,
    'Newham': 1946157255,
    'Southwark': 1946157256,
    'Tower Hamlets': 1946157257,
    'Wandsworth': 1946157258,
    'Westminster': 1946157259,
    'Barking and Dagenham': 1946157260,
    'Barnet': 1946157261,
    'Bexley': 1946157262,
    'Brent': 1946157263,
    'Bromley': 1946157264,
    'Croydon': 1946157265,
    'Ealing': 1946157266,
    'Enfield': 1946157267,
    'Greenwich': 1946157268,
    'Harrow': 1946157269,
    'Havering': 1946157270,
    'Hillingdon': 1946157271,
    'Hounslow': 1946157272,
    'Kingston upon Thames': 1946157273,
    'Merton': 1946157274,
    'Redbridge': 1946157275,
    'Richmond upon Thames': 1946157276,
    'Sutton': 1946157277,
    'Waltham Forest': 1946157278,
    'Hammersmith and Fulham': 1946157249,
}

In [28]:
# urllib3 pool manager through which the Nomis API requests below are issued.
http = urllib3.PoolManager()

In [29]:
# Download the ethnic-group counts of every borough from the Nomis API.
# After the loop, `dct` maps borough name -> [total, white, mixed, asian, black, other].
dct = {}

# Positions of the five top-level groups within the `OBS_VALUE` column of the
# CSV returned by the API.
# NOTE(review): these offsets are tied to the row layout of dataset NM_608_1
# as returned for `measures=20100` -- confirm them if the query ever changes.
group_rows = {'white': 1, 'mixed': 6, 'asian': 11, 'black': 17, 'other': 21}

for b in tqdm(boroughs):

    res = http.request("GET", "https://www.nomisweb.co.uk/api/v01/dataset/NM_608_1.data.csv?geography={}&measures=20100".format(boroughs[b]))

    # Fail loudly on an HTTP error instead of trying to parse an error page as CSV.
    if res.status != 200:
        raise RuntimeError('Nomis request for {} failed with HTTP status {}'.format(b, res.status))

    # Parse the payload straight from memory. Writing it to a scratch file first
    # re-encoded it with the platform default encoding (not necessarily UTF-8)
    # and left a stray `tmp.txt` behind in the working directory.
    df = pd.read_csv(io.BytesIO(res.data))

    observations = df['OBS_VALUE']
    vals = [observations.iloc[row] for row in group_rows.values()]
    # The total is the sum of the five groups, not a value read from the response.
    vals.insert(0, np.sum(vals))

    dct[b] = vals

    # Pause for one second between consecutive requests.
    time.sleep(1)

100%|██████████| 33/33 [00:41<00:00,  1.26s/it]


In [30]:
# Row labels of the saved table, in the same order as the per-borough value
# lists stored in `dct` (total first, then the five groups).
idx = ['Total', 'White', 'Mixed', 'Asian', 'Black', 'Other']

In [31]:
# One column per borough (the keys of `dct`), one row per label in `idx`.
df = pd.DataFrame(dct, index=idx)

In [32]:
# Persist the Nomis table; the "Compare" section reads this file back with index_col=0.
df.to_csv('../../data/testing/nomis/ethnicity.csv')

# Compute values for synthetic data

In [2]:
# Borough names used to locate the per-borough synthetic CSV files below.
# Note: this rebinds `boroughs`, which the Nomis section defined as a
# name -> geography-code dict; from here on it is a plain list of names.
boroughs = ['Camden', 'City of London', 'Hackney', 'Haringey', 'Islington', 'Kensington and Chelsea', 'Lambeth', 'Lewisham', 'Newham', 'Southwark', 'Tower Hamlets', 'Wandsworth', 'Westminster', 'Barking and Dagenham', 'Barnet', 'Bexley', 'Brent', 'Bromley', 'Croydon', 'Ealing', 'Enfield', 'Greenwich', 'Harrow', 'Havering', 'Hillingdon', 'Hounslow', 'Kingston upon Thames', 'Merton', 'Redbridge', 'Richmond upon Thames', 'Sutton', 'Waltham Forest', 'Hammersmith and Fulham']
# Row labels of the tables built below: the total first, then the values
# matched against the `ethnicity` column of each synthetic file.
idx = ['Total', 'White', 'Mixed', 'Asian', 'Black', 'Other']

In [3]:
# ciDATGAN
# For every borough, store the number of rows in its synthetic file followed
# by the number of rows per ethnic group, in the order of `idx[1:]`.
dct = {}

for borough in boroughs:
    synth = pd.read_csv('../../data/ciDATGAN/{}.csv'.format(borough))
    ethnicity = synth['ethnicity']

    group_counts = [int((ethnicity == group).sum()) for group in idx[1:]]
    dct[borough] = [len(synth)] + group_counts

In [4]:
# Boroughs become columns and the labels in `idx` become rows; the table is
# written to the path that the "Compare" section loads for ciDATGAN.
df = pd.DataFrame(data=dct, index=idx)
df.to_csv(path_or_buf='../../data/testing/ciDATGAN/ethnicity.csv')

In [5]:
# DATGAN
# For every borough, store the number of rows in its synthetic file followed
# by the number of rows per ethnic group, in the order of `idx[1:]`.
dct = {}

for borough in boroughs:
    synth = pd.read_csv('../../data/DATGAN/{}.csv'.format(borough))
    ethnicity = synth['ethnicity']

    group_counts = [int((ethnicity == group).sum()) for group in idx[1:]]
    dct[borough] = [len(synth)] + group_counts

In [6]:
# Boroughs become columns and the labels in `idx` become rows; the table is
# written to the path that the "Compare" section loads for DATGAN.
df = pd.DataFrame(data=dct, index=idx)
df.to_csv(path_or_buf='../../data/testing/DATGAN/ethnicity.csv')

In [40]:
# oversample
# For every borough, store the number of rows in its oversampled file followed
# by the number of rows per ethnic group, in the order of `idx[1:]`.
dct = {}

for borough in boroughs:
    synth = pd.read_csv('../../data/oversample/{}.csv'.format(borough))
    ethnicity = synth['ethnicity']

    group_counts = [int((ethnicity == group).sum()) for group in idx[1:]]
    dct[borough] = [len(synth)] + group_counts

In [41]:
# Boroughs become columns and the labels in `idx` become rows; the table is
# written to the path that the "Compare" section loads for the oversampled data.
df = pd.DataFrame(data=dct, index=idx)
df.to_csv(path_or_buf='../../data/testing/oversample/ethnicity.csv')

# Compare

In [2]:
# Reference table saved by the Nomis section: the first CSV column holds the
# row labels, the remaining columns are one per borough.
orig = pd.read_csv('../../data/testing/nomis/ethnicity.csv', index_col=0)

In [3]:
# Display names for the three data sources; `labels`, `files` and `dfs` are
# paired by position.
labels = ['ciDATGAN', 'DATGAN', 'Oversample LTDS']

# Tables written by the "Compute values for synthetic data" section.
files = [
    '../../data/testing/ciDATGAN/ethnicity.csv',
    '../../data/testing/DATGAN/ethnicity.csv',
    '../../data/testing/oversample/ethnicity.csv',
]

# Each table is loaded with its first column as the row index.
dfs = [pd.read_csv(csv_path, index_col=0) for csv_path in files]

In [4]:
def to_group_shares(counts):
    """Turn a table of counts into per-column shares of its first row.

    Every row is divided, column by column, by the first row (the 'Total' row
    of the tables saved earlier in this notebook); the first and the last row
    are then removed from the result.

    Parameters
    ----------
    counts : pandas.DataFrame
        One column per borough; the first row holds the totals.

    Returns
    -------
    pandas.DataFrame
        A new frame of shares; `counts` itself is left untouched.
    """
    shares = counts / counts.iloc[0]
    # NOTE(review): besides the totals row this slice also drops the LAST row
    # ('Other' in the saved tables) -- confirm that excluding it is intended.
    return shares.iloc[1:-1]


# The shares go into a new name instead of overwriting `orig`: rebinding `orig`
# made this cell non-idempotent (a second run divided by the wrong row and
# dropped further rows without any error).
orig_shares = to_group_shares(orig)

# label -> list with one Jensen-Shannon distance per borough column.
errors = {}

for label, synth_counts in zip(labels, dfs):
    synth_shares = to_group_shares(synth_counts)

    errors[label] = [
        jensenshannon(synth_shares[c], orig_shares[c])
        for c in orig_shares.columns
    ]

In [5]:
# Print mean and standard deviation of the per-borough distances for each
# data source, formatted as "<label>: <mean> \pm <std>".
# The format string is a raw string: `\p` is not a valid escape sequence, so
# the plain literal triggered an invalid-escape warning on recent Python
# versions. The printed text is unchanged.
for label in labels:
    print(r'{}: {:.2e} \pm {:.2e}'.format(label, np.mean(errors[label]), np.std(errors[label])))

ciDATGAN: 1.32e-01 \pm 5.57e-02
DATGAN: 1.30e-01 \pm 5.68e-02
Oversample LTDS: 5.90e-02 \pm 1.90e-02
