In [1]:
import io
import json
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import urllib3
from tqdm import tqdm

sns.set_style("whitegrid")

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Get car/van availability data from Nomis

In [2]:
# Geography code sent to the Nomis API for each London borough, keyed by
# borough name. Insertion order is kept exactly as before because the cells
# that iterate this mapping turn it into the column order of the saved table.
boroughs = dict((
    ('Camden', 1946157246),
    ('City of London', 1946157247),
    ('Hackney', 1946157248),
    ('Haringey', 1946157250),
    ('Islington', 1946157251),
    ('Kensington and Chelsea', 1946157252),
    ('Lambeth', 1946157253),
    ('Lewisham', 1946157254),
    ('Newham', 1946157255),
    ('Southwark', 1946157256),
    ('Tower Hamlets', 1946157257),
    ('Wandsworth', 1946157258),
    ('Westminster', 1946157259),
    ('Barking and Dagenham', 1946157260),
    ('Barnet', 1946157261),
    ('Bexley', 1946157262),
    ('Brent', 1946157263),
    ('Bromley', 1946157264),
    ('Croydon', 1946157265),
    ('Ealing', 1946157266),
    ('Enfield', 1946157267),
    ('Greenwich', 1946157268),
    ('Harrow', 1946157269),
    ('Havering', 1946157270),
    ('Hillingdon', 1946157271),
    ('Hounslow', 1946157272),
    ('Kingston upon Thames', 1946157273),
    ('Merton', 1946157274),
    ('Redbridge', 1946157275),
    ('Richmond upon Thames', 1946157276),
    ('Sutton', 1946157277),
    ('Waltham Forest', 1946157278),
    ('Hammersmith and Fulham', 1946157249),
))

In [3]:
# Shared urllib3 connection pool; the download loop in the next cell sends
# all of its GET requests through this object.
http = urllib3.PoolManager()

In [4]:
# Download the car/van figures for every borough from the Nomis API.
# `dct` maps borough name -> the first seven OBS_VALUE entries of the CSV
# response; a later cell labels those seven rows with `idx`.
dct = {}

for b, code in tqdm(boroughs.items()):

    res = http.request("GET", "https://www.nomisweb.co.uk/api/v01/dataset/NM_621_1.data.csv?geography={}&measures=20100".format(code))

    # Fail loudly on an HTTP error instead of parsing an error page as CSV.
    if res.status != 200:
        raise RuntimeError("Nomis request for {} failed with HTTP status {}".format(b, res.status))

    # Parse the payload in memory. Round-tripping through a scratch file
    # written with the platform-default encoding could corrupt (or fail on)
    # non-ASCII text and left a stray tmp.txt in the working directory.
    df = pd.read_csv(io.StringIO(res.data.decode('utf-8')))

    vals = df['OBS_VALUE'].iloc[0:7].values

    # A short response would otherwise only surface later as an opaque
    # length mismatch when the table is assembled.
    if len(vals) != 7:
        raise ValueError("Expected 7 OBS_VALUE rows for {}, got {}".format(b, len(vals)))

    dct[b] = vals

    # One-second pause between requests (presumably to stay polite to the API).
    time.sleep(1)

100%|██████████| 33/33 [00:36<00:00,  1.12s/it]


In [5]:
# Row labels for the seven values kept per borough, in the order they are stored.
idx = [
    'Total',
    '0', '1', '2', '3', '4+',
    '#vehicles',
]

In [6]:
# Assemble the downloaded values into a table: one column per key of `dct`
# (borough name) and one row per label in `idx`.
df = pd.DataFrame(dct, index=idx)

In [7]:
# Save the Nomis table; the "Compare" section reads this file back as the reference.
df.to_csv('../../data/testing/nomis/car_van.csv')

# Compute values for synthetic data

In [2]:
# Borough names, used to locate one CSV per borough in each synthetic-data
# folder. The order is unchanged: it becomes the column order of the saved tables.
boroughs = [
    'Camden',
    'City of London',
    'Hackney',
    'Haringey',
    'Islington',
    'Kensington and Chelsea',
    'Lambeth',
    'Lewisham',
    'Newham',
    'Southwark',
    'Tower Hamlets',
    'Wandsworth',
    'Westminster',
    'Barking and Dagenham',
    'Barnet',
    'Bexley',
    'Brent',
    'Bromley',
    'Croydon',
    'Ealing',
    'Enfield',
    'Greenwich',
    'Harrow',
    'Havering',
    'Hillingdon',
    'Hounslow',
    'Kingston upon Thames',
    'Merton',
    'Redbridge',
    'Richmond upon Thames',
    'Sutton',
    'Waltham Forest',
    'Hammersmith and Fulham',
]

# Row labels for the seven values computed per borough.
idx = [
    'Total',
    '0', '1', '2', '3', '4+',
    '#vehicles',
]

In [3]:
# ciDATGAN
# For every borough, compute the seven values labelled by `idx` from the
# ciDATGAN CSV and store them in `dct` under the borough name.
dct = {}

for name in boroughs:
    synth = pd.read_csv('../../data/ciDATGAN/{}.csv'.format(name))

    # Each row contributes 1/hh_people (presumably one row per person, so the
    # sum approximates a household count - confirm against the data schema).
    weight = 1/synth['hh_people']
    cars = synth['hh_carvan']

    # Row filters in `idx` order after 'Total': exactly 0, 1, 2, 3, then 4 or more.
    buckets = [cars == k for k in range(0, 4)] + [cars >= 4]

    vals = [np.sum(weight)]
    vals.extend(np.sum(weight[mask]) for mask in buckets)

    # Last entry: hh_carvan weighted by the same 1/hh_people factor.
    vals.append(np.sum(cars/synth['hh_people']))

    dct[name] = vals

In [4]:
# Table with one column per borough and one row per label in `idx`,
# saved where the "Compare" section expects the ciDATGAN results.
df = pd.DataFrame(dct, index=idx)
df.to_csv('../../data/testing/ciDATGAN/car_van.csv')

In [5]:
# DATGAN
# For every borough, compute the seven values labelled by `idx` from the
# DATGAN CSV and store them in `dct` under the borough name.
dct = {}

for name in boroughs:
    synth = pd.read_csv('../../data/DATGAN/{}.csv'.format(name))

    # Each row contributes 1/hh_people (presumably one row per person, so the
    # sum approximates a household count - confirm against the data schema).
    weight = 1/synth['hh_people']
    cars = synth['hh_carvan']

    # Row filters in `idx` order after 'Total': exactly 0, 1, 2, 3, then 4 or more.
    buckets = [cars == k for k in range(0, 4)] + [cars >= 4]

    vals = [np.sum(weight)]
    vals.extend(np.sum(weight[mask]) for mask in buckets)

    # Last entry: hh_carvan weighted by the same 1/hh_people factor.
    vals.append(np.sum(cars/synth['hh_people']))

    dct[name] = vals

In [6]:
# Table with one column per borough and one row per label in `idx`,
# saved where the "Compare" section expects the DATGAN results.
df = pd.DataFrame(dct, index=idx)
df.to_csv('../../data/testing/DATGAN/car_van.csv')

In [41]:
# oversample
# For every borough, compute the seven values labelled by `idx` from the
# oversample CSV and store them in `dct` under the borough name.
dct = {}

for name in boroughs:
    synth = pd.read_csv('../../data/oversample/{}.csv'.format(name))

    # Each row contributes 1/hh_people (presumably one row per person, so the
    # sum approximates a household count - confirm against the data schema).
    weight = 1/synth['hh_people']
    cars = synth['hh_carvan']

    # Row filters in `idx` order after 'Total': exactly 0, 1, 2, 3, then 4 or more.
    buckets = [cars == k for k in range(0, 4)] + [cars >= 4]

    vals = [np.sum(weight)]
    vals.extend(np.sum(weight[mask]) for mask in buckets)

    # Last entry: hh_carvan weighted by the same 1/hh_people factor.
    vals.append(np.sum(cars/synth['hh_people']))

    dct[name] = vals

In [42]:
# Table with one column per borough and one row per label in `idx`,
# saved where the "Compare" section expects the oversample results.
df = pd.DataFrame(dct, index=idx)
df.to_csv('../../data/testing/oversample/car_van.csv')

# Compare synthetic data against Nomis

In [7]:
# Reference table saved from the Nomis download; the first CSV column holds
# the row labels and becomes the index.
orig = pd.read_csv('../../data/testing/nomis/car_van.csv', index_col=0)

In [9]:
# (label, path) pairs for the synthetic-data tables to compare, kept together
# so each label visibly belongs to its file. `labels`, `files` and `dfs` are
# parallel lists in this order.
_sources = (
    ('ciDATGAN', '../../data/testing/ciDATGAN/car_van.csv'),
    ('DATGAN', '../../data/testing/DATGAN/car_van.csv'),
    ('Oversample LTDS', '../../data/testing/oversample/car_van.csv'),
)

labels = [label for label, _ in _sources]
files = [path for _, path in _sources]

# First CSV column holds the row labels, so it is used as the index.
dfs = list(map(lambda path: pd.read_csv(path, index_col=0), files))

In [10]:
# Express every row of the reference table as a percentage of its first row
# (the first row itself becomes 100 in every column).
orig = orig.div(orig.iloc[0]).mul(100)

# label -> per-row mean absolute difference from the reference, averaged over
# the columns (one column per borough).
errors = {}

for label, synth in zip(labels, dfs):
    # Same percentage-of-first-row scaling as applied to the reference.
    synth_pct = synth.div(synth.iloc[0]).mul(100)
    errors[label] = (orig - synth_pct).abs().mean(axis=1)

In [11]:
# One column per synthetic dataset, one row per label; each value is the
# absolute difference from the reference (in percentage points of the first
# row) averaged over boroughs.
err = pd.DataFrame(errors)
err

Unnamed: 0,ciDATGAN,DATGAN,Oversample LTDS
Total,0.0,0.0,0.0
0,13.892897,13.361371,11.451661
1,10.280735,9.694071,11.147323
2,4.573041,4.018242,1.941138
3,0.492025,0.984506,0.749958
4+,0.203231,0.440105,0.421912
#vehicles,17.59187,17.925355,10.038484


In [12]:
# Column-wise average over all rows of `err`. Note that this includes the
# 'Total' row, which is 0 by construction of the percentage scaling.
err.mean()

ciDATGAN           6.719114
DATGAN             6.631950
Oversample LTDS    5.107211
dtype: float64