In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from tqdm import tqdm
import time


import seaborn as sns
sns.set_style("whitegrid")


import urllib3
import json

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Aggregate the Nomis borough-level data into regions

In [15]:
# Load the Nomis car_van table; the first CSV column is used as the row index.
# The cells below select its columns by borough name.
df = pd.read_csv('../../data/testing/nomis/car_van.csv', index_col=0)

In [16]:
# Grouping of the London boroughs into the five regions used for the comparison.
# Each value lists the borough names that are summed into that region.
# NOTE(review): 'Redbridge' is listed under 'West London' although it is geographically
# a north-east London borough — confirm this matches the grouping used to produce the
# per-region synthetic files before moving it.
regions = {
    'Central London': [
        'City of London',
        'Camden',
        'Kensington and Chelsea',
        'Islington',
        'Westminster',
        'Southwark',
        'Lambeth',
    ],
    'South London': [
        'Bromley',
        'Croydon',
        'Kingston upon Thames',
        'Merton',
        'Sutton',
        'Wandsworth',
    ],
    'East London': [
        'Barking and Dagenham',
        'Bexley',
        'Greenwich',
        'Hackney',
        'Havering',
        'Lewisham',
        'Newham',
        'Tower Hamlets',
        'Waltham Forest',
    ],
    'North London': [
        'Barnet',
        'Enfield',
        'Haringey',
    ],
    'West London': [
        'Brent',
        'Ealing',
        'Hammersmith and Fulham',
        'Harrow',
        'Hillingdon',
        'Hounslow',
        'Redbridge',
        'Richmond upon Thames',
    ],
}

In [17]:
# Collapse the borough columns into one array per region: for every row of the
# table, add up the columns of the boroughs that belong to that region.
dct = {
    region: df[boroughs].sum(axis=1).values
    for region, boroughs in regions.items()
}

In [18]:
# Rebuild `df` as the region-level table, reusing the row index of the loaded table.
# Note: this rebinds `df`, so the borough-level columns are no longer reachable
# through it after this cell has run.
df = pd.DataFrame(dct, index=df.index)

In [19]:
# Write the region-level table to the testing_region/nomis folder.
df.to_csv('../../data/testing_region/nomis/car_van.csv')

# Compute values for synthetic data

In [7]:
# Region names; each one is also the file name (without extension) of a synthetic
# data file read by the cells below.
regions = [
    'Central London',
    'South London',
    'East London',
    'North London',
    'West London',
]

# Row labels of the summary tables, in the order the values are appended below.
idx = [
    'Total',      # sum of 1/hh_people over all rows
    '0',          # same sum, restricted to rows with hh_carvan == 0
    '1',          # ... hh_carvan == 1
    '2',          # ... hh_carvan == 2
    '3',          # ... hh_carvan == 3
    '4+',         # ... hh_carvan >= 4
    '#vehicles',  # sum of hh_carvan/hh_people over all rows
]

In [8]:
# ciDATGAN
# Build one list of summary values per region from the ciDATGAN synthetic files.
# Every row is weighted by 1/hh_people (presumably one row per person, so that the
# weights of one household add up to one — TODO confirm).
dct = {}

for region in regions:
    synth = pd.read_csv('../../data/ciDATGAN/{}.csv'.format(region))
    hh_size = synth['hh_people']
    n_cars = synth['hh_carvan']

    # Row selectors: exactly 0, 1, 2 or 3 cars/vans, then 4 or more.
    selectors = [n_cars == k for k in range(0, 4)] + [n_cars >= 4]

    total = np.sum(1/hh_size)
    by_count = [np.sum(1/synth[sel]['hh_people']) for sel in selectors]
    vehicles = np.sum(n_cars/hh_size)

    # Same order as the row labels: total, per-count values, vehicle sum.
    dct[region] = [total, *by_count, vehicles]

In [9]:
# Assemble the per-region values into a table (one column per region, rows labelled
# by `idx`) and save it as the ciDATGAN region-level summary.
df = pd.DataFrame(dct, index=idx)
df.to_csv('../../data/testing_region/ciDATGAN/car_van.csv')

In [10]:
# DATGAN
# Build one list of summary values per region from the DATGAN synthetic files.
# Every row is weighted by 1/hh_people (presumably one row per person, so that the
# weights of one household add up to one — TODO confirm).
dct = {}

for region in regions:
    synth = pd.read_csv('../../data/DATGAN/{}.csv'.format(region))
    hh_size = synth['hh_people']
    n_cars = synth['hh_carvan']

    # Row selectors: exactly 0, 1, 2 or 3 cars/vans, then 4 or more.
    selectors = [n_cars == k for k in range(0, 4)] + [n_cars >= 4]

    total = np.sum(1/hh_size)
    by_count = [np.sum(1/synth[sel]['hh_people']) for sel in selectors]
    vehicles = np.sum(n_cars/hh_size)

    # Same order as the row labels: total, per-count values, vehicle sum.
    dct[region] = [total, *by_count, vehicles]

In [11]:
# Assemble the per-region values into a table (one column per region, rows labelled
# by `idx`) and save it as the DATGAN region-level summary.
df = pd.DataFrame(dct, index=idx)
df.to_csv('../../data/testing_region/DATGAN/car_van.csv')

In [12]:
# oversample
# Build one list of summary values per region from the oversample files.
# Every row is weighted by 1/hh_people (presumably one row per person, so that the
# weights of one household add up to one — TODO confirm).
dct = {}

for region in regions:
    synth = pd.read_csv('../../data/oversample/{}.csv'.format(region))
    hh_size = synth['hh_people']
    n_cars = synth['hh_carvan']

    # Row selectors: exactly 0, 1, 2 or 3 cars/vans, then 4 or more.
    selectors = [n_cars == k for k in range(0, 4)] + [n_cars >= 4]

    total = np.sum(1/hh_size)
    by_count = [np.sum(1/synth[sel]['hh_people']) for sel in selectors]
    vehicles = np.sum(n_cars/hh_size)

    # Same order as the row labels: total, per-count values, vehicle sum.
    dct[region] = [total, *by_count, vehicles]

In [13]:
# Assemble the per-region values into a table (one column per region, rows labelled
# by `idx`) and save it as the oversample region-level summary.
df = pd.DataFrame(dct, index=idx)
df.to_csv('../../data/testing_region/oversample/car_van.csv')

# Compare the synthetic data against the Nomis reference

In [20]:
# Reference table: the region-level Nomis file written earlier in this notebook,
# with its first CSV column used as the row index.
orig = pd.read_csv('../../data/testing_region/nomis/car_van.csv', index_col=0)

In [21]:
# Display names of the three synthetic sources, in the same order as `files`.
labels = ['ciDATGAN', 'DATGAN', 'Oversample LTDS']

# Region-level summary tables written by the cells above, one per source.
files = [
    '../../data/testing_region/ciDATGAN/car_van.csv',
    '../../data/testing_region/DATGAN/car_van.csv',
    '../../data/testing_region/oversample/car_van.csv',
]

# Load every summary table with its first CSV column as the row index.
dfs = [pd.read_csv(path, index_col=0) for path in files]

In [22]:
# Rescale the reference table so that every column is expressed as a percentage of
# its own first row.
orig = orig.div(orig.iloc[0]).mul(100)

# For each synthetic source: rescale it the same way, take the absolute difference
# to the reference and average that difference over the columns (axis=1), which
# leaves one error value per row label.
errors = {}

for label, synth in zip(labels, dfs):
    synth_pct = synth.div(synth.iloc[0]).mul(100)
    errors[label] = (orig - synth_pct).abs().mean(axis=1)

In [25]:
# Gather the per-source errors into one table (one column per source), remove the
# row carrying the last index label, and display the result.
err = pd.DataFrame(errors)
err = err.drop(index=err.index[-1])
err

Unnamed: 0,ciDATGAN,DATGAN,Oversample LTDS
Total,0.0,0.0,0.0
0,14.251139,11.697497,11.880066
1,11.642472,10.224689,10.684633
2,3.037803,2.307669,2.094477
3,0.327124,0.525101,0.546865
4+,0.177579,0.234215,0.35218


In [26]:
# Column-wise mean of the error table: one average error per synthetic source.
# NOTE(review): the 'Total' row is 0.0 for every source in the table displayed above
# (each table is divided by its own first row) and it is part of this mean, which
# lowers the averages — confirm whether it should be excluded.
err.mean()

ciDATGAN           4.906019
DATGAN             4.164862
Oversample LTDS    4.259703
dtype: float64