In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from tqdm import tqdm
import time


import seaborn as sns
sns.set_style("whitegrid")


import urllib3
import json

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Update data from nomis: aggregate borough-level counts into London regions

In [8]:
# Read the nomis household-size table; the first CSV column is used as the
# row index. The columns are later selected by borough name.
df = pd.read_csv('../../data/testing/nomis/hh_size.csv', index_col=0)

In [9]:
# Grouping of the London boroughs (and the City of London) into five regions.
# The borough names are used as column labels when the nomis table is
# aggregated, so they must match the CSV header exactly.
# NOTE(review): 'Redbridge' is listed under 'West London' although it lies in
# north-east London -- confirm this matches the region split used to produce
# the per-region synthetic files before changing it.
regions = {
    'Central London': [
        'City of London',
        'Camden',
        'Kensington and Chelsea',
        'Islington',
        'Westminster',
        'Southwark',
        'Lambeth',
    ],
    'South London': [
        'Bromley',
        'Croydon',
        'Kingston upon Thames',
        'Merton',
        'Sutton',
        'Wandsworth',
    ],
    'East London': [
        'Barking and Dagenham',
        'Bexley',
        'Greenwich',
        'Hackney',
        'Havering',
        'Lewisham',
        'Newham',
        'Tower Hamlets',
        'Waltham Forest',
    ],
    'North London': [
        'Barnet',
        'Enfield',
        'Haringey',
    ],
    'West London': [
        'Brent',
        'Ealing',
        'Hammersmith and Fulham',
        'Harrow',
        'Hillingdon',
        'Hounslow',
        'Redbridge',
        'Richmond upon Thames',
    ],
}

In [17]:
# Collapse the borough columns into one array per region: for every row of the
# nomis table, add up the columns of the boroughs assigned to that region.
dct = {
    region: df[boroughs].sum(axis=1).values
    for region, boroughs in regions.items()
}

In [19]:
# Rebuild the table with one column per region, keeping the original row index.
# NOTE: this rebinds `df`, replacing the borough-level table loaded earlier.
# The aggregation cell selects `df` columns by borough name, so re-running it
# after this cell fails until the nomis CSV has been loaded again.
df = pd.DataFrame(dct, index=df.index)

In [22]:
# Save the region-level nomis table; the comparison section reads this same
# path back as the reference data.
df.to_csv('../../data/testing_region/nomis/hh_size.csv')

# Compute household-size counts for the synthetic data

In [25]:
# Region names, used below to build the per-region synthetic file paths.
# NOTE: this rebinds `regions` from the borough-mapping dict defined earlier
# to a plain list of the same five names in the same order.
regions = [
    'Central London',
    'South London',
    'East London',
    'North London',
    'West London',
]

# Row labels of the household-size tables: the overall total, one bucket for
# each size from 1 to 7, and a final open-ended bucket.
idx = ['Total'] + [str(size) for size in range(1, 8)] + ['8+']

In [26]:
# ciDATGAN
# For every region, weight each row of the synthetic file by 1/hh_people and
# sum the weights: once over all rows, once per household size 1..7, and once
# for sizes of 8 and above (nine values, matching the labels in `idx`).
# Presumably each row is one person, so the weighted sums estimate household
# counts -- TODO confirm against the data description.
dct = {}

for region in regions:
    synth = pd.read_csv('../../data/ciDATGAN/{}.csv'.format(region))

    hh_size = synth['hh_people']
    weight = 1/hh_size

    per_size = [np.sum(weight[hh_size == k]) for k in range(1, 8)]
    dct[region] = [np.sum(weight)] + per_size + [np.sum(weight[hh_size >= 8])]

In [27]:
# Rows follow `idx` (total, then one row per household-size bucket); columns
# are the regions collected in `dct`.
df = pd.DataFrame(dct, index=idx)
# Written to the path the comparison section loads the ciDATGAN table from.
df.to_csv('../../data/testing_region/ciDATGAN/hh_size.csv')

In [28]:
# DATGAN
# For every region, weight each row of the synthetic file by 1/hh_people and
# sum the weights: once over all rows, once per household size 1..7, and once
# for sizes of 8 and above (nine values, matching the labels in `idx`).
# Presumably each row is one person, so the weighted sums estimate household
# counts -- TODO confirm against the data description.
dct = {}

for region in regions:
    synth = pd.read_csv('../../data/DATGAN/{}.csv'.format(region))

    hh_size = synth['hh_people']
    weight = 1/hh_size

    per_size = [np.sum(weight[hh_size == k]) for k in range(1, 8)]
    dct[region] = [np.sum(weight)] + per_size + [np.sum(weight[hh_size >= 8])]

In [29]:
# Rows follow `idx` (total, then one row per household-size bucket); columns
# are the regions collected in `dct`.
df = pd.DataFrame(dct, index=idx)
# Written to the path the comparison section loads the DATGAN table from.
df.to_csv('../../data/testing_region/DATGAN/hh_size.csv')

In [30]:
# oversample
# For every region, weight each row of the oversampled file by 1/hh_people and
# sum the weights: once over all rows, once per household size 1..7, and once
# for sizes of 8 and above (nine values, matching the labels in `idx`).
# Presumably each row is one person, so the weighted sums estimate household
# counts -- TODO confirm against the data description.
dct = {}

for region in regions:
    synth = pd.read_csv('../../data/oversample/{}.csv'.format(region))

    hh_size = synth['hh_people']
    weight = 1/hh_size

    per_size = [np.sum(weight[hh_size == k]) for k in range(1, 8)]
    dct[region] = [np.sum(weight)] + per_size + [np.sum(weight[hh_size >= 8])]

In [31]:
# Rows follow `idx` (total, then one row per household-size bucket); columns
# are the regions collected in `dct`.
df = pd.DataFrame(dct, index=idx)
# Written to the path the comparison section loads the oversample table from.
df.to_csv('../../data/testing_region/oversample/hh_size.csv')

# Compare the synthetic household-size distributions against nomis

In [32]:
# Reference table: the region-level nomis figures written earlier in this
# notebook, with the first CSV column restored as the row index.
orig = pd.read_csv('../../data/testing_region/nomis/hh_size.csv', index_col=0)

In [33]:
# Display names of the three synthetic-data sources. They are listed in the
# same order as the CSV paths below because the comparison loop pairs tables
# and labels by position.
labels = ['ciDATGAN', 'DATGAN', 'Oversample LTDS']

files = [
    '../../data/testing_region/ciDATGAN/hh_size.csv',
    '../../data/testing_region/DATGAN/hh_size.csv',
    '../../data/testing_region/oversample/hh_size.csv',
]

# One household-size table per source; the first CSV column is the row index.
dfs = [pd.read_csv(csv_path, index_col=0) for csv_path in files]

In [34]:
# Rescale every column to percentages of its first-row value (the first row is
# presumably the 'Total' row -- confirm against the saved CSV).
orig = orig.div(orig.iloc[0]).mul(100)

In [35]:
# For each synthetic source: rescale its table to percentages of its own first
# row, take the absolute difference from the reference percentages, and
# average that difference across the columns (regions) for every row.
errors = {}

for synth, label in zip(dfs, labels):
    synth_pct = synth.div(synth.iloc[0]).mul(100)
    errors[label] = (orig - synth_pct).abs().mean(axis=1)

In [36]:
# Combine the per-source error Series into one table: one column per source
# label, one row per household-size row label.
err = pd.DataFrame(errors)

In [37]:
# Show the mean absolute difference (in percentage points, averaged over the
# regions) between each synthetic source and the reference, per row label.
err

Unnamed: 0,ciDATGAN,DATGAN,Oversample LTDS
Total,0.0,0.0,0.0
1,2.484199,4.477453,1.195706
2,2.160386,3.159306,2.978045
3,1.222381,1.797234,0.928194
4,0.474603,1.475071,0.584803
5,0.371089,0.795507,0.554604
6,0.955331,1.001637,0.918131
7,0.167238,0.218237,0.204951
8+,0.336553,0.339498,0.346086


In [38]:
# Average each source's error over all rows of the table.
# NOTE(review): the 'Total' row is part of this average and is 0.0 for every
# source in the table above, so it pulls all three averages down by the same
# factor -- confirm whether it should be excluded.
err.mean()

ciDATGAN           0.907975
DATGAN             1.473771
Oversample LTDS    0.856724
dtype: float64