In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from tqdm import tqdm
import time


import seaborn as sns
sns.set_style("whitegrid")


import urllib3
import json

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Get data from nomis

In [11]:
# Read the nomis `fam_comp.csv` table; its first CSV column becomes the row index.
df = pd.read_csv(
    filepath_or_buffer='../../data/testing/nomis/fam_comp.csv',
    index_col=0,
)

In [12]:
# Grouping of the London local authorities (used as column labels of the nomis
# table) into five regions. Key order matters: it becomes the column order of
# the aggregated region-level table.
# NOTE(review): 'Redbridge' is grouped under 'West London' although it is an
# east London borough — confirm this matches the region assignment used when
# the per-region synthetic files were produced.
regions = {
    'Central London': [
        'City of London',
        'Camden',
        'Kensington and Chelsea',
        'Islington',
        'Westminster',
        'Southwark',
        'Lambeth',
    ],
    'South London': [
        'Bromley',
        'Croydon',
        'Kingston upon Thames',
        'Merton',
        'Sutton',
        'Wandsworth',
    ],
    'East London': [
        'Barking and Dagenham',
        'Bexley',
        'Greenwich',
        'Hackney',
        'Havering',
        'Lewisham',
        'Newham',
        'Tower Hamlets',
        'Waltham Forest',
    ],
    'North London': [
        'Barnet',
        'Enfield',
        'Haringey',
    ],
    'West London': [
        'Brent',
        'Ealing',
        'Hammersmith and Fulham',
        'Harrow',
        'Hillingdon',
        'Hounslow',
        'Redbridge',
        'Richmond upon Thames',
    ],
}

In [13]:
# For each region, add up its borough columns row by row; the result is one
# array of per-row totals per region.
dct = {
    name: df[boroughs].sum(axis=1).values
    for name, boroughs in regions.items()
}

In [14]:
# Build the region-level table (one column per region) with the original row
# labels. This rebinds `df`, so the borough-level frame is no longer reachable
# under that name afterwards.
df = pd.DataFrame(data=dct, index=df.index)

In [15]:
# Persist the region-level nomis table; it is read back in the comparison section.
df.to_csv(path_or_buf='../../data/testing_region/nomis/fam_comp.csv')

# Compute values for synthetic data

In [2]:
# Region names: one synthetic CSV file is read per entry.
# NOTE: this rebinds `regions`, which is a dict (region -> boroughs) earlier in
# the notebook, to a plain list of region names.
regions = [
    'Central London',
    'South London',
    'East London',
    'North London',
    'West London',
]

# Row labels of the result tables. 'Total' has to stay first: the cells below
# fill position 0 with the sum of the remaining categories.
idx = [
    'Total',
    'Single',
    'Couple family',
    'Lone parent',
    'Other',
]

In [3]:
# ciDATGAN
# For every region, sum 1/hh_people over the rows of each household
# composition; the leading 'Total' entry is the sum of those category values.
# (Presumably one row per person, so 1/hh_people counts each household once —
# TODO confirm against the synthetic data schema.)
dct = {}

for region in regions:
    synth = pd.read_csv('../../data/ciDATGAN/{}.csv'.format(region))

    # One value per category, in the order of idx[1:] (everything but 'Total').
    per_comp = [
        np.sum(1 / synth.loc[synth['hh_comp'] == comp, 'hh_people'])
        for comp in idx[1:]
    ]

    dct[region] = [np.sum(per_comp)] + per_comp

In [4]:
# Assemble the per-region values into a table (rows follow `idx`, one column
# per region) and write it next to the other region-level results.
df = pd.DataFrame(data=dct, index=idx)
df.to_csv(path_or_buf='../../data/testing_region/ciDATGAN/fam_comp.csv')

In [5]:
# DATGAN
# For every region, sum 1/hh_people over the rows of each household
# composition; the leading 'Total' entry is the sum of those category values.
# (Presumably one row per person, so 1/hh_people counts each household once —
# TODO confirm against the synthetic data schema.)
dct = {}

for region in regions:
    synth = pd.read_csv('../../data/DATGAN/{}.csv'.format(region))

    # One value per category, in the order of idx[1:] (everything but 'Total').
    per_comp = [
        np.sum(1 / synth.loc[synth['hh_comp'] == comp, 'hh_people'])
        for comp in idx[1:]
    ]

    dct[region] = [np.sum(per_comp)] + per_comp

In [6]:
# Assemble the per-region values into a table (rows follow `idx`, one column
# per region) and write it next to the other region-level results.
df = pd.DataFrame(data=dct, index=idx)
df.to_csv(path_or_buf='../../data/testing_region/DATGAN/fam_comp.csv')

In [7]:
# oversample
# For every region, sum 1/hh_people over the rows of each household
# composition; the leading 'Total' entry is the sum of those category values.
# (Presumably one row per person, so 1/hh_people counts each household once —
# TODO confirm against the oversampled data schema.)
dct = {}

for region in regions:
    synth = pd.read_csv('../../data/oversample/{}.csv'.format(region))

    # One value per category, in the order of idx[1:] (everything but 'Total').
    per_comp = [
        np.sum(1 / synth.loc[synth['hh_comp'] == comp, 'hh_people'])
        for comp in idx[1:]
    ]

    dct[region] = [np.sum(per_comp)] + per_comp

In [8]:
# Assemble the per-region values into a table (rows follow `idx`, one column
# per region) and write it next to the other region-level results.
df = pd.DataFrame(data=dct, index=idx)
df.to_csv(path_or_buf='../../data/testing_region/oversample/fam_comp.csv')

# Compare synthetic data against nomis

In [16]:
# Reference table: the region-level nomis values written earlier in the notebook.
orig = pd.read_csv(
    filepath_or_buffer='../../data/testing_region/nomis/fam_comp.csv',
    index_col=0,
)

In [17]:
# Display labels and result files of the datasets under comparison; the two
# lists are paired by position, so their order must stay in sync.
labels = ['ciDATGAN', 'DATGAN', 'Oversample LTDS']
files = [
    '../../data/testing_region/ciDATGAN/fam_comp.csv',
    '../../data/testing_region/DATGAN/fam_comp.csv',
    '../../data/testing_region/oversample/fam_comp.csv'
]

# Load every table with its first CSV column as the row index.
dfs = []
for path in files:
    dfs.append(pd.read_csv(path, index_col=0))

In [18]:
# Rescale each column to percentages of its first row, so that row becomes 100.
orig = orig.div(orig.iloc[0]).mul(100)

In [19]:
# For each dataset: rescale it to percentages of its first row (same
# treatment as `orig`), take the absolute difference to `orig`, and average
# that difference across the columns for every row.
errors = {}

for label, table in zip(labels, dfs):
    table_pct = table / table.iloc[0] * 100
    abs_diff = np.abs(orig - table_pct)

    errors[label] = np.mean(abs_diff, axis=1)

In [20]:
# Collect the per-row mean absolute errors into one table, one column per dataset.
err = pd.DataFrame(data=errors)
err

Unnamed: 0,ciDATGAN,DATGAN,Oversample LTDS
Total,0.0,0.0,0.0
Single,3.047033,4.189346,1.241999
Couple family,12.869149,11.831018,12.124618
Lone parent,2.554695,1.899944,1.989063
Other,12.007357,12.521151,12.915923


In [21]:
# Average error per dataset over all rows.
# NOTE(review): the 'Total' row is 0 by construction (the first row of every
# table is rescaled to 100 before differencing) and is included in this
# average — confirm that is intended.
err.mean(axis=0)

ciDATGAN           6.095647
DATGAN             6.088292
Oversample LTDS    5.654320
dtype: float64