In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from tqdm import tqdm
import time


import seaborn as sns
sns.set_style("whitegrid")

from scipy.spatial.distance import jensenshannon

import urllib3
import json

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Update data from nomis

In [2]:
# Load the Nomis household-size table; the first CSV column becomes the row index.
df = pd.read_csv(
    filepath_or_buffer='../../data/testing/nomis/hh_size.csv',
    index_col=0,
)

In [3]:
# Boroughs (column names of the Nomis table) kept for the comparison.
boroughs = [
    'City of London',
    'Westminster',
    'Kingston upon Thames',
    'Bromley',
    'Greenwich',
    'Havering',
    'Barnet',
    'Enfield',
    'Hillingdon',
    'Brent',
]

In [4]:
# Map each selected borough to the raw values of its column in the loaded table.
dct = {name: df[name].values for name in boroughs}

In [5]:
# Row labels: the overall total, household sizes 1 to 7, and an open-ended 8+ bucket.
idx = ['Total'] + [str(size) for size in range(1, 8)] + ['8+']

In [6]:
# Rebuild the table with one column per selected borough and the row labels in idx.
df = pd.DataFrame(data=dct, index=idx)

In [7]:
# Persist the reduced Nomis table; the Compare section reads it back from this path.
df.to_csv(path_or_buf='../../data/testing_bias/nomis/hh_size.csv')

# Compute values for synthetic data

In [8]:
# Boroughs for which a synthetic CSV is read in the cells below.
boroughs = [
    'City of London',
    'Westminster',
    'Kingston upon Thames',
    'Bromley',
    'Greenwich',
    'Havering',
    'Barnet',
    'Enfield',
    'Hillingdon',
    'Brent',
]

# Row labels of the output tables: total, household sizes 1 to 7, then 8+.
idx = ['Total'] + [str(size) for size in range(1, 8)] + ['8+']

In [9]:
# ciDATGAN
# Every row contributes 1/hh_people to the sums below.
# NOTE(review): presumably each row is one person, so these sums estimate
# household counts — confirm against the data schema.
dct = {}

for borough in boroughs:
    synth = pd.read_csv('../../data/ciDATGAN_bias/{}.csv'.format(borough))
    hh_size = synth['hh_people']

    # One mask per size bucket: exactly 1..7 people, then 8 or more.
    bucket_masks = [hh_size == size for size in range(1, 8)] + [hh_size >= 8]

    # First entry is the sum over all rows, followed by one sum per bucket.
    dct[borough] = [np.sum(1/hh_size)] + [np.sum(1/hh_size[mask]) for mask in bucket_masks]

In [10]:
# Assemble the per-borough lists into a table labelled by idx and save it.
df = pd.DataFrame(data=dct, index=idx)
df.to_csv(path_or_buf='../../data/testing_bias/ciDATGAN/hh_size.csv')

In [11]:
# DATGAN
# Every row contributes 1/hh_people to the sums below.
# NOTE(review): presumably each row is one person, so these sums estimate
# household counts — confirm against the data schema.
dct = {}

for borough in boroughs:
    synth = pd.read_csv('../../data/DATGAN_bias/{}.csv'.format(borough))
    hh_size = synth['hh_people']

    # One mask per size bucket: exactly 1..7 people, then 8 or more.
    bucket_masks = [hh_size == size for size in range(1, 8)] + [hh_size >= 8]

    # First entry is the sum over all rows, followed by one sum per bucket.
    dct[borough] = [np.sum(1/hh_size)] + [np.sum(1/hh_size[mask]) for mask in bucket_masks]

In [12]:
# Assemble the per-borough lists into a table labelled by idx and save it.
df = pd.DataFrame(data=dct, index=idx)
df.to_csv(path_or_buf='../../data/testing_bias/DATGAN/hh_size.csv')

In [13]:
# oversample
# Every row contributes 1/hh_people to the sums below.
# NOTE(review): presumably each row is one person, so these sums estimate
# household counts — confirm against the data schema.
dct = {}

for borough in boroughs:
    synth = pd.read_csv('../../data/oversample_bias/{}.csv'.format(borough))
    hh_size = synth['hh_people']

    # One mask per size bucket: exactly 1..7 people, then 8 or more.
    bucket_masks = [hh_size == size for size in range(1, 8)] + [hh_size >= 8]

    # First entry is the sum over all rows, followed by one sum per bucket.
    dct[borough] = [np.sum(1/hh_size)] + [np.sum(1/hh_size[mask]) for mask in bucket_masks]

In [14]:
# Assemble the per-borough lists into a table labelled by idx and save it.
df = pd.DataFrame(data=dct, index=idx)
df.to_csv(path_or_buf='../../data/testing_bias/oversample/hh_size.csv')

# Compare synthetic data against the Nomis reference

In [2]:
# Reference table saved from the Nomis data; the first CSV column is the row index.
orig = pd.read_csv(
    filepath_or_buffer='../../data/testing_bias/nomis/hh_size.csv',
    index_col=0,
)

In [3]:
# Display names of the compared methods, in the same order as the files below.
labels = ['ciDATGAN', 'DATGAN', 'Oversample LTDS']

# One saved household-size table per method.
files = [
    '../../data/testing_bias/ciDATGAN/hh_size.csv',
    '../../data/testing_bias/DATGAN/hh_size.csv',
    '../../data/testing_bias/oversample/hh_size.csv'
]

# Load every table with its first CSV column as the row index.
dfs = []
for csv_path in files:
    dfs.append(pd.read_csv(csv_path, index_col=0))

In [4]:
# Convert the reference counts to shares of the first row of each column and
# keep only the rows strictly between the first and the last one.
# NOTE(review): iloc[1:-1] removes the final row as well as the first (total)
# row, so the last household-size bucket is excluded from the comparison —
# confirm this is intended. jensenshannon re-normalises its inputs, so the
# remaining shares do not need to sum to 1.
#
# The result is bound to a new name instead of overwriting `orig`. Reassigning
# `orig` made this cell non-idempotent: running it a second time normalised and
# trimmed an already-processed table and silently changed the distances.
orig_share = (orig / orig.iloc[0]).iloc[1:-1]

# Jensen-Shannon distance between each synthetic table and the reference,
# one value per column of the reference table, keyed by method label.
errors = {}

for synth, label in zip(dfs, labels):
    # Same normalisation and trimming as the reference; `dfs` is left untouched.
    synth_share = (synth / synth.iloc[0]).iloc[1:-1]

    errors[label] = [
        jensenshannon(synth_share[c], orig_share[c]) for c in orig_share.columns
    ]

In [5]:
# Print mean and standard deviation of the collected distances for each method.
# The format string is raw so that `\pm` (the LaTeX plus-minus macro) is a
# literal backslash followed by "pm": in a normal string `\p` is an invalid
# escape sequence, which raises a warning on recent Python versions. The
# printed text is unchanged.
for l in labels:
    print(r'{}: {:.2e} \pm {:.2e}'.format(l, np.mean(errors[l]), np.std(errors[l])))

ciDATGAN: 6.05e-02 \pm 4.80e-02
DATGAN: 7.39e-02 \pm 3.16e-02
Oversample LTDS: 7.13e-02 \pm 3.35e-02
