In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from tqdm import tqdm
import time


import seaborn as sns
sns.set_style("whitegrid")


import urllib3
import json

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Aggregate the Nomis borough-level data into regions

In [2]:
# Load the Nomis ethnicity table; the first CSV column becomes the row index.
# Later cells select this frame's columns by borough name.
df = pd.read_csv(
    filepath_or_buffer='../../data/testing/nomis/ethnicity.csv',
    index_col=0,
)

In [3]:
# Mapping from region name to the borough column names that are summed
# together to form that region.
# NOTE(review): 'Redbridge' is listed under 'West London' -- confirm this
# matches the intended region definition.
regions = {
    'Central London': [
        'City of London', 'Camden', 'Kensington and Chelsea', 'Islington',
        'Westminster', 'Southwark', 'Lambeth',
    ],
    'South London': [
        'Bromley', 'Croydon', 'Kingston upon Thames', 'Merton', 'Sutton',
        'Wandsworth',
    ],
    'East London': [
        'Barking and Dagenham', 'Bexley', 'Greenwich', 'Hackney', 'Havering',
        'Lewisham', 'Newham', 'Tower Hamlets', 'Waltham Forest',
    ],
    'North London': [
        'Barnet', 'Enfield', 'Haringey',
    ],
    'West London': [
        'Brent', 'Ealing', 'Hammersmith and Fulham', 'Harrow', 'Hillingdon',
        'Hounslow', 'Redbridge', 'Richmond upon Thames',
    ],
}

In [4]:
# For each region, add up its borough columns row by row; the result is one
# array of per-row totals per region, keyed by region name.
dct = {
    name: df[boroughs].sum(axis=1).values
    for name, boroughs in regions.items()
}

In [5]:
# Assemble the regional totals into a frame that keeps the original row labels.
# NOTE: this rebinds `df` from the borough-level table to the region-level
# one, so the aggregation cell cannot be re-run without reloading the CSV.
df = pd.DataFrame(data=dct, index=df.index)

In [6]:
# Persist the region-level table; it is read back later as the reference
# for the comparison.
df.to_csv(path_or_buf='../../data/testing_region/nomis/ethnicity.csv')

# Compute values for synthetic data

In [7]:
# Region names; each one is also the stem of a per-region synthetic CSV file.
regions = [
    'Central London',
    'South London',
    'East London',
    'North London',
    'West London',
]

# Row labels of the count tables: the overall total first, then the values
# matched against the `ethnicity` column of the synthetic data.
idx = [
    'Total',
    'White',
    'Mixed',
    'Asian',
    'Black',
    'Other',
]

In [8]:
# ciDATGAN
# For every region: the total number of synthetic rows, followed by the number
# of rows whose `ethnicity` equals each category in idx[1:].
dct = {}

for region in regions:
    synthetic = pd.read_csv('../../data/ciDATGAN/{}.csv'.format(region))
    ethnicity = synthetic['ethnicity']

    per_category = [int((ethnicity == category).sum()) for category in idx[1:]]
    dct[region] = [len(synthetic)] + per_category

In [9]:
# One column per region, rows labelled by `idx`; written out for the
# comparison step.
df = pd.DataFrame(data=dct, index=idx)
df.to_csv(path_or_buf='../../data/testing_region/ciDATGAN/ethnicity.csv')

In [10]:
# DATGAN
# For every region: the total number of synthetic rows, followed by the number
# of rows whose `ethnicity` equals each category in idx[1:].
dct = {}

for region in regions:
    synthetic = pd.read_csv('../../data/DATGAN/{}.csv'.format(region))
    ethnicity = synthetic['ethnicity']

    per_category = [int((ethnicity == category).sum()) for category in idx[1:]]
    dct[region] = [len(synthetic)] + per_category

In [11]:
# One column per region, rows labelled by `idx`; written out for the
# comparison step.
df = pd.DataFrame(data=dct, index=idx)
df.to_csv(path_or_buf='../../data/testing_region/DATGAN/ethnicity.csv')

In [12]:
# oversample
# For every region: the total number of rows, followed by the number of rows
# whose `ethnicity` equals each category in idx[1:].
dct = {}

for region in regions:
    synthetic = pd.read_csv('../../data/oversample/{}.csv'.format(region))
    ethnicity = synthetic['ethnicity']

    per_category = [int((ethnicity == category).sum()) for category in idx[1:]]
    dct[region] = [len(synthetic)] + per_category

In [13]:
# One column per region, rows labelled by `idx`; written out for the
# comparison step.
df = pd.DataFrame(data=dct, index=idx)
df.to_csv(path_or_buf='../../data/testing_region/oversample/ethnicity.csv')

# Compare the synthetic data against the Nomis reference

In [14]:
# Reference table: the region-level Nomis counts, with the first CSV column
# used as the row index.
orig = pd.read_csv(
    filepath_or_buffer='../../data/testing_region/nomis/ethnicity.csv',
    index_col=0,
)

In [15]:
# Display names for the three data sources, in the same order as `files`.
labels = ['ciDATGAN', 'DATGAN', 'Oversample LTDS']

# Region-level count tables, one CSV per data source.
files = [
    '../../data/testing_region/ciDATGAN/ethnicity.csv',
    '../../data/testing_region/DATGAN/ethnicity.csv',
    '../../data/testing_region/oversample/ethnicity.csv',
]

# Load each table with its first column as the row index.
dfs = []
for path in files:
    dfs.append(pd.read_csv(path, index_col=0))

In [16]:
# Express every row as a percentage of the first row, column by column.
# The first row is presumably the 'Total' row -- confirm against the CSV.
orig = orig.div(orig.iloc[0]).mul(100)

In [17]:
# For each data source: convert its counts to percentages of its first row,
# take the absolute gap to the reference percentages, and average that gap
# across the columns (regions) for every row.
errors = {}

for label, counts in zip(labels, dfs):
    percentages = counts.div(counts.iloc[0]).mul(100)
    errors[label] = (orig - percentages).abs().mean(axis=1)

In [18]:
# Mean absolute percentage-point gap per row, one column per data source.
err = pd.DataFrame.from_dict(errors)
err

Unnamed: 0,ciDATGAN,DATGAN,Oversample LTDS
Total,0.0,0.0,0.0
White,7.526323,6.897417,6.634293
Mixed,1.823613,1.571546,1.764333
Asian,5.648841,5.672969,2.087186
Black,3.084607,3.003653,1.859263
Other,1.578375,1.407688,1.152786


In [19]:
# Average the per-row errors for each data source.
# NOTE(review): the 'Total' row is always 0 after normalisation and is part
# of this average, which lowers the reported means -- confirm that is intended.
err.mean(axis=0)

ciDATGAN           3.276960
DATGAN             3.092212
Oversample LTDS    2.249644
dtype: float64