In [None]:
from datgan import DATGAN
import datgan

import numpy as np
import pandas as pd
import networkx as nx

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import tensorflow as tf

#tf.config.run_functions_eagerly(True)

In [None]:
# Load the LTDS table that the generator is fitted on further down.
df = pd.read_csv(filepath_or_buffer='../../data/LTDS/cleaned_bias.csv')

In [None]:
# Quick visual check of the first rows and of the column names.
df.head()

In [None]:
# Per-variable descriptions passed to the model's `fit` call.
# `age` is the only variable declared by hand; every other column of `df`
# receives a plain categorical declaration.
data_info = dict(
    age=dict(type='continuous', bounds=[0, 100], discrete=True),
)

for column in df.columns:
    # setdefault leaves the hand-written `age` entry untouched.
    data_info.setdefault(column, {'type': 'categorical'})

In [None]:
# Directed links between variables, written as (source, target) pairs.
# The order of the pairs is kept as-is so nodes are inserted in the same order.
edges = [
    ('age', 'hh_comp'),
    ('gender', 'hh_comp'),
    ('ethnicity', 'hh_people'),
    ('hh_borough', 'hh_comp'),
    ('hh_borough', 'hh_carvan'),
    ('hh_borough', 'hh_income'),
    ('hh_comp', 'hh_income'),
    ('hh_comp', 'hh_people'),
    ('hh_income', 'hh_carvan'),
    ('hh_people', 'hh_carvan'),
]

graph = nx.DiGraph()
graph.add_edges_from(edges)

In [None]:
# Run the datgan package's `advise` helper on the data and the graph, with
# plotting enabled.
# NOTE(review): `datgan` must still be the imported *module* here. A later cell
# rebinds the name `datgan` to a DATGAN instance, so re-running this cell after
# that cell has executed will likely fail -- restart the kernel (or re-run the
# import cell) first.
datgan.advise(df, graph, plot_graphs=True)

In [None]:
# Identifier of this run; it is used to build the output folder path.
name = "DATGAN_bias"

In [None]:
# Folder handed to the DATGAN constructor as `output`; one sub-folder per run name.
output_folder = './output/{}/'.format(name)

In [None]:
# Instantiate the model with a WGAN loss.
# NOTE(review): this assignment rebinds `datgan`, which until now referred to
# the imported module, to a DATGAN instance. Module-level helpers such as
# `datgan.advise` are no longer reachable through this name once this cell
# has run.
datgan = DATGAN(
    output=output_folder,
    loss_function='WGAN',
    batch_size=1878,
    num_epochs=1000,
)

In [None]:
# Fit the model on `df` with the variable descriptions and the graph built in
# the earlier cells.
# NOTE(review): `preprocessed_data_path` presumably points at a cache of the
# encoded data -- confirm against the DATGAN documentation.
datgan.fit(df, data_info, graph, preprocessed_data_path='./output/encoded_bias')

In [None]:
# Draw as many synthetic rows as there are rows in `df`, then store them as CSV.
samp = datgan.sample(df.shape[0])
samp.to_csv('../../data/synthetic/DATGAN_bias.csv', index=False)

# Sampling: matching the population size of each borough

In [None]:
# Boroughs for which a synthetic population is generated.
boroughs = [
    'City of London',
    'Westminster',
    'Kingston upon Thames',
    'Bromley',
    'Greenwich',
    'Havering',
    'Barnet',
    'Enfield',
    'Hillingdon',
    'Brent',
]

In [None]:
# Number of rows still to be generated per borough: initialised to the number
# of rows in the borough's nomis CSV.
nbrs = {
    borough: len(pd.read_csv('../../data/nomis/{}.csv'.format(borough)))
    for borough in boroughs
}

# Chunks of synthetic rows collected per borough by the sampling loop.
dct = {borough: [] for borough in boroughs}

In [None]:
# Keep drawing batches of synthetic rows until every borough has received
# exactly the number of rows recorded for it in `nbrs`.
remaining_boroughs = set(boroughs)

count = 1
while len(remaining_boroughs) > 0:

    print("Pass {} - Remaining boroughs: {}".format(count, len(remaining_boroughs)))

    samp = datgan.sample(100000)

    # Boroughs that reach their target during this pass.
    filled = set()

    for borough in remaining_boroughs:
        still_needed = nbrs[borough]
        rows = samp[samp.hh_borough == borough]

        # Never keep more rows than the borough still needs.
        if len(rows) > still_needed:
            rows = rows.sample(still_needed, replace=False)

        dct[borough].append(rows)
        nbrs[borough] = still_needed - len(rows)

        if nbrs[borough] == 0:
            filled.add(borough)

    # Dropped only after the inner loop: the set cannot change while iterated.
    remaining_boroughs -= filled

    count += 1

In [None]:
# Stitch the collected chunks of every borough together and write one CSV each.
for borough, chunks in dct.items():
    pd.concat(chunks).to_csv('../../data/DATGAN_bias/{}.csv'.format(borough), index=False)

# Sampling and correcting: matching the age/gender counts of each borough

In [None]:
# Boroughs for which an age/gender-corrected synthetic population is generated.
boroughs = [
    'City of London',
    'Westminster',
    'Kingston upon Thames',
    'Bromley',
    'Greenwich',
    'Havering',
    'Barnet',
    'Enfield',
    'Hillingdon',
    'Brent',
]

In [None]:
%%time
nbrs = {}
dct = {}

for r in boroughs:
    tot = 0

    tmp = pd.read_csv('../../data/nomis/{}.csv'.format(r))

    nbrs[r] = tmp.groupby(['age','gender']).size().unstack(fill_value=0)

    dct[r] = []

In [None]:
%%time
remaining_boroughs = set(boroughs)

count = 1
while remaining_boroughs:

    print("Pass {} - Remaining boroughs: {}".format(count, len(remaining_boroughs)))

    samp = datgan.sample(1000000)

    region_to_remove = []

    for r in remaining_boroughs:
        tmp = samp[samp.hh_borough == r]

        grps = tmp.groupby(['age','gender']).size().unstack(fill_value=0)
        
        for g in grps.columns:
            for a in grps.index:
                if nbrs[r][g].loc[a] > 0:
                    if grps[g].loc[a] <= nbrs[r][g].loc[a]:
                        dct[r].append(tmp[(tmp['gender'] == g) & (tmp['age'] == a)])
                        nbrs[r][g].loc[a] = nbrs[r][g].loc[a] - grps[g].loc[a]
                    else:
                        small = tmp[(tmp['gender'] == g) & (tmp['age'] == a)].sample(nbrs[r][g].loc[a], replace=False)
                        nbrs[r][g].loc[a] = 0
                        dct[r].append(small)

        if nbrs[r].sum().sum() == 0:
            region_to_remove.append(r)

    for r in region_to_remove:
        remaining_boroughs.remove(r)

    count += 1

In [None]:
# Inspect the remaining age x gender target counts of one borough; the table
# is expected to contain only zeros once the sampling loop has finished.
nbrs['City of London']