In [1]:
import io
import os
import random

import matplotlib
import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
import pandas
import pandas as pd
import seaborn as sns
import urllib3
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

sns.set_style("whitegrid")

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [35]:
!pip install --upgrade --force urllib3==1.24

Collecting urllib3==1.24
  Using cached urllib3-1.24-py2.py3-none-any.whl (117 kB)
Installing collected packages: urllib3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.26.9
    Uninstalling urllib3-1.26.9:
      Successfully uninstalled urllib3-1.26.9
Successfully installed urllib3-1.24


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
twine 3.8.0 requires urllib3>=1.26.0, but you have urllib3 1.24 which is incompatible.


In [29]:
# Load the LPMC trips and turn every text column into integer codes, so that the
# correlations between the household borough and other variables can be computed below.
df = pd.read_csv('../../data/LPMC/trips.csv')

text_columns = [col for col in df.columns if df[col].dtype == 'object']
for col in text_columns:
    # A fresh encoder per column: the codes are only meaningful within that column.
    df[col] = LabelEncoder().fit_transform(df[col])

In [30]:
# Correlation (pandas default: Pearson) between household borough and age.
# NOTE(review): if hh_borough was a text column it was label-encoded in the previous cell,
# so its integer codes carry no natural order - interpret this coefficient with care.
df['hh_borough'].corr(df['age'])

-0.04244889152136924

In [31]:
# Correlation (pandas default: Pearson) between household borough and household income.
df['hh_borough'].corr(df['hh_income'])

0.014023573969732632

In [32]:
# Correlation (pandas default: Pearson) between household borough and number of household vehicles.
df['hh_borough'].corr(df['hh_vehicles'])

-0.09863736077428918

In [33]:
# Correlation (pandas default: Pearson) between household borough and trip distance.
df['hh_borough'].corr(df['distance'])

-0.04005688927724681

In [34]:
# Correlation (pandas default: Pearson) between household borough and travel mode.
# NOTE(review): travel_mode is presumably a text column that was label-encoded above;
# if so, both sides are arbitrary integer codes - confirm before drawing conclusions.
df['hh_borough'].corr(df['travel_mode'])

0.05255486567358189

In [2]:
# Shared urllib3 connection pool; the per-borough loops below send their NOMIS requests through it.
http = urllib3.PoolManager()

In [3]:
# London borough name -> numeric id passed as the `geography` query parameter of the NOMIS API.
# The keys are also used verbatim to build the per-borough CSV file names
# ('../../data/synthetic/<dataset>/<name>.csv') and the figure file names and titles,
# so their spelling (e.g. 'Hammersmith & Fulham' with '&') must match the files on disk.
regions = {
    'Camden': 1946157246,
    'City of London': 1946157247,
    'Hackney': 1946157248,
    'Haringey': 1946157250,
    'Islington': 1946157251,
    'Kensington and Chelsea': 1946157252,
    'Lambeth': 1946157253,
    'Lewisham': 1946157254,
    'Newham': 1946157255,
    'Southwark': 1946157256,
    'Tower Hamlets': 1946157257,
    'Wandsworth': 1946157258,
    'Westminster': 1946157259,
    'Barking and Dagenham': 1946157260,
    'Barnet': 1946157261,
    'Bexley': 1946157262,
    'Brent': 1946157263,
    'Bromley': 1946157264,
    'Croydon': 1946157265,
    'Ealing': 1946157266,
    'Enfield': 1946157267,
    'Greenwich': 1946157268,
    'Harrow': 1946157269,
    'Havering': 1946157270,
    'Hillingdon': 1946157271,
    'Hounslow': 1946157272,
    'Kingston upon Thames': 1946157273,
    'Merton': 1946157274,
    'Redbridge': 1946157275,
    'Richmond upon Thames': 1946157276,
    'Sutton': 1946157277,
    'Waltham Forest': 1946157278,
    'Hammersmith & Fulham': 1946157249,
}

In [4]:
def compute_stats(freq_list_orig, freq_list_synth):
    """
    Compare two frequency lists and return goodness-of-fit statistics
    (MAE, RMSE, SRMSE, R^2, and Pearson's correlation).

    Both inputs are converted to NumPy arrays and compared position by position:
    entry i of one list is matched with entry i of the other.

    Parameters
    ----------
    freq_list_orig: array-like
        Frequency list for the original (reference) data
    freq_list_synth: array-like
        Frequency list for the synthetic data

    Returns
    -------
    stat: dict
        Dictionary with the keys 'mae', 'rmse', 'r2', 'srmse' and 'corr'.
        'corr' is reported as 0.0 whenever NumPy returns NaN for the correlation.
    """
    reference = np.array(freq_list_orig)
    candidate = np.array(freq_list_synth)

    # Pearson's correlation: off-diagonal entry of the 2x2 correlation matrix
    pearson = np.corrcoef(reference, candidate)[0, 1]
    if np.isnan(pearson):
        pearson = 0.0

    residual = reference - candidate
    reference_mean = reference.mean()

    # Root mean squared error (also reported standardised by the reference mean)
    root_mse = np.linalg.norm(residual) / np.sqrt(len(reference))

    # Coefficient of determination: 1 - SS_res / SS_tot
    ss_res = np.sum((candidate - reference) ** 2)
    ss_tot = np.sum((reference - reference_mean) ** 2)

    return {
        'mae': np.absolute(residual).mean(),
        'rmse': root_mse,
        'r2': 1.0 - ss_res / ss_tot,
        'srmse': root_mse / reference_mean,
        'corr': pearson,
    }

# Names of the statistics returned by compute_stats, in the order they are reported.
stats_str = ['mae', 'rmse', 'r2', 'srmse', 'corr']

# Data sources shown in the bar charts; labels[i] is drawn with colrs[i].
labels = ['NOMIS', 'LPMC', 'DATGAN', 'ciDATGAN']
colrs = ['black', 'lightgrey', 'darkgrey', 'red']

# RGBA tuples for the named colours above.
colors = list(map(matplotlib.colors.to_rgba, colrs))

## Age

In [5]:
# Upper edges of the age groups. pd.cut closes intervals on the right by default,
# so for integer ages the groups are: <=16, 17-30, 31-45, 46-65 and >65.
bins = [-np.inf, 16, 30, 45, 65, np.inf]
bin_labels = ['child', 'young', 'adult', 'old', 'senior']

# exist_ok avoids the check-then-create race and keeps the cell safely re-runnable.
os.makedirs('../../figures/age/', exist_ok=True)

In [6]:
# Per-borough comparison of the age distribution: census (NOMIS) against the
# oversampled LPMC, rejection-sampled DATGAN and ciDATGAN datasets.

def _age_shares(csv_path):
    """Return the share of rows of the CSV at `csv_path` falling in each age group of `bin_labels`."""
    grouped = pd.cut(pd.read_csv(csv_path)['age'], bins=bins, labels=bin_labels)
    counts = grouped.value_counts()
    # Look every group up by label so the result always has one entry per group, in
    # `bin_labels` order (compute_stats compares the lists position by position).
    shares = pd.Series([counts.get(group, 0) for group in bin_labels], index=bin_labels, dtype=float)
    return shares / shares.sum()


# res[dataset][statistic] -> list with one value per borough, in `regions` order
res = {name: {s: [] for s in stats_str} for name in ('ciDATGAN', 'LPMC', 'DATGAN')}

for r in tqdm(regions):

    # Nomis data, parsed straight from memory (no temporary file left on disk)
    response = http.request("GET", "https://www.nomisweb.co.uk/api/v01/dataset/NM_503_1.data.csv?geography={}&rows=c_age&cols=rural_urban&measures=20100".format(regions[r]))
    if response.status != 200:
        raise RuntimeError('NOMIS request for {} failed with HTTP status {}'.format(r, response.status))
    census = pd.read_csv(io.StringIO(response.data.decode('utf-8')))

    # Sort by age code and drop the first row.
    # NOTE(review): the slices below assume the dropped row is an "all ages" total and the
    # remaining rows are single years of age starting at 0, so that they line up with
    # `bins` (<=16, 17-30, 31-45, 46-65, >65) - confirm against the NM_503_1 layout.
    obs = census.sort_values('C_AGE')['GROUP 1: OBS_VALUE'].iloc[1:]
    nomis = pd.Series(
        [obs.iloc[:17].sum(), obs.iloc[17:31].sum(), obs.iloc[31:46].sum(), obs.iloc[46:66].sum(), obs.iloc[66:].sum()],
        index=bin_labels, name='age', dtype=float)
    nomis = nomis / nomis.sum()

    # Oversampled LPMC data, rejection sampling DATGAN, and ciDATGAN
    lpmc = _age_shares('../../data/synthetic/LPMC/{}.csv'.format(r))
    datgan = _age_shares('../../data/synthetic/DATGAN/{}.csv'.format(r))
    cidatgan = _age_shares('../../data/synthetic/ciDATGAN/{}.csv'.format(r))

    # Histogram: one group of bars per age group, one bar per data source
    probs = [nomis, lpmc, datgan, cidatgan]
    shares_df = pd.DataFrame({l: p.to_numpy() for l, p in zip(labels, probs)}, index=bin_labels)

    shares_df.plot(figsize=(10,7), kind='bar', color=colors)

    plt.legend()
    plt.xticks(rotation=45)
    plt.title('Age distribution for {}'.format(r))

    plt.savefig('../../figures/age/{}.png'.format(r), bbox_inches='tight')
    plt.close()

    # Compute the stats of each dataset against the census
    for name, synth in (('LPMC', lpmc), ('DATGAN', datgan), ('ciDATGAN', cidatgan)):
        stats = compute_stats(nomis, synth)
        for s in stats_str:
            res[name][s].append(stats[s])

100%|██████████| 33/33 [01:22<00:00,  2.51s/it]


In [7]:
# One violin plot per statistic: its distribution over the boroughs, one violin per dataset.
datasets = ['LPMC', 'DATGAN', 'ciDATGAN']

for s in stats_str:
    # Rows are datasets, columns are boroughs; transposed below so each dataset is one violin.
    df = pd.DataFrame([res[name][s] for name in datasets], index=datasets)

    plt.figure(figsize=(10,7))
    # colors[0] belongs to NOMIS, which is the reference and therefore not plotted here.
    sns.violinplot(data=df.T, palette=colors[1:])

    plt.xticks([0,1,2], datasets)
    plt.ylabel(s.upper())

    # Saved both as a raster and as a vector image.
    plt.savefig('../../figures/age/age_{}.png'.format(s), bbox_inches='tight')
    plt.savefig('../../figures/age/age_{}.pdf'.format(s), bbox_inches='tight')
    plt.close()

## Number of people in household

In [8]:
# exist_ok avoids the check-then-create race and keeps the cell safely re-runnable.
os.makedirs('../../figures/hh_people/', exist_ok=True)

In [9]:
# Per-borough comparison of the household-size distribution: census (NOMIS) against the
# oversampled LPMC, rejection-sampled DATGAN and ciDATGAN datasets.

def _hh_people_shares(csv_path, n_bins):
    """
    Return the share of rows of the CSV at `csv_path` per household size.

    Bin k (0-based) holds households of k+1 people; sizes larger than `n_bins`
    are pooled into the last bin. Raises ValueError for sizes below 1, which
    would otherwise silently wrap around to the last bin through a negative index.
    """
    counts = pd.read_csv(csv_path)['hh_people'].value_counts()
    shares = np.zeros(n_bins)
    for size, count in counts.items():
        if size < 1:
            raise ValueError('Invalid hh_people value {} in {}'.format(size, csv_path))
        shares[min(int(size), n_bins) - 1] += count
    return shares / shares.sum()


# res[dataset][statistic] -> list with one value per borough, in `regions` order
res = {name: {s: [] for s in stats_str} for name in ('ciDATGAN', 'LPMC', 'DATGAN')}

for r in tqdm(regions):

    # Nomis data, parsed straight from memory (no temporary file left on disk)
    url = 'https://www.nomisweb.co.uk/api/v01/dataset/NM_538_1.data.csv?geography={}&rows=cell&cols=rural_urban&measures=20100'.format(regions[r])
    response = http.request('GET', url)
    if response.status != 200:
        raise RuntimeError('NOMIS request for {} failed with HTTP status {}'.format(r, response.status))
    census = pd.read_csv(io.StringIO(response.data.decode('utf-8')))

    # Drop the first row and normalise the rest.
    # NOTE(review): assumes the dropped row is the total and the remaining rows are the
    # household sizes 1, 2, 3, ... in that order - confirm against the NM_538_1 layout.
    nomis = census['GROUP 1: OBS_VALUE'].iloc[1:].to_numpy(dtype=float)
    nomis = nomis / nomis.sum()

    # Oversampled LPMC data, rejection sampling DATGAN, and ciDATGAN
    lpmc = _hh_people_shares('../../data/synthetic/LPMC/{}.csv'.format(r), len(nomis))
    datgan = _hh_people_shares('../../data/synthetic/DATGAN/{}.csv'.format(r), len(nomis))
    cidatgan = _hh_people_shares('../../data/synthetic/ciDATGAN/{}.csv'.format(r), len(nomis))

    # Histogram: x axis is the household size (1-based), one bar per data source
    probs = [nomis, lpmc, datgan, cidatgan]
    shares_df = pd.DataFrame(dict(zip(labels, probs)), index=range(1, len(nomis)+1))

    shares_df.plot(figsize=(10,7), kind='bar', color=colors)

    plt.legend()
    plt.xticks(rotation=45)
    plt.title('Distribution of people per household for {}'.format(r))

    plt.savefig('../../figures/hh_people/{}.png'.format(r), bbox_inches='tight')
    plt.close()

    # Compute the stats of each dataset against the census
    for name, synth in (('LPMC', lpmc), ('DATGAN', datgan), ('ciDATGAN', cidatgan)):
        stats = compute_stats(nomis, synth)
        for s in stats_str:
            res[name][s].append(stats[s])

100%|██████████| 33/33 [00:57<00:00,  1.73s/it]


In [10]:
# One violin plot per statistic: its distribution over the boroughs, one violin per dataset.
datasets = ['LPMC', 'DATGAN', 'ciDATGAN']

for s in stats_str:
    # Rows are datasets, columns are boroughs; transposed below so each dataset is one violin.
    df = pd.DataFrame([res[name][s] for name in datasets], index=datasets)

    plt.figure(figsize=(10,7))
    # colors[0] belongs to NOMIS, which is the reference and therefore not plotted here.
    sns.violinplot(data=df.T, palette=colors[1:])

    plt.xticks([0,1,2], datasets)
    plt.ylabel(s.upper())

    # Saved both as a raster and as a vector image.
    plt.savefig('../../figures/hh_people/hh_people_{}.png'.format(s), bbox_inches='tight')
    plt.savefig('../../figures/hh_people/hh_people_{}.pdf'.format(s), bbox_inches='tight')
    plt.close()

## Distance travelled to work

In [11]:
# Upper edges of the distance bands; the labels show the edges are expressed in metres.
# pd.cut closes intervals on the right by default, so e.g. exactly 2000 falls in '<2km'.
bins = [-np.inf, 2000, 5000, 10000, 20000, 30000, 40000, 60000, np.inf]
bin_labels = ['<2km', '2-5km', '5-10km', '10-20km', '20-30km', '30-40km', '40-60km', '>60km']

# exist_ok avoids the check-then-create race and keeps the cell safely re-runnable.
os.makedirs('../../figures/dist_work/', exist_ok=True)

In [12]:
# Per-borough comparison of the distance travelled to work: census (NOMIS) against the
# oversampled LPMC, rejection-sampled DATGAN and ciDATGAN datasets.

def _distance_shares(csv_path):
    """
    Return the share of commuting trips of the CSV at `csv_path` in each distance band of `bin_labels`.

    Only rows with purpose 'HBW' made by people aged 16 to 74 (inclusive) are counted.
    """
    trips = pd.read_csv(csv_path)
    commutes = trips[(trips['purpose'] == 'HBW') & (trips['age'] >= 16) & (trips['age'] <= 74)]
    counts = pd.cut(commutes['distance'], bins=bins, labels=bin_labels).value_counts()
    # Look every band up by label so the result always has one entry per band, in
    # `bin_labels` order (compute_stats compares the lists position by position).
    shares = pd.Series([counts.get(band, 0) for band in bin_labels], index=bin_labels, dtype=float)
    return shares / shares.sum()


# res[dataset][statistic] -> list with one value per borough, in `regions` order
res = {name: {s: [] for s in stats_str} for name in ('ciDATGAN', 'LPMC', 'DATGAN')}

for r in tqdm(regions):

    # Nomis data, parsed straight from memory (no temporary file left on disk)
    url = 'https://www.nomisweb.co.uk/api/v01/dataset/NM_153_1.data.csv?geography={}&rows=cell&cols=rural_urban&measures=20100'.format(regions[r])
    response = http.request('GET', url)
    if response.status != 200:
        raise RuntimeError('NOMIS request for {} failed with HTTP status {}'.format(r, response.status))
    census = pd.read_csv(io.StringIO(response.data.decode('utf-8')))

    # Sort by cell id, skip the first row and keep the next eight.
    # NOTE(review): assumes those eight rows are the distance bands in the same order
    # as `bin_labels` - confirm against the NM_153_1 layout.
    obs = census.sort_values('CELL')['GROUP 1: OBS_VALUE'].iloc[1:9]
    nomis = pd.Series(obs.to_numpy(dtype=float), index=bin_labels)
    nomis = nomis / nomis.sum()

    # Oversampled LPMC data, rejection sampling DATGAN, and ciDATGAN
    lpmc = _distance_shares('../../data/synthetic/LPMC/{}.csv'.format(r))
    datgan = _distance_shares('../../data/synthetic/DATGAN/{}.csv'.format(r))
    cidatgan = _distance_shares('../../data/synthetic/ciDATGAN/{}.csv'.format(r))

    # Histogram: one group of bars per distance band, one bar per data source
    probs = [nomis, lpmc, datgan, cidatgan]
    shares_df = pd.DataFrame({l: p.to_numpy() for l, p in zip(labels, probs)}, index=bin_labels)

    shares_df.plot(figsize=(10,7), kind='bar', color=colors)

    plt.legend()
    plt.xticks(rotation=45)
    plt.title('Distribution for the distance to work for {}'.format(r))

    plt.savefig('../../figures/dist_work/{}.png'.format(r), bbox_inches='tight')
    plt.close()

    # Compute the stats of each dataset against the census
    for name, synth in (('LPMC', lpmc), ('DATGAN', datgan), ('ciDATGAN', cidatgan)):
        stats = compute_stats(nomis, synth)
        for s in stats_str:
            res[name][s].append(stats[s])

100%|██████████| 33/33 [01:01<00:00,  1.86s/it]


In [13]:
# One violin plot per statistic: its distribution over the boroughs, one violin per dataset.
datasets = ['LPMC', 'DATGAN', 'ciDATGAN']

for s in stats_str:
    # Rows are datasets, columns are boroughs; transposed below so each dataset is one violin.
    df = pd.DataFrame([res[name][s] for name in datasets], index=datasets)

    plt.figure(figsize=(10,7))
    # colors[0] belongs to NOMIS, which is the reference and therefore not plotted here.
    sns.violinplot(data=df.T, palette=colors[1:])

    plt.xticks([0,1,2], datasets)
    plt.ylabel(s.upper())

    # Saved both as a raster and as a vector image.
    plt.savefig('../../figures/dist_work/dist_work_{}.png'.format(s), bbox_inches='tight')
    plt.savefig('../../figures/dist_work/dist_work_{}.pdf'.format(s), bbox_inches='tight')
    plt.close()

## Method of travel to work

In [14]:
# Travel modes compared in this section ('pt' presumably stands for public transport).
order = ['pt', 'drive', 'walk', 'cycle']

# exist_ok avoids the check-then-create race and keeps the cell safely re-runnable.
os.makedirs('../../figures/mode_work/', exist_ok=True)

In [15]:
# Per-borough comparison of the mode of travel to work: census (NOMIS) against the
# oversampled LPMC, rejection-sampled DATGAN and ciDATGAN datasets.

# Common (alphabetical) ordering of the modes. Every series is reindexed to it because
# compute_stats compares the lists position by position.
mode_index = sorted(order)


def _mode_shares(csv_path):
    """
    Return the share of commuting trips of the CSV at `csv_path` made with each mode of `order`.

    Only rows with purpose 'HBW' made by people aged 16 to 74 (inclusive) are counted.
    Modes absent from the file get a share of 0, so the result always has one entry per
    mode. Raises ValueError if the file contains a mode that is not listed in `order`.
    """
    trips = pd.read_csv(csv_path)
    commutes = trips[(trips['purpose'] == 'HBW') & (trips['age'] >= 16) & (trips['age'] <= 74)]
    counts = commutes['travel_mode'].value_counts()
    unexpected = sorted(set(counts.index) - set(order))
    if unexpected:
        raise ValueError('Unexpected travel modes in {}: {}'.format(csv_path, unexpected))
    counts = counts.reindex(mode_index, fill_value=0)
    return counts / counts.sum()


# res[dataset][statistic] -> list with one value per borough, in `regions` order
res = {name: {s: [] for s in stats_str} for name in ('ciDATGAN', 'LPMC', 'DATGAN')}

for r in tqdm(regions):

    # Nomis data, parsed straight from memory (no temporary file left on disk)
    url = 'https://www.nomisweb.co.uk/api/v01/dataset/NM_568_1.data.csv?geography={}&rows=cell&cols=rural_urban&measures=20100'.format(regions[r])
    response = http.request('GET', url)
    if response.status != 200:
        raise RuntimeError('NOMIS request for {} failed with HTTP status {}'.format(r, response.status))
    census = pd.read_csv(io.StringIO(response.data.decode('utf-8')))

    obs = census.sort_values('CELL').set_index('CELL')['GROUP 1: OBS_VALUE']

    # The grouped modes are taken by row position, the single modes by CELL label, exactly
    # as before; the two agree when the CELL ids run 0, 1, 2, ...
    # NOTE(review): which census categories sit in rows 2-5 (pt), 6-8 (drive), 9 (cycle)
    # and 10 (walk) is not visible here - confirm against the NM_568_1 layout.
    nomis = pd.Series({
        'pt': obs.iloc[2:6].sum(),
        'drive': obs.iloc[6:9].sum(),
        'walk': obs.loc[10],
        'cycle': obs.loc[9],
    }, dtype=float).reindex(mode_index)
    nomis = nomis / nomis.sum()

    # Oversampled LPMC data, rejection sampling DATGAN, and ciDATGAN
    lpmc = _mode_shares('../../data/synthetic/LPMC/{}.csv'.format(r))
    datgan = _mode_shares('../../data/synthetic/DATGAN/{}.csv'.format(r))
    cidatgan = _mode_shares('../../data/synthetic/ciDATGAN/{}.csv'.format(r))

    # Histogram: one group of bars per mode, one bar per data source
    probs = [nomis, lpmc, datgan, cidatgan]
    shares_df = pd.DataFrame({l: p.to_numpy() for l, p in zip(labels, probs)}, index=mode_index)

    shares_df.plot(figsize=(10,7), kind='bar', color=colors)

    plt.legend()
    plt.xticks(rotation=45)
    plt.title('Distribution for mode of travel to work for {}'.format(r))

    plt.savefig('../../figures/mode_work/{}.png'.format(r), bbox_inches='tight')
    plt.close()

    # Compute the stats of each dataset against the census
    for name, synth in (('LPMC', lpmc), ('DATGAN', datgan), ('ciDATGAN', cidatgan)):
        stats = compute_stats(nomis, synth)
        for s in stats_str:
            res[name][s].append(stats[s])

100%|██████████| 33/33 [01:00<00:00,  1.82s/it]


In [16]:
# One violin plot per statistic: its distribution over the boroughs, one violin per dataset.
datasets = ['LPMC', 'DATGAN', 'ciDATGAN']

for s in stats_str:
    # Rows are datasets, columns are boroughs; transposed below so each dataset is one violin.
    df = pd.DataFrame([res[name][s] for name in datasets], index=datasets)

    plt.figure(figsize=(10,7))
    # colors[0] belongs to NOMIS, which is the reference and therefore not plotted here.
    sns.violinplot(data=df.T, palette=colors[1:])

    plt.xticks([0,1,2], datasets)
    plt.ylabel(s.upper())

    # Saved both as a raster and as a vector image.
    plt.savefig('../../figures/mode_work/mode_work_{}.png'.format(s), bbox_inches='tight')
    plt.savefig('../../figures/mode_work/mode_work_{}.pdf'.format(s), bbox_inches='tight')
    plt.close()

## Number of cars in the household

In [17]:
# exist_ok avoids the check-then-create race and keeps the cell safely re-runnable.
os.makedirs('../../figures/hh_vehicles/', exist_ok=True)

In [18]:
# Per-borough comparison of the number of household vehicles: census (NOMIS) against the
# oversampled LPMC, rejection-sampled DATGAN and ciDATGAN datasets.

def _hh_vehicles_shares(csv_path, n_bins):
    """
    Return the share of rows of the CSV at `csv_path` per number of household vehicles.

    Bin k (0-based) holds households with k vehicles; counts of `n_bins - 1` vehicles or
    more are pooled into the last bin. The previous 1-based indexing (`i-1`) sent
    0-vehicle households to index -1, i.e. the last bin, and shifted every other count
    by one category. Raises ValueError for negative values.
    """
    counts = pd.read_csv(csv_path)['hh_vehicles'].value_counts()
    shares = np.zeros(n_bins)
    for vehicles, count in counts.items():
        if vehicles < 0:
            raise ValueError('Invalid hh_vehicles value {} in {}'.format(vehicles, csv_path))
        shares[min(int(vehicles), n_bins - 1)] += count
    return shares / shares.sum()


# res[dataset][statistic] -> list with one value per borough, in `regions` order
res = {name: {s: [] for s in stats_str} for name in ('ciDATGAN', 'LPMC', 'DATGAN')}

for r in tqdm(regions):

    # Nomis data, parsed straight from memory (no temporary file left on disk)
    url = 'https://www.nomisweb.co.uk/api/v01/dataset/NM_621_1.data.csv?geography={}&rows=cell&cols=rural_urban&measures=20100'.format(regions[r])
    response = http.request('GET', url)
    if response.status != 200:
        raise RuntimeError('NOMIS request for {} failed with HTTP status {}'.format(r, response.status))
    census = pd.read_csv(io.StringIO(response.data.decode('utf-8')))

    # Skip the first row and keep the next five.
    # NOTE(review): the binning above assumes these five rows are "no vehicle", 1, 2, 3 and
    # "4 or more" vehicles, in that order - confirm against the NM_621_1 layout.
    nomis = census['GROUP 1: OBS_VALUE'].iloc[1:6].to_numpy(dtype=float)
    nomis = nomis / nomis.sum()

    # Oversampled LPMC data, rejection sampling DATGAN, and ciDATGAN
    lpmc = _hh_vehicles_shares('../../data/synthetic/LPMC/{}.csv'.format(r), len(nomis))
    datgan = _hh_vehicles_shares('../../data/synthetic/DATGAN/{}.csv'.format(r), len(nomis))
    cidatgan = _hh_vehicles_shares('../../data/synthetic/ciDATGAN/{}.csv'.format(r), len(nomis))

    # Histogram: x axis is the number of vehicles (0-based), one bar per data source
    probs = [nomis, lpmc, datgan, cidatgan]
    shares_df = pd.DataFrame(dict(zip(labels, probs)), index=range(len(nomis)))

    shares_df.plot(figsize=(10,7), kind='bar', color=colors)

    plt.legend()
    plt.xticks(rotation=45)
    plt.title('Distribution of vehicles per household for {}'.format(r))

    plt.savefig('../../figures/hh_vehicles/{}.png'.format(r), bbox_inches='tight')
    plt.close()

    # Compute the stats of each dataset against the census
    for name, synth in (('LPMC', lpmc), ('DATGAN', datgan), ('ciDATGAN', cidatgan)):
        stats = compute_stats(nomis, synth)
        for s in stats_str:
            res[name][s].append(stats[s])

100%|██████████| 33/33 [00:55<00:00,  1.68s/it]


In [19]:
# One violin plot per statistic: its distribution over the boroughs, one violin per dataset.
datasets = ['LPMC', 'DATGAN', 'ciDATGAN']

for s in stats_str:
    # Rows are datasets, columns are boroughs; transposed below so each dataset is one violin.
    df = pd.DataFrame([res[name][s] for name in datasets], index=datasets)

    plt.figure(figsize=(10,7))
    # colors[0] belongs to NOMIS, which is the reference and therefore not plotted here.
    sns.violinplot(data=df.T, palette=colors[1:])

    plt.xticks([0,1,2], datasets)
    plt.ylabel(s.upper())

    # Saved both as a raster and as a vector image.
    plt.savefig('../../figures/hh_vehicles/hh_vehicles_{}.png'.format(s), bbox_inches='tight')
    plt.savefig('../../figures/hh_vehicles/hh_vehicles_{}.pdf'.format(s), bbox_inches='tight')
    plt.close()