# Capita

Technical Challenge for Data Science Candidates

In [36]:
import numpy as np
import pandas as pd
import math
import json

from pyjstat import pyjstat
from os import path

import scipy.stats as st
import statsmodels as sm
import statsmodels.api as smi

import matplotlib
from cycler import cycler
import matplotlib.pyplot as plt

# this is the local Utility module
# import rvlt

%load_ext autoreload
%autoreload 2

pd.__version__

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'0.24.2'

In [38]:
# If you turn this feature on, you can display each result as it happens.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [39]:
fl2 = 'stats.pickle'
path.exists(fl2)

True

In [53]:
# There's a future warning - minor change made in pyjstat
if not path.exists(fl2):
    fl1 = 'https://www.nomisweb.co.uk/api/v01/dataset/NM_31_1.jsonstat.json'
    dataset = pyjstat.Dataset.read(fl1)
    df = dataset.write('dataframe')
    df.to_pickle(fl2)
else:
    df = pd.read_pickle(fl2)

In [109]:
## Categories can be simpler
df.info()
df[df.select_dtypes(['object']).columns] = df.select_dtypes(['object']).apply(lambda x: x.astype('category'))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22800 entries, 0 to 22799
Data columns (total 6 columns):
date         22800 non-null category
geography    22800 non-null category
sex          22800 non-null category
age          22800 non-null category
measures     22800 non-null category
value        20358 non-null float64
dtypes: category(5), float64(1)
memory usage: 292.3 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22800 entries, 0 to 22799
Data columns (total 6 columns):
date         22800 non-null category
geography    22800 non-null category
sex          22800 non-null category
age          22800 non-null category
measures     22800 non-null category
value        20358 non-null float64
dtypes: category(5), float64(1)
memory usage: 292.3 KB


In [118]:
# Some filtering
# Most recent year
# df[df.date == max(df.date)]
df1 = df[ (df.date == max(df.date)) & df.age.str.match('All ages') & df.measures.str.match('Value') ]
df1

Unnamed: 0,date,geography,sex,age,measures,value
22200,2018,Wales,Male,All ages,Value,1547300.0
22250,2018,Wales,Female,All ages,Value,1591300.0
22300,2018,Wales,Total,All ages,Value,3138600.0
22350,2018,Scotland,Male,All ages,Value,2648800.0
22400,2018,Scotland,Female,All ages,Value,2789300.0
22450,2018,Scotland,Total,All ages,Value,5438100.0
22500,2018,Northern Ireland,Male,All ages,Value,926200.0
22550,2018,Northern Ireland,Female,All ages,Value,955400.0
22600,2018,Northern Ireland,Total,All ages,Value,1881600.0
22650,2018,England and Wales,Male,All ages,Value,29215300.0


In [119]:
# Make a deep copy of a few columns
df1 = df1[['geography', 'sex', 'value']].copy(deep=True)

In [120]:
# England and Wales includes Wales, which we have separately
# Get England and Wales and subtract the value for Wales
# and re-label

df2 = df1[df1.geography.str.match('England and Wales')].copy() # because this is view
df2

v = df1[ df1.geography.str.match('England and Wales')]['value'].values - df1[ df1.geography.str.match('Wales')]['value'].values
df2['value'] = v
df2['geography'] = 'England'
df2

Unnamed: 0,geography,sex,value
22650,England and Wales,Male,29215300.0
22700,England and Wales,Female,29900600.0
22750,England and Wales,Total,59115800.0


Unnamed: 0,geography,sex,value
22650,England,Male,27668000.0
22700,England,Female,28309300.0
22750,England,Total,55977200.0


In [121]:
## Append that table to the original with England and Wales removed
df3 = df1[ ~( df1.geography.str.match('England and Wales') ) ].append(df2).copy(deep=True).reset_index()
del df1, df2
df3

Unnamed: 0,geography,sex,value
22200,Wales,Male,1547300.0
22250,Wales,Female,1591300.0
22300,Wales,Total,3138600.0
22350,Scotland,Male,2648800.0
22400,Scotland,Female,2789300.0
22450,Scotland,Total,5438100.0
22500,Northern Ireland,Male,926200.0
22550,Northern Ireland,Female,955400.0
22600,Northern Ireland,Total,1881600.0
22650,England,Male,27668000.0


In [130]:
df4 = df3.groupby(['sex']).sum().reset_index()
df4['geography'] = 'UK'
df4

Unnamed: 0,sex,value,geography
0,Female,33645300.0,UK
1,Male,32790300.0,UK
2,Total,66435500.0,UK


In [129]:
df3.append(df4)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,geography,sex,value
22200,Wales,Male,1547300.0
22250,Wales,Female,1591300.0
22300,Wales,Total,3138600.0
22350,Scotland,Male,2648800.0
22400,Scotland,Female,2789300.0
22450,Scotland,Total,5438100.0
22500,Northern Ireland,Male,926200.0
22550,Northern Ireland,Female,955400.0
22600,Northern Ireland,Total,1881600.0
22650,England,Male,27668000.0
