In [1]:
import pandas as pd

In [2]:
# Load Wave 6 (Jan 2014 - Jun 2016) of UKHLS Understanding Society, the most recent wave to include travel behaviour.
# f_indresp holds the adult individual responses and is read as a tab-separated table.
data = pd.read_csv('../data/ukhls_w6/f_indresp.tab', sep='\t')

In [3]:
# The raw table is very large, so the cells below cut it down to the rows and columns that are needed.
data.shape # (rows, columns) of the raw table: 45,294 rows, 2,056 columns.

(45294, 2056)

## Clean and Prepare Understanding Society Data

In [4]:
# Restrict responses to people in London (f_gor_dv == 7); 7,071 respondents remain
in_london = data['f_gor_dv'] == 7
data = data.loc[in_london]
data.shape

(7071, 2056)

In [440]:
# Now we want to cut down the columns to the specifics we want for the ABM

# Agent demographics:
# Age - 16-74 in bands - 16-24; 25-34; 35-44; 45-54; 55-64; 65-74. (f_age_dv)
# Sex - Female, Male (f_sex)
# Ethnicity - Asian, Black, Mixed, Other, White (f_racel_dv)

# Agent socio-economics
# In paid employment - No, Yes (f_employ) and by categories (f_jbstat)
# Mode of transport for journey to work (f_worktrav - for paid employment; f_jsworktrav - for self employed)
# Distance from work (f_workdis)

# Agent Car Behaviour
# Has use of car or van - No, Yes (f_caruse)
# Has a driving licence - No, Yes (f_drive)

# Agent Cycle Behaviour
# Frequency of using a bicycle (f_trbikefq) - if any frequency then owns bike
# Check f_ynotbike1,2,3,96 to rule out people who don't own or cannot ride a bicycle.

In [5]:
# Columns carried forward: identifiers, demographics, employment, commute, car and bicycle variables
keep_cols = [
    'pidp', 'f_hidp',
    'f_age_dv', 'f_sex', 'f_racel_dv',
    'f_employ', 'f_jbstat',
    'f_worktrav', 'f_jbpl', 'f_jsworktrav', 'f_jspl', 'f_workdis',
    'f_caruse', 'f_drive',
    'f_trbikefq', 'f_ynotbike1', 'f_ynotbike2', 'f_ynotbike3', 'f_ynotbike96',
]
data = data[keep_cols]

In [6]:
# Keep ages 16 to 74 inclusive. This excludes participants who are <16 (n=3), >=75 (n=358)
# or for whom age is missing (n=24), leaving 6,686 individuals.
age = data['f_age_dv']
data = data[age.ge(16) & age.lt(75)]

In [7]:
# Drop the single participant coded -1 ('don't know') for sex, leaving 6,685 individuals
data = data[data['f_sex'] != -1]

In [8]:
# Drop individuals whose ethnic group is missing (code -9; n=327).
# This gives a dataset of 6,387 individuals.
# NOTE(review): 6,685 - 327 = 6,358, not 6,387 - one of the two figures quoted here looks stale; confirm.
data = data[data['f_racel_dv'] != -9]

In [9]:
# f_employ (in paid employment) and f_jbstat (current employment situation) are used together
# so that proxy respondents are not lost.
# Only drop people with no usable information on either: f_employ in (-2, -1) AND f_jbstat == -2 (n=5).
employ_unknown = data['f_employ'].isin([-2,-1])
jbstat_unknown = data['f_jbstat'] == -2
data = data[~(employ_unknown & jbstat_unknown)]
# Everyone remaining can be classified by economic activity (n=6,382)

In [10]:
# Journey-to-work mode: drop refusals (n=5), don't knows (n=3) and
# 'other' modes (code 97; n=13), since it is not clear what those are.
# This gives a dataset of 6,361 individuals.
worktrav_drop = [-1,-2,97]
worktrav_unusable = data['f_worktrav'].isin(worktrav_drop)
data = data[~worktrav_unusable]

In [11]:
# Self-employed journey-to-work mode: drop refusals (n=1), don't knows (n=5) and
# 'other' modes (code 97; n=6), since it is not clear what those are.
# This gives a dataset of 6,349 individuals.
jsworktrav_drop = [-1,-2,97]
jsworktrav_unusable = data['f_jsworktrav'].isin(jsworktrav_drop)
data = data[~jsworktrav_unusable]

In [12]:
# Car use: drop don't know (-1; n=5) and refusal (-2; n=1)
caruse_answered = ~data['f_caruse'].isin([-1,-2])
data = data[caruse_answered]
# Driving licence: drop don't know (-1; n=1) and refusal (-2; n=4)
drive_answered = ~data['f_drive'].isin([-1,-2])
data = data[drive_answered]
# 16 year olds are ineligible to drive or to hold a driving licence, so rows that are
# missing/inapplicable on both car use and licence are dropped unless the person is 16 (n=11).
caruse_inapplicable = data['f_caruse'] == -8
drive_unavailable = data['f_drive'].isin([-9,-8])
older_than_16 = data['f_age_dv'] > 16
data = data[~(caruse_inapplicable & drive_unavailable & older_than_16)]
# This gives a dataset of 6,327 individuals

In [13]:
# Frequency of using a bicycle: drop don't know (-1; n=8) and refusal (-2; n=5)
bikefq_answered = ~data['f_trbikefq'].isin([-1,-2])
data = data[bikefq_answered]
# Reason for not using a bicycle: drop don't know (-1; n=3) and refusal (-2; n=1)
ynotbike_answered = ~data['f_ynotbike1'].isin([-1,-2])
data = data[ynotbike_answered]
# This gives a dataset of 6,310 individuals

In [14]:
# Reindex new dataset.
# Rebind `data` to the frame returned by reset_index rather than mutating in place:
# the result is a new DataFrame with a fresh 0..n-1 index instead of a filtered slice,
# so the derived columns assigned in later cells do not trigger SettingWithCopyWarning.
data = data.reset_index(drop=True)

## Derived Variables

In [15]:
# Recode sex: code 1 becomes 'm'; every other code still present becomes 'f'
data['sex'] = data['f_sex'].apply(lambda x: 'm' if x == 1 else 'f')
# Frequency table with explicitly named columns. Naming them directly (instead of
# renaming merge suffixes such as 'sex_x'/'sex_y') keeps the headers correct whatever
# name the installed pandas version gives the value_counts result.
sex_counts = data['sex'].value_counts()
sex_freq = sex_counts.to_frame('Frequency').assign(Proportion=sex_counts / sex_counts.sum())
sex_freq

Unnamed: 0,Frequency,Proportion
f,3461,0.548494
m,2849,0.451506


In [16]:
# Age bands: 16-24; 25-34; 35-44; 45-54; 55-64; 65-74
def band(a):
    """Return the age-band label for age `a`.

    Bands are chosen by their exclusive upper bound only: anything below 25 is
    labelled "16-24" and anything from 65 upwards is labelled "65-74". No lower
    or upper age limit is checked here.
    """
    upper_bounds = (
        (25, "16-24"),
        (35, "25-34"),
        (45, "35-44"),
        (55, "45-54"),
        (65, "55-64"),
    )
    for limit, label in upper_bounds:
        if a < limit:
            return label
    return "65-74"

# Assign each respondent to an age band
data['age_6cat'] = data['f_age_dv'].apply(band)
# Frequency table with explicitly named columns. Naming them directly (instead of
# renaming merge suffixes such as 'age_6cat_x'/'age_6cat_y') keeps the headers correct
# whatever name the installed pandas version gives the value_counts result.
age_counts = data['age_6cat'].value_counts()
age_freq = age_counts.to_frame('Frequency').assign(Proportion=age_counts / age_counts.sum())
age_freq.sort_index()

Unnamed: 0,Frequency,Proportion
16-24,1108,0.175594
25-34,1254,0.198732
35-44,1371,0.217274
45-54,1277,0.202377
55-64,778,0.123296
65-74,522,0.082726


In [17]:
# Ethnicity recode: collapse the f_racel_dv codes into five groups
eth = {1:'White', 2:'White', 3:'White', 4:'White', 5:'Mixed', 6:'Mixed', 7:'Mixed', 8:'Mixed', 9:'Asian', 10:'Asian',
       11:'Asian', 12:'Asian', 13:'Asian', 14:'Black', 15:'Black', 16:'Black', 17:'Other', 97:'Other'}

# Direct dictionary lookup: a code that is not in `eth` raises KeyError rather than passing through unnoticed
data['eth_5cat'] = data['f_racel_dv'].apply(lambda x: eth[x])
# Frequency table with explicitly named columns. Naming them directly (instead of
# renaming merge suffixes such as 'eth_5cat_x'/'eth_5cat_y') keeps the headers correct
# whatever name the installed pandas version gives the value_counts result.
eth_counts = data['eth_5cat'].value_counts()
eth_freq = eth_counts.to_frame('Frequency').assign(Proportion=eth_counts / eth_counts.sum())
eth_freq

Unnamed: 0,Frequency,Proportion
White,2383,0.377655
Asian,1930,0.305864
Black,1394,0.220919
Mixed,362,0.057369
Other,241,0.038193


In [18]:
# Employment recode - paid employment (ft/pt); Self-employment, and other options including students/in training.
# Self-employment has to be separated out as they don't answer question about work travel.
def employ(e,j):
    """Classify a respondent from f_employ (`e`) and f_jbstat (`j`).

    Returns "Employed", "SelfEmployed", "NotEmployed" or "missing".

    `j` decides the class whenever it is a substantive status (2 -> Employed,
    1 -> SelfEmployed, any of the other listed statuses -> NotEmployed). When
    `j` is -2 or -1 the answer falls back on `e` (1 -> Employed,
    2 -> NotEmployed). Any other combination is "missing".
    """
    missing_employ_codes = (-9,-8,-7,-2,-1)
    not_working_statuses = (3,4,5,6,7,8,9,10,11,97)
    unknown_statuses = (-2,-1)

    # An f_employ value outside the recognised codes is never classified.
    if not (e == 1 or e == 2 or e in missing_employ_codes):
        return "missing"

    if j == 2:
        return "Employed"
    if j == 1:
        return "SelfEmployed"
    if j in not_working_statuses:
        return "NotEmployed"

    # Job status unknown: only a definite yes/no on f_employ can settle it.
    if j in unknown_statuses:
        if e == 1:
            return "Employed"
        if e == 2:
            return "NotEmployed"
    return "missing"

# Classify each respondent row-wise from the (f_employ, f_jbstat) pair
data['employ'] = data[['f_employ','f_jbstat']].apply(lambda x: employ(x['f_employ'],x['f_jbstat']),axis=1)
# Frequency table with explicitly named columns. Naming them directly (instead of
# renaming merge suffixes such as 'employ_x'/'employ_y') keeps the headers correct
# whatever name the installed pandas version gives the value_counts result.
emp_counts = data['employ'].value_counts()
emp_freq = emp_counts.to_frame('Frequency').assign(Proportion=emp_counts / emp_counts.sum())
emp_freq

Unnamed: 0,Frequency,Proportion
Employed,3170,0.502377
NotEmployed,2516,0.398732
SelfEmployed,624,0.098891


In [19]:
# Work travel mode recode: codes 1-5 -> Car, 6-8 -> Public, 9 -> Cycle, 10 -> Walk;
# -8 and -7 are kept as 'inapplicable' and 'missing'
trav = {code: 'Car' for code in (1, 2, 3, 4, 5)}
trav.update({code: 'Public' for code in (6, 7, 8)})
trav.update({9: 'Cycle', 10: 'Walk', -8: 'inapplicable', -7: 'missing'})
data['worktrav'] = data['f_worktrav'].apply(lambda code: trav[code])
# Recode people working mainly at home to 'Home' as travel mode (n=71)
works_at_home = data['f_jbpl'] == 1
data.loc[works_at_home, 'worktrav'] = 'Home'

In [20]:
# Add self-employed travel mode recode to existing worktrav variable.
se_trav = {1:'Car',2:'Car',3:'Car',4:'Car',5:'Car',6:'Public',7:'Public',8:'Public',9:'Cycle',10:'Walk',-8:'inapplicable'}

# Rows still 'inapplicable' after the employee recode take their mode from f_jsworktrav.
# Both sides use the same mask through .loc, which avoids chained indexing.
from_self_employed = data['worktrav'] == 'inapplicable'
data.loc[from_self_employed,'worktrav'] = data.loc[from_self_employed,'f_jsworktrav'].apply(lambda x: se_trav[x])

# Recode self-employed people working mainly 'at home' or 'from home' to 'Home' as travel mode (n=175)
data.loc[data['f_jspl'].isin([1,2]),'worktrav'] = 'Home'

# Frequency table with explicitly named columns. Naming them directly (instead of
# renaming merge suffixes such as 'worktrav_x'/'worktrav_y') keeps the headers correct
# whatever name the installed pandas version gives the value_counts result.
wt_counts = data['worktrav'].value_counts()
wt_freq = wt_counts.to_frame('Frequency').assign(Proportion=wt_counts / wt_counts.sum())
wt_freq
# NB inapplicable effectively indicates non-commuters. Missing indicates no data.

Unnamed: 0,Frequency,Proportion
inapplicable,2403,0.380824
Public,1659,0.262916
Car,1153,0.182726
missing,383,0.060697
Walk,317,0.050238
Home,246,0.038986
Cycle,149,0.023613


In [21]:
# Use of a car recode - Assume caruse == 1 denotes car and 2 or -8 denotes no car
data['caruse'] = data['f_caruse'].apply(lambda x: 'Car' if x == 1 else 'NoCar')
# Frequency table with explicitly named columns. Naming them directly (instead of
# renaming merge suffixes such as 'caruse_x'/'caruse_y') keeps the headers correct
# whatever name the installed pandas version gives the value_counts result.
car_counts = data['caruse'].value_counts()
car_freq = car_counts.to_frame('Frequency').assign(Proportion=car_counts / car_counts.sum())
car_freq

Unnamed: 0,Frequency,Proportion
NoCar,3189,0.505388
Car,3121,0.494612


In [22]:
# Use of a bicycle recode - Assume any frequency of using a bicycle denotes access (n = 892)
# Further assume that never learnt, disability, or no bike indicates no access. (n=2,269)
# Assume that anyone else does have access, they just don't cycle (n=742)
# NOTE(review): `bike` is not referenced again in the visible notebook; the classification
# is done by bike_class() instead, which (unlike this mapping) does not always send code 8 to 'NoBike'.
bike = {1:'Bike',2:'Bike',3:'Bike',4:'Bike',5:'Bike',6:'Bike',7:'Bike',8:'NoBike',-7:'missing',-10:'missing'}
def bike_class(f,n):
    """Classify bicycle access from a frequency code `f` and a flag `n`.

    Returns "Bike" when `f` is one of the codes 1-7, or when `f` is 8 and
    `n` equals 1; returns "NoBike" when `f` is 8 and `n` is anything else;
    returns "missing" for every other `f`.
    """
    # Code 8 is the only one where the second argument matters.
    if f == 8:
        return "Bike" if n == 1 else "NoBike"
    if f in (1,2,3,4,5,6,7):
        return "Bike"
    return "missing"

# Classify each respondent row-wise from cycling frequency (f_trbikefq) and the f_ynotbike96 response
data['cycleuse'] = data[['f_trbikefq','f_ynotbike96']].apply(lambda x: bike_class(x['f_trbikefq'],x['f_ynotbike96']),axis=1)

# Frequency table with explicitly named columns. Naming them directly (instead of
# renaming merge suffixes such as 'cycleuse_x'/'cycleuse_y') keeps the headers correct
# whatever name the installed pandas version gives the value_counts result.
cycle_counts = data['cycleuse'].value_counts()
cycle_freq = cycle_counts.to_frame('Frequency').assign(Proportion=cycle_counts / cycle_counts.sum())
cycle_freq
# 41.9% of respondents who answered are classified as having access to a bike
# The NTS 2017 claims that 42% of people aged 5+ in England own or have use of a bicycle.

Unnamed: 0,Frequency,Proportion
missing,2417,0.383043
NoBike,2263,0.358637
Bike,1630,0.25832


## Recode data to constraints

In [23]:
# sex age ethnicity: combined key such as 'f16_24Asian' (the '-' in the age band becomes '_').
# Built with vectorised string concatenation rather than a row-wise apply.
data['sex_age_eth'] = data['sex'] + data['age_6cat'].str.replace('-', '_', regex=False) + data['eth_5cat']
# Frequency table with explicitly named columns. Naming them directly (instead of
# renaming merge suffixes such as 'sex_age_eth_x'/'sex_age_eth_y') keeps the headers
# correct whatever name the installed pandas version gives the value_counts result.
sae_counts = data['sex_age_eth'].value_counts()
sae_freq = sae_counts.to_frame('Frequency').assign(Proportion=sae_counts / sae_counts.sum())
sae_freq.sort_index()

Unnamed: 0,Frequency,Proportion
f16_24Asian,177,0.028051
f16_24Black,159,0.025198
f16_24Mixed,55,0.008716
f16_24Other,21,0.003328
f16_24White,156,0.024723
f25_34Asian,243,0.03851
f25_34Black,146,0.023138
f25_34Mixed,44,0.006973
f25_34Other,23,0.003645
f25_34White,259,0.041046


In [24]:
# Some missing worktrav values can be resolved from employ: a respondent who is
# NotEmployed has no commute, so their 'missing' becomes 'inapplicable'. All other
# 'missing' rows are left as they are.
worktrav_missing = data['worktrav'] == 'missing'
not_employed = data['employ'] == 'NotEmployed'
data.loc[worktrav_missing & not_employed, 'worktrav'] = 'inapplicable'

## Create Analysis Dataset

In [26]:
# Person identifier, the derived variables and the raw work-distance column
analysis_cols = [
    'pidp',
    'sex', 'age_6cat', 'eth_5cat',
    'employ', 'worktrav',
    'caruse', 'cycleuse',
    'sex_age_eth',
    'f_workdis',
]
analysis = data[analysis_cols]

In [27]:
# Write analysis data to file.
# No `index` argument is passed, so the pandas default applies and the row index
# is written as the first (unnamed) column of the CSV.
analysis.to_csv('../data/London_inds.csv')