## Purpose

Travel time statistics are published on the [DfT stats page](https://www.gov.uk/government/collections/journey-time-statistics). Minimum journey times to a variety of services at each [LSOA](http://webarchive.nationalarchives.gov.uk/20160106001702/http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/census/super-output-areas--soas-/index.html) are contained within a handful of ODS spreadsheets. Each file contains one sheet per year, in a wide format with column names that are not human-readable.
This notebook demonstrates:

* Downloading the spreadsheets
* Saving the data to a pandas dataframe
* Renaming the variables and converting to a single table in the long data format (one row per observation).


### Download the files

In [1]:
import os
import requests 
import shutil
import pyexcel
import pandas as pd
import numpy as np

# Create the folder that holds the downloaded spreadsheets. exist_ok=True makes
# this a no-op when the folder is already there, and removes the gap between a
# separate existence check and the mkdir call.
os.makedirs('stats-spreadsheets', exist_ok=True)

In [2]:
def download_data():
    """Download the journey time statistics spreadsheets.

    Each spreadsheet is saved as ``stats-spreadsheets/<key>.ods``, where
    ``<key>`` is the service name used as the key in ``urls`` below. A
    response whose status code is not 200 is reported on stdout and skipped,
    so one unavailable file does not stop the remaining downloads.
    """
    urls = {
        'employment': 'https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/627362/jts0501.ods',
        'primary schools': 'https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/627363/jts0502.ods',
        'secondary schools': 'https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/627365/jts0503.ods',
        'further education': 'https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/627366/jts0504.ods',
        'GP': 'https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/627367/jts0505.ods',
        'hospital': 'https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/627369/jts0506.ods',
        'food': 'https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/627370/jts0507.ods',
        'town centre': 'https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/627371/jts0508.ods'
    }

    # Make sure the target folder exists even when this function is called
    # on its own.
    os.makedirs('stats-spreadsheets', exist_ok=True)

    for key, url in urls.items():
        # The timeout stops a stalled connection from hanging the notebook.
        r = requests.get(url, stream=True, timeout=60)
        try:
            if r.status_code == 200:
                out_path = os.path.join('stats-spreadsheets', key + '.ods')
                with open(out_path, 'wb') as out_file:
                    # iter_content applies any gzip/deflate content decoding;
                    # copying r.raw directly would write the bytes undecoded.
                    for chunk in r.iter_content(chunk_size=64 * 1024):
                        out_file.write(chunk)
            else:
                print('Download failed for {} ({}): HTTP {}'.format(
                    key, url, r.status_code))
        finally:
            # Release the streamed connection whether or not the body was read.
            r.close()

## Data cleaning function

In [3]:
def min_travel_time(filepath, travel_time_columns_list, stat_name):
    """Build a long-format table of travel times from one spreadsheet.

    Reads the '2015' and '2014' sheets of ``filepath`` (both with
    ``start_row=6``), keeps the columns listed in
    ``travel_time_columns_list`` and reshapes them so that each row holds one
    travel time for a given year, LSOA code and mode of transport.

    Parameters:
        filepath: path of the spreadsheet, passed to ``pyexcel.get_dict``.
        travel_time_columns_list: list of column names to keep. The mode is
            taken from the column name: 'PT' -> 'public transport',
            'Cyc' -> 'cycle', 'Car' -> 'car', anything else -> 'NA'.
        stat_name: value written to the ``nearest`` column of every row.

    Returns:
        DataFrame with the columns year, LSOA_code, mode, travel_time and
        nearest, with rows that have an empty LSOA_code and duplicate rows
        removed.
    """
    mode_markers = (
        ('PT', 'public transport'),
        ('Cyc', 'cycle'),
        ('Car', 'car'),
    )

    def detect_mode(column_name):
        # The first marker found in the column name wins.
        for marker, label in mode_markers:
            if marker in column_name:
                return label
        return 'NA'

    # One frame per sheet, each tagged with the year it came from.
    yearly_frames = []
    for year in (2015, 2014):
        sheet = pyexcel.get_dict(file_name=filepath,
                                 sheet_name=str(year),
                                 start_row=6)
        frame = pd.DataFrame(sheet)
        frame['year'] = year
        yearly_frames.append(frame)

    id_columns = ['year', 'LSOA_code']
    combined = pd.concat(yearly_frames)
    # Keep only the identifier and travel time columns
    combined = combined[id_columns + travel_time_columns_list]
    # Make long - creates the columns 'variable' and 'value'
    long_df = combined.melt(id_vars=id_columns)

    long_df['mode'] = long_df['variable'].apply(detect_mode)
    # Drop the rows that carry no LSOA code
    long_df = long_df[long_df['LSOA_code'] != '']

    travel_times = long_df.set_index(['year', 'LSOA_code', 'mode'])['value']
    travel_times = travel_times.rename('travel_time')
    df_output = travel_times.reset_index().drop_duplicates()
    df_output['nearest'] = stat_name
    return df_output

## Employment
Travel time to employment centres differs from the other statistics because separate figures are published for each size of employment centre. To simplify, I take the minimum of the minimum travel times across the employment centre sizes for each mode.

In [4]:
def employment_min_travel_time():
    """Return the minimum travel time to any employment centre.

    The employment spreadsheet has three groups of travel time columns
    (prefixed 100Emp, 500Emp and 5000Emp), so each year, LSOA code and mode
    has several values. This keeps the smallest of them.

    Returns:
        DataFrame with the columns year, LSOA_code, mode, travel_time and
        nearest, where nearest is always 'employment_centre'.
    """
    filepath = 'stats-spreadsheets/employment.ods'
    stat_name = 'employment_centre'
    travel_time_columns_list = ['100EmpPTt', '100EmpCyct', '100EmpCart',
                                '500EmpPTt', '500EmpCyct', '500EmpCart',
                                '5000EmpPTt', '5000EmpCyct', '5000EmpCart']

    # Reading the sheets, reshaping to long format and labelling the mode is
    # the same work min_travel_time does for every other service, so reuse it
    # rather than keeping a second copy of that pipeline here.
    df = min_travel_time(filepath, travel_time_columns_list, stat_name)

    # Collapse the per-size rows to a single minimum per year, LSOA and mode.
    # The group keys are unique after aggregation, so no de-duplication is
    # needed afterwards.
    grouped = df.groupby(['year', 'LSOA_code', 'mode'])
    df_output = grouped['travel_time'].min().reset_index()
    df_output['nearest'] = stat_name
    return df_output

### LSOA meta data
Such as the local authority and region of each LSOA. Taken from the primary schools file, as that's the smallest and quickest to load.


In [5]:
def meta_data():
    """Return the LSOA metadata columns from the primary schools file.

    Reads the '2015' sheet of 'stats-spreadsheets/primary schools.ods' with
    ``column_limit=4``, then drops duplicate rows and rows whose LSOA_code is
    empty.
    """
    # NOTE(review): going by the notebook output these four columns are
    # presumably LSOA_code, Region, LA_Code and LA_Name - confirm.
    sheet = pyexcel.get_dict(
        file_name='stats-spreadsheets/primary schools.ods',
        sheet_name='2015',
        start_row=6,
        column_limit=4,
    )
    lsoa_info = pd.DataFrame(sheet).drop_duplicates()
    has_code = lsoa_info['LSOA_code'] != ''
    return lsoa_info[has_code]

# Putting it together

In [6]:
# Uncomment on the first run to fetch the spreadsheets:
# download_data()

# One long-format table per service
emp = employment_min_travel_time()
p_school = min_travel_time(
    filepath='stats-spreadsheets/primary schools.ods',
    travel_time_columns_list=['PSPTt', 'PSCyct', 'PSCart'],
    stat_name='primary_school',
)
s_school = min_travel_time(
    filepath='stats-spreadsheets/secondary schools.ods',
    travel_time_columns_list=['SSPTt', 'SSCyct', 'SSCart'],
    stat_name='secondary_school',
)
further_ed = min_travel_time(
    filepath='stats-spreadsheets/further education.ods',
    travel_time_columns_list=['FEPTt', 'FECyct', 'FECart'],
    stat_name='further_education',
)
gp = min_travel_time(
    filepath='stats-spreadsheets/GP.ods',
    travel_time_columns_list=['GPPTt', 'GPCyct', 'GPCart'],
    stat_name='GP',
)
hospital = min_travel_time(
    filepath='stats-spreadsheets/hospital.ods',
    travel_time_columns_list=['HospPTt', 'HospCyct', 'HospCart'],
    stat_name='hospital',
)
food = min_travel_time(
    filepath='stats-spreadsheets/food.ods',
    travel_time_columns_list=['FoodPTt', 'FoodCyct', 'FoodCart'],
    stat_name='food',
)
town_centre = min_travel_time(
    filepath='stats-spreadsheets/town centre.ods',
    travel_time_columns_list=['TownPTt', 'TownCyct', 'TownCart'],
    stat_name='town_centre',
)

In [7]:
# Stack the per-service tables into one long table
df = pd.concat(
    [emp, p_school, s_school, further_ed, gp, hospital, food, town_centre],
    axis=0,
)
# Attach the LSOA metadata; the outer join keeps LSOA codes that appear on
# only one side
df_meta_data = meta_data()
df = df_meta_data.merge(df, how='outer', on=['LSOA_code'])
df.to_csv('01_clean_stats.csv', index=False)

In [8]:
df.shape

(1576512, 8)

In [9]:
df

Unnamed: 0,LSOA_code,Region,LA_Code,LA_Name,year,mode,travel_time,nearest
0,E01000001,London,E09000001,City of London,2014,car,6.75308,employment_centre
1,E01000001,London,E09000001,City of London,2014,cycle,6.61082,employment_centre
2,E01000001,London,E09000001,City of London,2014,public transport,3.64864,employment_centre
3,E01000001,London,E09000001,City of London,2015,car,6.15341,employment_centre
4,E01000001,London,E09000001,City of London,2015,cycle,6.50175,employment_centre
5,E01000001,London,E09000001,City of London,2015,public transport,3.0299,employment_centre
6,E01000001,London,E09000001,City of London,2015,public transport,6,primary_school
7,E01000001,London,E09000001,City of London,2014,public transport,6,primary_school
8,E01000001,London,E09000001,City of London,2015,cycle,8,primary_school
9,E01000001,London,E09000001,City of London,2014,cycle,8,primary_school


In [10]:
df.Region.value_counts()

South East                  258336
London                      232080
North West                  215856
East                        173472
West Midlands               167376
Yorkshire and The Humber    159216
South West                  157488
East Midlands               133152
North East                   79536
Name: Region, dtype: int64

In [11]:
df['mode'].value_counts()

car                 525504
cycle               525504
public transport    525504
Name: mode, dtype: int64

In [12]:
df.nearest.value_counts()

secondary_school     197064
primary_school       197064
further_education    197064
food                 197064
town_centre          197064
employment_centre    197064
GP                   197064
hospital             197064
Name: nearest, dtype: int64