# TFL Bike data prep
___

#### Data prep as part of my MSc thesis, "Using machine learning to analyse and predict Transport for London bike sharing habits in the post COVID-19 era".

The following code for downloading the data has been adopted from [Markus Hauru's](https://github.com/mhauru) analysis, 'Predicting Boris Bike usage'.



In [16]:
# importing libraries

import os
import pickle
import requests
import zipfile
import pandas as pd
import numpy as np
import scipy as sp
import statsmodels.api as sm
from sklearn import linear_model, svm, neighbors, tree
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns
from pathlib import Path
from timeit import default_timer as timer
from IPython.display import set_matplotlib_formats
from urllib.parse import urlparse
import openpyxl

try:
    import xlrd
except Exception as e:
    msg = (
        "Please install the package xlrd: `pip install --user xlrd`"
        "It's an optional requirement for pandas, and we'll be needing it."
    )
    print(msg)
    raise e

In [5]:
# For pretty and exportable matplotlib plots.
# If you are running this yourself and want interactivity,
# try `%matplotlib widget` instead.
set_matplotlib_formats("svg")
%matplotlib inline
# %matplotlib widget
# Set a consistent plotting style across the notebook using Seaborn.
sns.set_style("darkgrid")
sns.set_context("notebook")
# Make pandas cooperate with pyplot
pd.plotting.register_matplotlib_converters()


  set_matplotlib_formats("svg")


1. Processing and cleaning the bike data
Before getting anywhere with it, we'll need to process the bike data quite a bit. The data comes in CSV files, each of which covers a period of time. Up first, we need to download the data from the TfL website. If you are running this code yourself, here's a script that does that. Be warned though, it's almost seven gigs of data. You can run it repeatedly, and it'll only download data that it doesn't have already.

In [6]:
bikefolder = "data/bikes"

In [7]:
def download_file(datafolder, url, verbosity=0):
    """Download the data from the given URL into the datafolder, unless it's
    already there. Return path to downloaded file.
    """
    # data folder variable for where the folder for where the downloaded file should be stores 
    # using the path() function to converted the data folder string into a path
    datafolder = Path(datafolder)
    datafolder.mkdir(parents=True, exist_ok=True)

    # using the url parse function to extract the file from the url and create a filepath for it to be stored
    a = urlparse(url)
    filename = Path(os.path.basename(a.path))
    filepath = datafolder / filename
    # Don't redownload if we already have this file.
    if filepath.exists():
        if verbosity > 1:
            print("Already have {}".format(filename))
    else:
        if verbosity > 0:
            print("Downloading {}".format(filename))
        # sends a GET request to the URL using the requests module and raises an exception if there is an error
        rqst = requests.get(url)
        rqst.raise_for_status()
        with open(filepath, "wb") as f:
            f.write(rqst.content)
    return filepath


In [8]:
# Adjust whether to print progress reports of the downloads.
# verbosity=0 is silence, verbosity=1 reports only when actually doing things,
# verbosity>1 also reports when there's nothing to do.
verbosity = 1

# Most files are individual CSV files, listed in bike_data_urls.txt. Download them.
urlsfile = "data/bikes/bike_data_urls.txt"
with open(urlsfile, "r") as f:
    urls = f.read().splitlines()
# There are a few comments in the file, marked by lines starting with #.
# Filter them out.
urls = [u for u in urls if u[0] != "#"]
for url in urls:
    download_file(bikefolder, url, verbosity)

# The early years come in zips. Download and unzip them.
zipsfolder = Path("data/bikes/bikezips")
bikezipurls = [
    "https://cycling.data.tfl.gov.uk/usage-stats/cyclehireusagestats-2012.zip",
    "https://cycling.data.tfl.gov.uk/usage-stats/cyclehireusagestats-2013.zip",
    "https://cycling.data.tfl.gov.uk/usage-stats/cyclehireusagestats-2014.zip",
    "https://cycling.data.tfl.gov.uk/usage-stats/2015TripDatazip.zip",
    "https://cycling.data.tfl.gov.uk/usage-stats/2016TripDataZip.zip",
]
# A list of CSV files that are already there. Only unzip if some of the files
# in the zip aren't present already.
current_csvs = sorted(os.listdir(bikefolder))
for url in bikezipurls:
    zippath = download_file(zipsfolder, url, verbosity)
    with zipfile.ZipFile(zippath, "r") as z:
        namelist = z.namelist()
        has_been_extracted = any(name not in current_csvs for name in namelist)
        if has_been_extracted:
            if verbosity > 0:
                print("Unzipping {}".format(zippath))
            z.extractall(bikefolder)
        else:
            if verbosity > 1:
                print("{} has already been extracted.".format(zippath))

# Finally, there's an odd one out: One week's data comes in as an .xlsx.
# Download it and use pandas to convert it to csv.
xlsxurl = "https://cycling.data.tfl.gov.uk/usage-stats/49JourneyDataExtract15Mar2017-21Mar2017.xlsx"
xlsxfile = download_file(bikefolder, xlsxurl)
csvfile = xlsxfile.with_suffix(".csv")
if not csvfile.exists():
    if verbosity > 0:
        print("Converting .xlsx to .csv.")
    pd.read_excel(xlsxfile).to_csv(csvfile, date_format="%d/%m/%Y %H:%M:%S")
else:
    if verbosity > 1:
        print("Already have {}".format(csvfile))

The data we have now lists on each line of the CSV file a single bike trip, with starting point and time, end point and time, and things like bike ID number. Here's an example.

In [9]:
example_file  = Path(bikefolder) / Path("47JourneyDataExtract01Mar2017-07Mar2017.csv")
pd.read_csv(example_file, encoding="ISO-8859-2").head()

Unnamed: 0,Rental Id,Duration,Bike Id,End Date,EndStation Id,EndStation Name,Start Date,StartStation Id,StartStation Name
0,62857677,3780.0,7851,06/03/2017 19:20,43.0,"Crawford Street, Marylebone",06/03/2017 18:17,811,"Westferry Circus, Canary Wharf"
1,62863035,540.0,4089,06/03/2017 22:17,295.0,"Swan Street, The Borough",06/03/2017 22:08,272,"Baylis Road, Waterloo"
2,62775896,600.0,4895,02/03/2017 21:27,295.0,"Swan Street, The Borough",02/03/2017 21:17,197,"Stamford Street, South Bank"
3,62747748,420.0,4347,01/03/2017 21:08,295.0,"Swan Street, The Borough",01/03/2017 21:01,803,"Southwark Street, Bankside"
4,62843939,420.0,3192,06/03/2017 09:28,193.0,"Bankside Mix, Bankside",06/03/2017 09:21,197,"Stamford Street, South Bank"


In [10]:
bikefolder

'data/bikes'

In [11]:
from glob import glob 

# using glob to list all the csv file in the bikefolder filepath
all_csv = glob(bikefolder+str('/*.csv'))
all_csv

['data/bikes\\01aJourneyDataExtract10Jan16-23Jan16.csv',
 'data/bikes\\01bJourneyDataExtract24Jan16-06Feb16.csv',
 'data/bikes\\02aJourneyDataExtract07Feb16-20Feb2016.csv',
 'data/bikes\\02bJourneyDataExtract21Feb16-05Mar2016.csv',
 'data/bikes\\03JourneyDataExtract06Mar2016-31Mar2016.csv',
 'data/bikes\\04JourneyDataExtract01Apr2016-30Apr2016.csv',
 'data/bikes\\05JourneyDataExtract01May2016-17May2016.csv',
 'data/bikes\\06JourneyDataExtract18May2016-24May2016.csv',
 'data/bikes\\07JourneyDataExtract25May2016-31May2016.csv',
 'data/bikes\\08JourneyDataExtract01Jun2016-07Jun2016.csv',
 'data/bikes\\09JourneyDataExtract08Jun2016-14Jun2016.csv',
 'data/bikes\\1. Journey Data Extract 01Jan-05Jan13.csv',
 'data/bikes\\1. Journey Data Extract 04Jan-31Jan 12.csv',
 'data/bikes\\1. Journey Data Extract 05Jan14-02Feb14.csv',
 'data/bikes\\10. Journey Data Extract 18Aug-13Sep13.csv',
 'data/bikes\\10. Journey Data Extract 21Aug-22 Aug12.csv',
 'data/bikes\\10a Journey Data Extract 20Sep15-03Oct

### 2019 data prep

In [12]:
# creating a list of csv files that contain '2019' and '2022' respectively
csv_2019 = [item for item in all_csv if '2019' in item]
csv_2022 = [item for item in all_csv if '2022' in item]

In [13]:
csv_2019

['data/bikes\\142JourneyDataExtract26Dec2018-01Jan2019.csv',
 'data/bikes\\143JourneyDataExtract02Jan2019-08Jan2019.csv',
 'data/bikes\\144JourneyDataExtract09Jan2019-15Jan2019.csv',
 'data/bikes\\145JourneyDataExtract16Jan2019-22Jan2019.csv',
 'data/bikes\\146JourneyDataExtract23Jan2019-29Jan2019.csv',
 'data/bikes\\147JourneyDataExtract30Jan2019-05Feb2019.csv',
 'data/bikes\\148JourneyDataExtract06Feb2019-12Feb2019.csv',
 'data/bikes\\149JourneyDataExtract13Feb2019-19Feb2019.csv',
 'data/bikes\\150JourneyDataExtract20Feb2019-26Feb2019.csv',
 'data/bikes\\151JourneyDataExtract27Feb2019-05Mar2019.csv',
 'data/bikes\\152JourneyDataExtract06Mar2019-12Mar2019.csv',
 'data/bikes\\153JourneyDataExtract13Mar2019-19Mar2019.csv',
 'data/bikes\\154JourneyDataExtract20Mar2019-26Mar2019.csv',
 'data/bikes\\155JourneyDataExtract27Mar2019-02Apr2019.csv',
 'data/bikes\\156JourneyDataExtract03Apr2019-09Apr2019.csv',
 'data/bikes\\157JourneyDataExtract10Apr2019-16Apr2019.csv',
 'data/bikes\\158Journey

In [14]:
# using list comprehension that reads each csv file from the list and gnerators a sequence of dataframes
dfs = (pd.read_csv(csv) for csv in csv_2019)

# concatenate csvs them into a single DataFrame using pd.concat()
# ignore_index=True parameter resets the index of the resulting DataFrame, so that it is a continuous sequence of integers.
data_2019 = pd.concat(dfs, ignore_index=True)

In [15]:
print(data_2019.shape)
data_2019.head()

(10388411, 9)


Unnamed: 0,Rental Id,Duration,Bike Id,End Date,EndStation Id,EndStation Name,Start Date,StartStation Id,StartStation Name
0,83252102,720,2077,31/12/2018 19:05,272,"Baylis Road, Waterloo",31/12/2018 18:53,94,"Bricklayers Arms, Borough"
1,83195883,120,10781,27/12/2018 19:47,93,"Cloudesley Road, Angel",27/12/2018 19:45,339,"Risinghill Street, Angel"
2,83196070,120,2977,27/12/2018 20:11,339,"Risinghill Street, Angel",27/12/2018 20:09,234,"Liverpool Road (N1 Centre), Angel"
3,83197932,660,10802,28/12/2018 07:35,282,"Royal London Hospital, Whitechapel",28/12/2018 07:24,698,"Shoreditch Court, Haggerston"
4,83176351,1380,15749,26/12/2018 11:55,785,"Aquatic Centre, Queen Elizabeth Olympic Park",26/12/2018 11:32,783,"Monier Road, Hackney Wick"


In [16]:
# 2019

## Add some extra variables to the dataset for use later in filtering

import datetime

## Feeding a specififed date format speeds up the pd.to_datetime function immeasurably, especially over large datasets
## e.g. http://stackoverflow.com/questions/32034689/why-is-pandas-to-datetime-slow-for-non-standard-time-format-such-as-2014-12-31

format = "%d/%m/%Y %H:%M"

## Some routes had dates with a seconds component, whereas some didn't - the below code cuts these seconds off
data_2019['Start Date']= data_2019['Start Date'].str[:16]

data_2019['Start Date Time']= pd.to_datetime(data_2019['Start Date'], format=format)

data_2019['Hour']= pd.to_datetime(data_2019['Start Date'], format=format).dt.hour

data_2019['Day']= pd.to_datetime(data_2019['Start Date'], format=format).dt.weekday

data_2019.head()


Unnamed: 0,Rental Id,Duration,Bike Id,End Date,EndStation Id,EndStation Name,Start Date,StartStation Id,StartStation Name,Start Date Time,Hour,Day
0,83252102,720,2077,31/12/2018 19:05,272,"Baylis Road, Waterloo",31/12/2018 18:53,94,"Bricklayers Arms, Borough",2018-12-31 18:53:00,18,0
1,83195883,120,10781,27/12/2018 19:47,93,"Cloudesley Road, Angel",27/12/2018 19:45,339,"Risinghill Street, Angel",2018-12-27 19:45:00,19,3
2,83196070,120,2977,27/12/2018 20:11,339,"Risinghill Street, Angel",27/12/2018 20:09,234,"Liverpool Road (N1 Centre), Angel",2018-12-27 20:09:00,20,3
3,83197932,660,10802,28/12/2018 07:35,282,"Royal London Hospital, Whitechapel",28/12/2018 07:24,698,"Shoreditch Court, Haggerston",2018-12-28 07:24:00,7,4
4,83176351,1380,15749,26/12/2018 11:55,785,"Aquatic Centre, Queen Elizabeth Olympic Park",26/12/2018 11:32,783,"Monier Road, Hackney Wick",2018-12-26 11:32:00,11,2


In [17]:
# 2019 filtering data - remove any rows that aren't from 2019
# remember the first csv contained data from 2018... 26Dec2018-01Jan2019.csv
bike_data_2019 = data_2019[data_2019['Start Date Time'].dt.year == 2019]
print(bike_data_2019.shape)

(10310063, 12)


In [19]:
# bike_data_2019 has no null values, perfect
#bike_data_2019.isnull().sum()

Rental Id            0
Duration             0
Bike Id              0
End Date             0
EndStation Id        0
EndStation Name      0
Start Date           0
StartStation Id      0
StartStation Name    0
Start Date Time      0
Hour                 0
Day                  0
dtype: int64

### 2022 data prep

- In September 2022 the column names change slightly and additional clumns have been added
- for example the 'Bike model' column has been added (classic or PBSC_EBIKE)

Cycle Hire Data - data format change & new data https://techforum.tfl.gov.uk/t/cycle-hire-data-data-format-change-new-data/2520

### Exploring the 2022 data

In [24]:
csv_2022 = [item for item in all_csv if '2022' in item]

In [25]:
# CSVs before September 2022 part 1 data 
# use slicing to includes all elements of the previous list except for the last 16
csv_2022_p1 = csv_2022[:-16]

# CSVs From september 12th 2022 
# use slicing to create a new list that includes only the last 16 elements
csv_2022_p2 = csv_2022[-16:]

In [26]:
# doing the same for the 2022 data
# passing errors within the csv files as per https://stackoverflow.com/questions/52105659/pandas-read-csv-unexpected-end-of-data-error
dfs_2022_p1 = (pd.read_csv(csv, engine='python', encoding='utf-8', on_bad_lines='skip') for csv in csv_2022_p1)
data_2022_p1 = pd.concat(dfs_2022_p1, ignore_index=True)

In [27]:
data_2022_p1.isnull().sum()
# for the part 1 data, there were 312144 records with null station ids  

#es_id_null = data_2022_p1.loc[data_2022_p1['EndStation Id'].isnull()] 
#es_id_null.sort_values(by='Start Date', ascending=False)

# filtering the data above reveal the journeys taken between 06/07/2022 00:00 and 12/07/2022 23:56 did not record an end station Id

Rental Id                 0
Duration                  0
Bike Id                   0
End Date                  0
EndStation Id        312144
EndStation Name           0
Start Date                0
StartStation Id           0
StartStation Name         0
dtype: int64

In [28]:
data_2022_p1.count()

Rental Id            8677104
Duration             8677104
Bike Id              8677104
End Date             8677104
EndStation Id        8364960
EndStation Name      8677104
Start Date           8677104
StartStation Id      8677104
StartStation Name    8677104
dtype: int64

In [29]:
# read in data with datetime data type for column 2 and column 5
dfs_2022_p2 = (pd.read_csv(csv) for csv in csv_2022_p2)
#dfs_2022_p2 = (pd.read_csv(csv, parse_dates={'Start date': 'datetime64', 'End date': 'datetime64'}) for csv in csv_2022_p2)
data_2022_p2 = pd.concat(dfs_2022_p2, ignore_index=True)

  dfs_2022_p2 = (pd.read_csv(csv) for csv in csv_2022_p2)


In [30]:
data_2022_p2.isnull().sum()

Number                  0
Start date              0
Start station number    0
Start station           0
End date                0
End station number      0
End station             0
Bike number             0
Bike model              0
Total duration          0
Total duration (ms)     0
dtype: int64

In [31]:
data_2022_p2.count()

Number                  2555077
Start date              2555077
Start station number    2555077
Start station           2555077
End date                2555077
End station number      2555077
End station             2555077
Bike number             2555077
Bike model              2555077
Total duration          2555077
Total duration (ms)     2555077
dtype: int64

In [32]:
# doing the same for the 2022 data
# passing errors within the csv files as per https://stackoverflow.com/questions/52105659/pandas-read-csv-unexpected-end-of-data-error
dfs_2022 = (pd.read_csv(csv, engine='python', encoding='utf-8', on_bad_lines='skip') for csv in csv_2022)
data_2022 = pd.concat(dfs_2022, ignore_index=True)

In [33]:
# check the data type of the 'date' column
print(data_2022['Start date'].dtype)

object


In [34]:
# 2022

# Let's clean this up and get all the data into single columns


#creating a copy of the orginal data
data_2022_clean = data_2022.copy()


In [35]:
#let's start by sorting out the date time formatting
format = "%d/%m/%Y %H:%M"
format2 = "%Y/%m/%d %H:%M"

data_2022_clean['Start Date'] = data_2022_clean['Start Date'].str[:16]
data_2022_clean['Start Date Time'] = pd.to_datetime(data_2022_clean['Start Date'], format=format)
data_2022_clean['Start Date Time 2']= pd.to_datetime(data_2022_clean['Start date'], format=format2)

In [36]:
data_2022_clean

Unnamed: 0,Rental Id,Duration,Bike Id,End Date,EndStation Id,EndStation Name,Start Date,StartStation Id,StartStation Name,Number,...,Start station,End date,End station number,End station,Bike number,Bike model,Total duration,Total duration (ms),Start Date Time,Start Date Time 2
0,115967515.0,1260.0,15338.0,01/01/2022 23:13,310.0,"Black Prince Road, Vauxhall",01/01/2022 22:52,529.0,"Manresa Road, Chelsea",,...,,,,,,,,,2022-01-01 22:52:00,NaT
1,116017034.0,720.0,19861.0,04/01/2022 19:08,11.0,"Brunswick Square, Bloomsbury",04/01/2022 18:56,804.0,"Good's Way, King's Cross",,...,,,,,,,,,2022-01-04 18:56:00,NaT
2,115895660.0,360.0,19666.0,29/12/2021 16:34,70.0,"Calshot Street , King's Cross",29/12/2021 16:28,57.0,"Guilford Street , Bloomsbury",,...,,,,,,,,,2021-12-29 16:28:00,NaT
3,116016563.0,480.0,19861.0,04/01/2022 18:46,804.0,"Good's Way, King's Cross",04/01/2022 18:38,57.0,"Guilford Street , Bloomsbury",,...,,,,,,,,,2022-01-04 18:38:00,NaT
4,116014412.0,1260.0,17235.0,04/01/2022 17:45,14.0,"Belgrove Street , King's Cross",04/01/2022 17:24,297.0,"Geraldine Street, Elephant & Castle",,...,,,,,,,,,2022-01-04 17:24:00,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11232176,,,,,,,,,,127641458.0,...,"Woodstock Grove, Shepherd's Bush",2022-12-26 01:51,200249,"Queen Mary's, Mile End",53664.0,CLASSIC,1h 49m 4s,6544593.0,NaT,2022-12-26 00:02:00
11232177,,,,,,,,,,127641459.0,...,"Curlew Street, Shad Thames",2022-12-26 00:34,200147,"Salmon Lane, Limehouse",54303.0,CLASSIC,32m 16s,1936877.0,NaT,2022-12-26 00:02:00
11232178,,,,,,,,,,127641453.0,...,"Curlew Street, Shad Thames",2022-12-26 00:49,200160,"Langdon Park, Poplar",21426.0,CLASSIC,49m 15s,2955280.0,NaT,2022-12-26 00:00:00
11232179,,,,,,,,,,127641454.0,...,"Millharbour, Millwall",2022-12-26 01:31,22167,"Millharbour, Millwall",54786.0,CLASSIC,1h 30m 27s,5427555.0,NaT,2022-12-26 00:00:00


In [37]:
data_2022_clean.loc[data_2022_clean['Start Date Time'].isnull(), 'Start Date Time'] = data_2022_clean['Start Date Time 2']

In [38]:
data_2022_clean.isnull().sum()

Rental Id               2555077
Duration                2555077
Bike Id                 2555077
End Date                2555077
EndStation Id           2867221
EndStation Name         2555077
Start Date              2555077
StartStation Id         2555077
StartStation Name       2555077
Number                  8677104
Start date              8677104
Start station number    8677104
Start station           8677104
End date                8677104
End station number      8677104
End station             8677104
Bike number             8677104
Bike model              8677104
Total duration          8677104
Total duration (ms)     8677104
Start Date Time               0
Start Date Time 2       8677104
dtype: int64

In [39]:
# transfering values from one pandas column to another pandas column only for null rows

data_2022_clean.loc[data_2022_clean['Rental Id'].isnull(), 'Rental Id'] = data_2022_clean['Number']
# converting from milliseconds to seconds, multipyling by 1000 
data_2022_clean.loc[data_2022_clean['Duration'].isnull(), 'Duration'] = data_2022_clean['Total duration (ms)'] / 1000
data_2022_clean.loc[data_2022_clean['Bike Id'].isnull(), 'Bike Id'] = data_2022_clean['Bike number']
data_2022_clean.loc[data_2022_clean['End Date'].isnull(), 'End Date'] = data_2022_clean['End date']
data_2022_clean.loc[data_2022_clean['EndStation Name'].isnull(), 'EndStation Name'] = data_2022_clean['End station']
data_2022_clean.loc[data_2022_clean['Start Date'].isnull(), 'Start Date'] = data_2022_clean['Start date']
data_2022_clean.loc[data_2022_clean['StartStation Name'].isnull(), 'StartStation Name'] = data_2022_clean['Start station']

#data_2022_clean.sort_values(by='Bike model', ascending=False)

In [40]:
data_2022_clean.isnull().sum()

Rental Id                     0
Duration                      0
Bike Id                       0
End Date                      0
EndStation Id           2867221
EndStation Name               0
Start Date                    0
StartStation Id         2555077
StartStation Name             0
Number                  8677104
Start date              8677104
Start station number    8677104
Start station           8677104
End date                8677104
End station number      8677104
End station             8677104
Bike number             8677104
Bike model              8677104
Total duration          8677104
Total duration (ms)     8677104
Start Date Time               0
Start Date Time 2       8677104
dtype: int64

In [41]:
#adding the additional columns
data_2022_clean['Hour']= data_2022_clean['Start Date Time'].dt.hour
data_2022_clean['Day']= data_2022_clean['Start Date Time'].dt.weekday

In [42]:
# removing columns that are no longer needed
data_2022_clean_drop = data_2022_clean.drop(['Number', 'Start date', 'Start station', 'End date', 'End station',
                                             'Bike number', 'Total duration', 'Total duration (ms)', 'Start Date Time 2'], axis=1)

In [43]:
data_2022_clean_drop

Unnamed: 0,Rental Id,Duration,Bike Id,End Date,EndStation Id,EndStation Name,Start Date,StartStation Id,StartStation Name,Start station number,End station number,Bike model,Start Date Time,Hour,Day
0,115967515.0,1260.000,15338.0,01/01/2022 23:13,310.0,"Black Prince Road, Vauxhall",01/01/2022 22:52,529.0,"Manresa Road, Chelsea",,,,2022-01-01 22:52:00,22,5
1,116017034.0,720.000,19861.0,04/01/2022 19:08,11.0,"Brunswick Square, Bloomsbury",04/01/2022 18:56,804.0,"Good's Way, King's Cross",,,,2022-01-04 18:56:00,18,1
2,115895660.0,360.000,19666.0,29/12/2021 16:34,70.0,"Calshot Street , King's Cross",29/12/2021 16:28,57.0,"Guilford Street , Bloomsbury",,,,2021-12-29 16:28:00,16,2
3,116016563.0,480.000,19861.0,04/01/2022 18:46,804.0,"Good's Way, King's Cross",04/01/2022 18:38,57.0,"Guilford Street , Bloomsbury",,,,2022-01-04 18:38:00,18,1
4,116014412.0,1260.000,17235.0,04/01/2022 17:45,14.0,"Belgrove Street , King's Cross",04/01/2022 17:24,297.0,"Geraldine Street, Elephant & Castle",,,,2022-01-04 17:24:00,17,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11232176,127641458.0,6544.593,53664.0,2022-12-26 01:51,,"Queen Mary's, Mile End",2022-12-26 00:02,,"Woodstock Grove, Shepherd's Bush",200214,200249,CLASSIC,2022-12-26 00:02:00,0,0
11232177,127641459.0,1936.877,54303.0,2022-12-26 00:34,,"Salmon Lane, Limehouse",2022-12-26 00:02,,"Curlew Street, Shad Thames",1213,200147,CLASSIC,2022-12-26 00:02:00,0,0
11232178,127641453.0,2955.280,21426.0,2022-12-26 00:49,,"Langdon Park, Poplar",2022-12-26 00:00,,"Curlew Street, Shad Thames",1213,200160,CLASSIC,2022-12-26 00:00:00,0,0
11232179,127641454.0,5427.555,54786.0,2022-12-26 01:31,,"Millharbour, Millwall",2022-12-26 00:00,,"Millharbour, Millwall",22167,22167,CLASSIC,2022-12-26 00:00:00,0,0


In [44]:
data_2022_clean_drop.isnull().sum()

Rental Id                     0
Duration                      0
Bike Id                       0
End Date                      0
EndStation Id           2867221
EndStation Name               0
Start Date                    0
StartStation Id         2555077
StartStation Name             0
Start station number    8677104
End station number      8677104
Bike model              8677104
Start Date Time               0
Hour                          0
Day                           0
dtype: int64

In [45]:
# let's rename a couple of columns to make it clearer
# we will rename the Start and End station number column 
# these columns actually terminal to the station 'terminalName' as per https://tfl.gov.uk/tfl/syndication/feeds/cycle-hire/livecyclehireupdates.xml

data_2022_clean_drop.rename(columns={'Start station number': 'SS Terminal Name', 'End station number': 'ES Terminal Name'})


Unnamed: 0,Rental Id,Duration,Bike Id,End Date,EndStation Id,EndStation Name,Start Date,StartStation Id,StartStation Name,SS Terminal Name,ES Terminal Name,Bike model,Start Date Time,Hour,Day
0,115967515.0,1260.000,15338.0,01/01/2022 23:13,310.0,"Black Prince Road, Vauxhall",01/01/2022 22:52,529.0,"Manresa Road, Chelsea",,,,2022-01-01 22:52:00,22,5
1,116017034.0,720.000,19861.0,04/01/2022 19:08,11.0,"Brunswick Square, Bloomsbury",04/01/2022 18:56,804.0,"Good's Way, King's Cross",,,,2022-01-04 18:56:00,18,1
2,115895660.0,360.000,19666.0,29/12/2021 16:34,70.0,"Calshot Street , King's Cross",29/12/2021 16:28,57.0,"Guilford Street , Bloomsbury",,,,2021-12-29 16:28:00,16,2
3,116016563.0,480.000,19861.0,04/01/2022 18:46,804.0,"Good's Way, King's Cross",04/01/2022 18:38,57.0,"Guilford Street , Bloomsbury",,,,2022-01-04 18:38:00,18,1
4,116014412.0,1260.000,17235.0,04/01/2022 17:45,14.0,"Belgrove Street , King's Cross",04/01/2022 17:24,297.0,"Geraldine Street, Elephant & Castle",,,,2022-01-04 17:24:00,17,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11232176,127641458.0,6544.593,53664.0,2022-12-26 01:51,,"Queen Mary's, Mile End",2022-12-26 00:02,,"Woodstock Grove, Shepherd's Bush",200214,200249,CLASSIC,2022-12-26 00:02:00,0,0
11232177,127641459.0,1936.877,54303.0,2022-12-26 00:34,,"Salmon Lane, Limehouse",2022-12-26 00:02,,"Curlew Street, Shad Thames",1213,200147,CLASSIC,2022-12-26 00:02:00,0,0
11232178,127641453.0,2955.280,21426.0,2022-12-26 00:49,,"Langdon Park, Poplar",2022-12-26 00:00,,"Curlew Street, Shad Thames",1213,200160,CLASSIC,2022-12-26 00:00:00,0,0
11232179,127641454.0,5427.555,54786.0,2022-12-26 01:31,,"Millharbour, Millwall",2022-12-26 00:00,,"Millharbour, Millwall",22167,22167,CLASSIC,2022-12-26 00:00:00,0,0


In [46]:
# 2022 filtering data - remove any rows that aren't from 2022
bike_data_2022 = data_2022_clean_drop[data_2022_clean_drop['Start Date Time'].dt.year == 2022]
print(bike_data_2022.shape)

(11166111, 15)


### Storing the data in an PostgreSQL databse

In [20]:
# psycopg2 library installed to connect to a PostgreSQL database from Python

import psycopg2
from sqlalchemy import create_engine

In [21]:
# connection to postgres database
conn = psycopg2.connect(
    user="postgres",
    password="password123",
    host="localhost",
    database="diss_data",
)


In [22]:
# Create a SQLAlchemy engine: Create a SQLAlchemy engine using the create_engine function, which will be used to write the DataFrame to the database.
engine = create_engine('postgresql+psycopg2://postgres:password123@localhost:5432/diss_data')

In [23]:
# Export the DataFrame to the database: Once you have the connection and engine set up, you can use the to_sql method of the DataFrame to export it to the database.
# save the DataFrame to the PostgreSQL database
# set the index parameter to False to avoid saving the DataFrame's index as a separate column in the database.
bike_data_2019.to_sql('bike_data_2019_tb', engine, if_exists='replace', index=False)

63

In [47]:
# save the DataFrame to the PostgreSQL database
bike_data_2022.to_sql('bike_data_2022_tb', engine, if_exists='replace', index=False)

111

### Transforming the dataframes into a matrix, whereby the value of each cell is the number of events per hour

In [2]:
import pandas as pd
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine

# connection to postgres database
conn = psycopg2.connect(
    user="postgres",
    password="password123",
    host="localhost",
    database="diss_data",
)

engine = sqlalchemy.create_engine('postgresql://postgres:password123@localhost:5432/diss_data')

# create a connection to the database
conn = psycopg2.connect(database="diss_data", user="postgres", password="password123", host="localhost", port="5432")

# define the SQL query to retrieve the data from the table
sql_query = "SELECT * FROM bike_data_2019_tb"

# use the read_sql function to read the table into a Pandas dataframe
df = pd.read_sql(sql_query, conn)


  df = pd.read_sql(sql_query, conn)


- to make initial analysis more managable, let's focus on data from July 2019

In [64]:
#copying the dataframe
bike_data_2019 = df.copy()

# Filter the DataFrame to only include rows where the date is from July 2019
bd_Jul_2019 = bike_data_2019[(bike_data_2019['Start Date Time'] >= '2019-07-01') & (bike_data_2019['Start Date Time'] <= '2019-07-31')]

In [65]:
def add_station_names(station_names, df, namecolumn, idcolumn):
    """Given a DataFrame df that has df[namecolumn] listing names of stations
    and df[idcolumn] listing station ID numbers, add to the dictionary
    station_names all the names that each ID is attached to.

    """
    namemaps = (
        df[[idcolumn, namecolumn]]
        .groupby(idcolumn)
        .aggregate(lambda x: x.unique())
    )
    for number, names in namemaps.iterrows():
        current_names = station_names.get(number, set())
        # The following two lines are a stupid dance around the annoying fact
        # that pd.unique sometimes returns a single value, sometimes a numpy
        # array of values, but since the single value is a string, it too is an
        # iterable.
        vals = names[0]
        new_names = set([vals]) if type(vals) == str else set(vals)
        current_names.update(new_names)
        station_names[number] = current_names

In [66]:
def add_station_names(station_names, df, namecolumn, idcolumn):
    """Given a DataFrame df that has df[namecolumn] listing names of stations
    and df[idcolumn] listing station ID numbers, add to the dictionary
    station_names all the names that each ID is attached to.
    """
    namemaps = df[[idcolumn, namecolumn]].groupby(idcolumn)[namecolumn].unique()
    for number, names in namemaps.items():
        current_names = station_names.get(number, set())
        current_names.update(names)
        station_names[number] = current_names

In [67]:
#bd_Jul_2019['StartStation Id']=bd_Jul_2019['StartStation Id'].astype(np.int64)

station_names = {}
add_station_names(station_names, bd_Jul_2019, "StartStation Id", "StartStation Name")

In [68]:
station_names

{'Abbey Orchard Street, Westminster': {108},
 'Abbotsbury Road, Holland Park': {559},
 "Aberdeen Place, St. John's Wood": {394},
 'Aberfeldy Street, Poplar': {554},
 'Abingdon Green, Westminster': {583},
 'Abingdon Villas, Kensington': {38},
 'Abyssinia Close, Clapham Junction': {776},
 'Ackroyd Drive, Bow': {505},
 'Ada Street, Hackney Central': {718},
 'Addison Road, Holland Park': {606},
 'Aintree Street, Fulham': {616},
 "Albany Street, The Regent's Park": {540},
 'Albert Bridge Road, Battersea Park': {710},
 'Albert Embankment, Vauxhall': {100},
 'Albert Gardens, Stepney': {483},
 'Albert Gate, Hyde Park': {303},
 'Albert Square, Stockwell': {802},
 'Alderney Street, Pimlico': {185},
 'Aldersgate Street, Barbican': {95},
 'Alfred Place, Bloomsbury': {364},
 'Alfreda Street, Battersea Park': {726},
 'All Saints Church, Portobello': {661},
 "All Saints' Road, Portobello": {643},
 'Allington Street, Victoria': {826},
 'Alma Road, Wandsworth': {724},
 'Alpha Grove, Millwall': {576},
 

In [69]:
def clean_datetime_column(df, colname, roundto="H"):
    """Parse df[colname] from strings to datetime objects, and round the times
    to the nearest hour. Also chop off from df any rows with times before
    2010-07-30 or after 2020-01-01, since these are nonsense. df is partially
    modified in place, but the return value should still be used.
    """
    if len(df[colname].iloc[0]) > 16:
        format = "%d/%m/%Y %H:%M:%S"
    else:
        format = "%d/%m/%Y %H:%M"
    df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
    df.loc[:, colname] = df[colname].dt.round(roundto)
    early_cutoff = pd.Timestamp(2010, 7, 30)  # When the program started.
    late_cutoff = pd.Timestamp(2020, 1, 1)  # Approximately now.
    df = df[(late_cutoff > df[colname]) & (df[colname] >= early_cutoff)]
    return df

In [70]:
#clean start and end dates
bd_Jul_2019_clean1 = clean_datetime_column(bd_Jul_2019, "Start Date", roundto="H")
bd_Jul_2019_clean2 = clean_datetime_column(bd_Jul_2019_clean1, "End Date", roundto="H")
bd_Jul_2019_clean2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Unnamed: 0,Rental Id,Duration,Bike Id,End Date,EndStation Id,EndStation Name,Start Date,StartStation Id,StartStation Name,Start Date Time,Hour,Day
4834333,88545096,120,13467,2019-07-02 13:00:00,320,"Queen Mother Sports Centre, Victoria",2019-07-02 13:00:00,177,"Ashley Place, Victoria",2019-07-02 12:41:00,12,1
4834334,88564845,1260,16261,2019-07-02 20:00:00,81,"Great Titchfield Street, Fitzrovia",2019-07-02 19:00:00,3,"Christopher Street, Liverpool Street",2019-07-02 19:13:00,19,1
4834337,88544492,360,12885,2019-07-02 12:00:00,177,"Ashley Place, Victoria",2019-07-02 12:00:00,185,"Alderney Street, Pimlico",2019-07-02 12:21:00,12,1
4834339,88545262,660,12172,2019-07-02 13:00:00,318,"Sackville Street, Mayfair",2019-07-02 13:00:00,17,"Hatton Wall, Holborn",2019-07-02 12:46:00,12,1
4834344,88500842,360,16335,2019-07-01 11:00:00,177,"Ashley Place, Victoria",2019-07-01 11:00:00,185,"Alderney Street, Pimlico",2019-07-01 11:18:00,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6126742,89511998,1140,13254,2019-07-26 22:00:00,697,"Charlotte Terrace, Angel",2019-07-26 22:00:00,136,"Queen Victoria Street, St. Paul's",2019-07-26 21:35:00,21,4
6126743,89463276,5100,8628,2019-07-25 18:00:00,130,"Tower Gardens , Tower",2019-07-25 17:00:00,136,"Queen Victoria Street, St. Paul's",2019-07-25 16:32:00,16,3
6126744,89455086,780,4207,2019-07-25 12:00:00,449,"Shadwell Station, Shadwell",2019-07-25 12:00:00,136,"Queen Victoria Street, St. Paul's",2019-07-25 11:37:00,11,3
6249003,89636625,240,517,2019-07-31 00:00:00,452,"St. Katharine's Way, Tower",2019-07-31 00:00:00,130,"Tower Gardens , Tower",2019-07-31 00:00:00,0,2


In [71]:
bd_Jul_2019_clean2.sort_values(by="End Date")

Unnamed: 0,Rental Id,Duration,Bike Id,End Date,EndStation Id,EndStation Name,Start Date,StartStation Id,StartStation Name,Start Date Time,Hour,Day
5095984,88487560,1260,13300,2019-07-01,808,"Stockwell Roundabout, Stockwell",2019-07-01 00:00:00,368,"Harriet Street, Knightsbridge",2019-07-01 00:05:00,0,0
5078204,88487527,360,401,2019-07-01,4,"St. Chad's Street, King's Cross",2019-07-01 00:00:00,93,"Cloudesley Road, Angel",2019-07-01 00:00:00,0,0
4837208,88487534,900,8699,2019-07-01,451,"Hermitage Court, Wapping",2019-07-01 00:00:00,506,"Bell Lane, Liverpool Street",2019-07-01 00:01:00,0,0
5088101,88487564,1140,4793,2019-07-01,693,"Felsham Road, Putney",2019-07-01 00:00:00,665,"Smugglers Way, Wandsworth",2019-07-01 00:06:00,0,0
4938665,88487556,1200,9470,2019-07-01,216,"Old Brompton Road, South Kensington",2019-07-01 00:00:00,191,"Hyde Park Corner, Hyde Park",2019-07-01 00:05:00,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6124390,89636465,1020,5027,2019-07-31,437,"Vauxhall Walk, Vauxhall",2019-07-31 00:00:00,839,"Sea Containers, South Bank",2019-07-30 23:31:00,23,1
6015515,89636369,1080,5117,2019-07-31,445,"Cheshire Street, Bethnal Green",2019-07-30 23:00:00,475,"Lightermans Road, Millwall",2019-07-30 23:19:00,23,1
6038339,89636382,600,8986,2019-07-31,192,"Wardour Street, Soho",2019-07-30 23:00:00,382,"Farm Street, Mayfair",2019-07-30 23:20:00,23,1
6094762,89635834,4620,14380,2019-07-31,620,"Surrey Lane, Battersea",2019-07-30 22:00:00,620,"Surrey Lane, Battersea",2019-07-30 22:26:00,22,1


In [75]:
def compute_single_events(df, which):
    """Read from df all the events, either departures or arrivals depending on
    whether `which` is "Start" or "End", and collect them in a DataFrame that
    lists event counts per station and time.
    """
    stationcol = "{}Station Id".format(which)
    datecol = "{} Date".format(which)
    events = (
        df.rename(columns={stationcol: "Station", datecol: "Date"})
        .groupby(["Date", "Station"])
        .size()
        .unstack("Station")
    )
    return events

In [77]:
compute_single_events(bd_Jul_2019_clean2, "End")

Station,1,2,3,4,5,6,7,8,9,10,...,829,830,831,832,833,834,835,836,838,839
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-07-01 00:00:00,,,,1.0,,,,,,,...,,,,,,,,,,
2019-07-01 01:00:00,,,,1.0,,,,1.0,,,...,,,,,1.0,,,,,
2019-07-01 02:00:00,,,,,,1.0,,,,,...,,,,,,,,,,
2019-07-01 03:00:00,,,,,,,,,,,...,,,,,,,,,,
2019-07-01 04:00:00,,,,,,,,,,,...,,,,1.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-07-30 20:00:00,1.0,,2.0,1.0,3.0,2.0,2.0,2.0,1.0,,...,1.0,2.0,3.0,4.0,3.0,,,,2.0,6.0
2019-07-30 21:00:00,,,,1.0,,1.0,,1.0,1.0,,...,,,,2.0,1.0,,,,,
2019-07-30 22:00:00,,,1.0,,2.0,,1.0,1.0,,,...,1.0,3.0,1.0,3.0,,,,,,
2019-07-30 23:00:00,,1.0,,,,,1.0,,,1.0,...,,1.0,,,1.0,,,,1.0,1.0


In [78]:
def compute_both_events(df):
    """Read from df all the events, both arrivals and departures, and collect
    them in a DataFrame that lists event counts per station and time.
    """
    arrivals = compute_single_events(df, "End")
    departures = compute_single_events(df, "Start")
    both = (
        pd.concat(
            [arrivals, departures], keys=["Arrivals", "Departures"], axis=1
        )
        .reorder_levels([1, 0], axis=1)
        .fillna(0.0)
    )
    return both

In [79]:
compute_both_events(bd_Jul_2019_clean2)

Station,1,2,3,4,5,6,7,8,9,10,...,829,830,831,832,833,834,835,836,838,839
Unnamed: 0_level_1,Arrivals,Arrivals,Arrivals,Arrivals,Arrivals,Arrivals,Arrivals,Arrivals,Arrivals,Arrivals,...,Departures,Departures,Departures,Departures,Departures,Departures,Departures,Departures,Departures,Departures
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-07-01 00:00:00,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2019-07-01 01:00:00,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-07-01 02:00:00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-07-01 03:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-07-01 04:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-07-30 20:00:00,1.0,0.0,2.0,1.0,3.0,2.0,2.0,2.0,1.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,5.0,4.0
2019-07-30 21:00:00,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-07-30 22:00:00,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0
2019-07-30 23:00:00,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [80]:
def castable_to_int(obj):
    """Return True if obj is castable to int, False otherwise."""
    try:
        int(obj)
        return True
    except ValueError:
        return False

In [84]:
def cast_to_int(df, colname):
    """Cast df[colname] to dtype int. All rows that are not castable to int are
    dropped. df is partially modified in place, but the return value should be
    used.
    """
    try:
        df = df.astype({colname: np.int_}, copy=False)
    except ValueError:
        castable_rows = df[colname].apply(castable_to_int)
        df = df[castable_rows]
        df = df.astype({colname: np.int_}, copy=False)
    return df


In [82]:
bikefolder = "data/bikes"

In [85]:
# events is the DataFrame we are constructing. First check if it's already on
# disk.
events_path = Path("./events.p")
if events_path.exists():
    events = pd.read_pickle(events_path)
else:
    # Collect the paths to all the CSV files.
    datafiles = sorted(os.listdir(bikefolder))
    folderpath = Path(bikefolder)
    datapaths = [folderpath / Path(file) for file in datafiles]
    datapaths = [p for p in datapaths if p.suffix == ".csv"]

    # Initialize a dictionary that will have as keys station ID numbers, and as
    # values sets that include all the names this station has had in the files.
    station_allnames = {}

    # Each CSV file will list events in some time window. We process them
    # one-by-one, collect all the DataFrames for individual time windows to
    # `pieces`, and concatenate them at the end.

    pieces = []
    # Columns of the CSV files that we need.
    cols = [
        "Duration",
        "End Date",
        "EndStation Id",
        "EndStation Name",
        "Start Date",
        "StartStation Id",
        "StartStation Name",
    ]
    # At least one CSV file gives us trouble because it doesn't list station
    # IDs, only station names. We'll collect the paths to those CSV files to
    # `problem_paths` and deal with them at the end.
    problem_paths = []
    for path in datapaths:
        print("Processing {}".format(path))
        try:
            df = pd.read_csv(path, usecols=cols, encoding="ISO-8859-2")
        except ValueError as e:
            # Some files have missing or abnormaly named columns. We'll deal
            # with them later.
            problem_paths.append(path)
            continue
        # Drop any rows that have missing values.
        df = df[~df.isna().any(axis=1)]
        # Drop any anomalously short trips. Probably somebody just taking a
        # bike and putting it right back in. Durations are in seconds.
        df = df[df["Duration"] > 60]
        # Cast the columns to the right types. This is easier ones NAs have
        # been dropped.
        df = cast_to_int(df, "EndStation Id")
        df = cast_to_int(df, "StartStation Id")
        # Turn the date columns from strings into datetime objects rounded to
        # the hour.
        df = clean_datetime_column(df, "End Date")
        df = clean_datetime_column(df, "Start Date")
        events = compute_both_events(df)
        pieces.append(events)

        # Add station names appearing in this file to our collection of names.
        add_station_names(
            station_allnames, df, "EndStation Name", "EndStation Id"
        )
        add_station_names(
            station_allnames, df, "StartStation Name", "StartStation Id"
        )

    # Now that we've collected all the different names that the same station
    # goes by, we'll pick one of them to be the name we'll use. We do this by
    # just picking the one that is alphabetically first. We'll also make a
    # dictionary that goes the other way around, for each name it gives the
    # corresponding station ID.
    station_ids = {}
    station_names = {}
    for k, v in station_allnames.items():
        v = sorted(v)
        station_names[k] = v[0]
        for name in v:
            station_ids[name] = k

    def get_station_id(name):
        try:
            return station_ids[name]
        except KeyError:
            return np.nan

    # Let's deal with the problem cases. They are ones that are missing station
    # ID columns.  They do have the station names though, so we'll use those
    # to, with the above dictionary to get the IDs.
    print("Doing the problem cases ({} of them).".format(len(problem_paths)))
    safe_cols = [
        "Duration",
        "End Date",
        "EndStation Name",
        "Start Date",
        "StartStation Name",
    ]
    for path in problem_paths:
        print(path)
        df = pd.read_csv(path, usecols=safe_cols, encoding="ISO-8859-2")
        # Drop any rows that have missing values.
        df = df[~df.isna().any(axis=1)]
        # Drop any anomalously short trips. Probably somebody just taking a
        # bike and putting it right back in.
        df = df[df["Duration"] > 60]
        # Add a column of station IDs, based on names.
        df["EndStation Id"] = df["EndStation Name"].apply(get_station_id)
        df["StartStation Id"] = df["StartStation Name"].apply(get_station_id)
        # Turn the date columns from strings into datetime objects rounded to
        # the hour.
        clean_datetime_column(df, "End Date")
        clean_datetime_column(df, "Start Date")
        events_piece = compute_both_events(df)
        pieces.append(events_piece)

    # Finally, concatenate all the data we've accumulated into a single
    # DataFrame.
    events = pd.concat(pieces).fillna(0.0)
    # Several files may have contained entries for the same hour, which means
    # that events has duplicate entries in the index. Get rid of them by
    # summing.
    events = events.groupby("Date").sum().sort_index()
    # Finally rename the columns according to the chosen names for stations.
    events = events.rename(mapper=station_names, axis=1, level=0)

    # Store the file on disk so we can read it later.
    events.to_pickle(events_path)


Processing data\bikes\01aJourneyDataExtract10Jan16-23Jan16.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\01bJourneyDataExtract24Jan16-06Feb16.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\02aJourneyDataExtract07Feb16-20Feb2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\02bJourneyDataExtract21Feb16-05Mar2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\03JourneyDataExtract06Mar2016-31Mar2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\04JourneyDataExtract01Apr2016-30Apr2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\05JourneyDataExtract01May2016-17May2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\06JourneyDataExtract18May2016-24May2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\07JourneyDataExtract25May2016-31May2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\08JourneyDataExtract01Jun2016-07Jun2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\09JourneyDataExtract08Jun2016-14Jun2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\1. Journey Data Extract 01Jan-05Jan13.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\1. Journey Data Extract 04Jan-31Jan 12.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\1. Journey Data Extract 05Jan14-02Feb14.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\10. Journey Data Extract 18Aug-13Sep13.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\10. Journey Data Extract 21Aug-22 Aug12.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\10JourneyDataExtract15Jun2016-21Jun2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\10a Journey Data Extract 20Sep15-03Oct15.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\10a. Journey Data Extract 14Sep14-27Sep14.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\10b Journey Data Extract 04Oct15-17Oct15.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\10b. Journey Data Extract 28Sep14-11Oct14.csv


  df = pd.read_csv(path, usecols=cols, encoding="ISO-8859-2")
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\11. Journey Data Extract 14Sep13-12Oct13.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\11. Journey Data Extract 23Aug-25 Aug12.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\11JourneyDataExtract22Jun2016-28Jun2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\11a Journey Data Extract 18Oct15-31Oct15.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\11a. Journey Data Extract 12Oct14-08Nov14.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\11b Journey Data Extract 01Nov15-14Nov15.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\11b. Journey Data Extract 12Oct14-08Nov14.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\12. Journey Data Extract 13Oct13-09Nov13.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\12. Journey Data Extract 26Aug-27 Aug12.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\12JourneyDataExtract29Jun2016-05Jul2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\12a Journey Data Extract 15Nov15-27Nov15.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\12a. Journey Data Extract 09Nov14-06Dec14.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\12b Journey Data Extract 28Nov15-12Dec15.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\12b. Journey Data Extract 09Nov14-06Dec14.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\13. Journey Data Extract 10Nov13-07Dec13.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\13. Journey Data Extract 28Aug-29 Aug12.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\13JourneyDataExtract06Jul2016-12Jul2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\13a Journey Data Extract 13Dec15-24Dec15.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\13a. Journey Data Extract 07Dec14-21Dec14.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\13b Journey Data Extract 25Dec15-09Jan16.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\13b. Journey Data Extract 22Dec14-03Jan15.csv


  df = pd.read_csv(path, usecols=cols, encoding="ISO-8859-2")
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\14. Journey Data Extract 08Dec13-04Jan14.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\14. Journey Data Extract 30Aug-31 Aug12.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\142JourneyDataExtract26Dec2018-01Jan2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\143JourneyDataExtract02Jan2019-08Jan2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\144JourneyDataExtract09Jan2019-15Jan2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\145JourneyDataExtract16Jan2019-22Jan2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\146JourneyDataExtract23Jan2019-29Jan2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\147JourneyDataExtract30Jan2019-05Feb2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\148JourneyDataExtract06Feb2019-12Feb2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\149JourneyDataExtract13Feb2019-19Feb2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\14JourneyDataExtract13Jul2016-19Jul2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\15. Journey Data Extract 01Sep-30Sep12.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\150JourneyDataExtract20Feb2019-26Feb2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\151JourneyDataExtract27Feb2019-05Mar2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\152JourneyDataExtract06Mar2019-12Mar2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\153JourneyDataExtract13Mar2019-19Mar2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\154JourneyDataExtract20Mar2019-26Mar2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\155JourneyDataExtract27Mar2019-02Apr2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\156JourneyDataExtract03Apr2019-09Apr2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\157JourneyDataExtract10Apr2019-16Apr2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\158JourneyDataExtract17Apr2019-23Apr2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\159JourneyDataExtract24Apr2019-30Apr2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\15JourneyDataExtract20Jul2016-26Jul2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\16. Journey Data Extract 01Oct-31Oct12.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\160JourneyDataExtract01May2019-07May2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\161JourneyDataExtract08May2019-14May2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\162JourneyDataExtract15May2019-21May2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\163JourneyDataExtract22May2019-28May2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\164JourneyDataExtract29May2019-04Jun2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\165JourneyDataExtract05Jun2019-11Jun2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\166JourneyDataExtract12Jun2019-18Jun2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\167JourneyDataExtract19Jun2019-25Jun2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\168JourneyDataExtract26Jun2019-02Jul2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\169JourneyDataExtract03Jul2019-09Jul2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\16JourneyDataExtract27Jul2016-02Aug2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\17. Journey Data Extract 01Nov-30Nov12.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\170JourneyDataExtract10Jul2019-16Jul2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\171JourneyDataExtract17Jul2019-23Jul2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\172JourneyDataExtract24Jul2019-30Jul2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\173JourneyDataExtract31Jul2019-06Aug2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\174JourneyDataExtract07Aug2019-13Aug2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\175JourneyDataExtract14Aug2019-20Aug2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\176JourneyDataExtract21Aug2019-27Aug2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\177JourneyDataExtract28Aug2019-03Sep2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\178JourneyDataExtract04Sep2019-10Sep2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\179JourneyDataExtract11Sep2019-17Sep2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\17JourneyDataExtract03Aug2016-09Aug2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\18. Journey Data Extract 01Dec-31Dec12.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\180JourneyDataExtract18Sep2019-24Sep2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\181JourneyDataExtract25Sep2019-01Oct2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\182JourneyDataExtract02Oct2019-08Oct2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\183JourneyDataExtract09Oct2019-15Oct2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\184JourneyDataExtract16Oct2019-22Oct2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\185JourneyDataExtract23Oct2019-29Oct2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\186JourneyDataExtract30Oct2019-05Nov2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\187JourneyDataExtract06Nov2019-12Nov2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\188JourneyDataExtract13Nov2019-19Nov2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\189JourneyDataExtract20Nov2019-26Nov2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\18JourneyDataExtract10Aug2016-16Aug2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\190JourneyDataExtract27Nov2019-03Dec2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\191JourneyDataExtract04Dec2019-10Dec2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\192JourneyDataExtract11Dec2019-17Dec2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\193JourneyDataExtract18Dec2019-24Dec2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\194JourneyDataExtract25Dec2019-31Dec2019.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\19JourneyDataExtract17Aug2016-23Aug2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\1a.JourneyDataExtract04Jan15-17Jan15.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\1b.JourneyDataExtract18Jan15-31Jan15.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\2. Journey Data Extract 03Feb14-01Mar14.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\2. Journey Data Extract 06Jan-02Feb13.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\2. Journey Data Extract_01Feb-29Feb 12.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\20JourneyDataExtract24Aug2016-30Aug2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\21JourneyDataExtract31Aug2016-06Sep2016.csv
Processing data\bikes\22JourneyDataExtract07Sep2016-13Sep2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\23JourneyDataExtract14Sep2016-20Sep2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\24JourneyDataExtract21Sep2016-27Sep2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\25JourneyDataExtract28Sep2016-04Oct2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\26JourneyDataExtract05Oct2016-11Oct2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\27JourneyDataExtract12Oct2016-18Oct2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\28JourneyDataExtract19Oct2016-25Oct2016.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)
  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


Processing data\bikes\298JourneyDataExtract29Dec2021-04Jan2022.csv


  df.loc[:, colname] = pd.to_datetime(df[colname], format=format)
  df.loc[:, colname] = df[colname].dt.round(roundto)


IndexError: single positional indexer is out-of-bounds