# Predicting the number of cycles in London

In [12]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from plotly import tools
from cycle_flow import *

# Set up plotly's offline notebook mode before any figure is drawn with iplot().
init_notebook_mode(connected=True)

## Load data

In [13]:
# Load the raw TfL cycle-flow workbook; sheet_name=1 picks the sheet by
# zero-based position, i.e. the second sheet.
data_df = pd.read_excel(
    '/project/london_cycle_flow/tfl-cycle-flows-tlrn.xlsx',
    sheet_name=1,
)

In [14]:
# Preview only the first rows: the raw sheet still carries its embedded header
# row and mostly empty columns, so dumping the whole frame adds nothing.
data_df.head(10)

Unnamed: 0,Unnamed: 1.1,Unnamed: 2.1,Unnamed: 3.1,Unnamed: 4.1,Unnamed: 5.1,Comparisons analysis,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
Period and Financial year,Start period,End period,Pedal Cycle Counts Indexed,Periodic Target Index (adjusted for seasonality),,compared to same period last year,compared to previous period,compared to target for period,This year's rolling 13 period average compared...,Notable weather events,Av Temp (Celsius),Av Feels Like temp (Celsius),Total Rainfall (mm),Av Wet Hours per day
13_99/00,,,100,,,,,,,,,,,
01_00/01,,,96.27216550241805,,,,,,,,,,,
02_00/01,,,102.43148844707147,,,,,,,,,,,
03_00/01,,,105.16355454056959,,,,,,,,,,,
04_00/01,,,105.13668726491134,,,,,,,,,,,
05_00/01,,,102.83617678667383,,,,,,,,,,,
06_00/01,,,104.78237506716819,,,,,,,,,,,
07_00/01,,,103.95116872649113,,,,,,,,,,,
08_00/01,,,102.11411875335841,,,,,,,,,,,


## Clean data

In [15]:
# First cleaning pass on the raw sheet:
#   1. turn the index levels into ordinary level_0..level_5 columns,
#   2. discard level_5,
#   3. keep only rows without any missing value,
#   4. skip the first surviving row (presumably the header row embedded in
#      the sheet -- confirm against the workbook).
cleaner_data = (
    data_df
    .reset_index()
    .drop(['level_5'], axis=1)
    .dropna()
    .iloc[1:]
)

In [16]:
# Columns that are kept, mapped from their sheet-derived names to readable ones.
renaming = dict([
    ('level_1', 'start'),
    ('level_2', 'end'),
    ('level_3', 'cycle_counts'),
    ('Unnamed: 5', 'avg_temp_c'),
    ('Unnamed: 7', 'total_rainfall_mm'),
    ('Unnamed: 8', 'avg_wet_hrs_per_day'),
])

# Columns that are discarded: two index-derived levels, the comparison column,
# and every remaining "Unnamed: N" column that is not renamed above.
to_drop = ['level_0', 'level_4', 'Comparisons analysis']
to_drop += [f'Unnamed: {position}' for position in (1, 2, 3, 4, 6)]

In [17]:
# Apply the column mapping, remove the unwanted columns, and renumber the rows
# from zero without keeping the old index as a column.
# NOTE: this cell rebinds `cleaner_data`, so it can only be run once per
# kernel session (a second run would try to drop columns that are gone).
cleaner_data = (
    cleaner_data
    .rename(columns=renaming)
    .drop(columns=to_drop)
    .reset_index(drop=True)
)

In [18]:
# Preview the first rows of the manually cleaned frame rather than dumping it whole.
cleaner_data.head(10)

Unnamed: 0,start,end,cycle_counts,avg_temp_c,total_rainfall_mm,avg_wet_hrs_per_day
0,2010-11-14 00:00:00,2010-12-11 00:00:00,208.174,2.44643,17.6,1.225
1,2010-12-12 00:00:00,2011-01-08 00:00:00,125.683,2.375,44.8,1.43571
2,2011-04-01 00:00:00,2011-04-30 00:00:00,279.636,13.6,1.8,0.13
3,2011-05-01 00:00:00,2011-05-28 00:00:00,312.09,14.2071,22.9,0.346429
4,2011-07-24 00:00:00,2011-08-20 00:00:00,298.191,17.6607,43.8,0.739286
5,2011-08-21 00:00:00,2011-09-17 00:00:00,271.103,16.0107,65.5,1.23571
6,2011-09-18 00:00:00,2011-10-15 00:00:00,314.117,15.8107,11.6,0.571429
7,2011-10-16 00:00:00,2011-11-12 00:00:00,279.749,11.8821,33.3,1.18929
8,2011-11-13 00:00:00,2011-12-10 00:00:00,264.508,8.30714,9.9,0.510714
9,2011-12-11 00:00:00,2012-01-07 00:00:00,153.256,7.31071,74.3,2.41071


Use a function that cleans the data: `clean_data` is applied to the raw frame and should reproduce the manual cleaning steps above.

In [19]:
# Run the packaged cleaning routine on the raw frame. `clean_data` is not
# defined in this notebook (presumably it comes from `cycle_flow` -- confirm).
clean_data_df = clean_data(data_df)

In [20]:
# Preview the first rows of the function-cleaned frame rather than dumping it whole.
clean_data_df.head(10)

Unnamed: 0,start,end,cycle_counts,avg_temp_c,total_rainfall_mm,avg_wet_hrs_per_day
0,2010-11-14 00:00:00,2010-12-11 00:00:00,208.174,2.44643,17.6,1.225
1,2010-12-12 00:00:00,2011-01-08 00:00:00,125.683,2.375,44.8,1.43571
2,2011-04-01 00:00:00,2011-04-30 00:00:00,279.636,13.6,1.8,0.13
3,2011-05-01 00:00:00,2011-05-28 00:00:00,312.09,14.2071,22.9,0.346429
4,2011-07-24 00:00:00,2011-08-20 00:00:00,298.191,17.6607,43.8,0.739286
5,2011-08-21 00:00:00,2011-09-17 00:00:00,271.103,16.0107,65.5,1.23571
6,2011-09-18 00:00:00,2011-10-15 00:00:00,314.117,15.8107,11.6,0.571429
7,2011-10-16 00:00:00,2011-11-12 00:00:00,279.749,11.8821,33.3,1.18929
8,2011-11-13 00:00:00,2011-12-10 00:00:00,264.508,8.30714,9.9,0.510714
9,2011-12-11 00:00:00,2012-01-07 00:00:00,153.256,7.31071,74.3,2.41071


In [21]:
# Sanity check: the frame returned by clean_data() must be identical to the one
# built step by step in the cells above.
clean_data_df.equals(cleaner_data)

True

## Plot

In [22]:
# Helper defined outside this notebook; presumably charts cycle counts over
# time to show the seasonal pattern -- confirm in its source.
plot_seasonality(clean_data_df)

In [23]:
# Helper defined outside this notebook; the printed grid layout below suggests
# it draws two side-by-side 2D subplots -- confirm in its source.
plot2d(clean_data_df)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



In [51]:
# One marker per row: average temperature on x, cycle counts on y.
trace = go.Scatter(
    x=clean_data_df['avg_temp_c'],
    y=clean_data_df['cycle_counts'],
    mode='markers',
)
data = [trace]

# Fixed 600x600 canvas with labelled axes.
layout = go.Layout(
    height=600,
    width=600,
    xaxis={'title': 'average temperature (C)'},
    yaxis={'title': 'cycle counts'},
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

Plot the cycle flow against the total rainfall and the average temperature.

In [24]:
# Helper defined outside this notebook; presumably the 3D scatter described in
# the text above -- confirm in its source.
plot3d(clean_data_df)

## Train a linear regressor on the data and export the model

In [57]:
from sklearn.linear_model import LinearRegression

In [58]:
# Feature matrix: rainfall first, temperature second. The prediction grid built
# later relies on this column order.
X = clean_data_df[['total_rainfall_mm', 'avg_temp_c']]

# Target as an (n_rows, 1) column vector.
Y = np.array(clean_data_df['cycle_counts']).reshape(-1, 1)

In [59]:
# Linear regressor with default settings.
lr = LinearRegression()

In [60]:
# Fit cycle counts (Y) on total rainfall and average temperature (X).
lr.fit(X, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [61]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package and fall back to the legacy location only
# on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [62]:
# Persist the fitted regressor to 'model.pkl' in the current working directory.
# NOTE(review): the same dump is repeated in the "Save the trained model" cell
# further down; one of the two is redundant.
joblib.dump(lr, 'model.pkl')

['model.pkl']

## Create grid to plot the predictions from the trained model

In [32]:
# Axis samples for the prediction grid, 100 evenly spaced points each:
# xc spans 0..120 (used as the rainfall axis), yc spans 0..20 (temperature axis).
xc = np.linspace(start=0, stop=120, num=100)
yc = np.linspace(start=0, stop=20, num=100)

In [33]:
# Expand the two 1-D axes into 2-D coordinate matrices covering every (xc, yc) pair.
xgrid, ygrid = np.meshgrid(xc, yc)

In [34]:
# Flatten both coordinate matrices and pair them up: one row per grid point,
# column 0 from xgrid and column 1 from ygrid.
grid = np.column_stack((xgrid.ravel(), ygrid.ravel()))

In [35]:
# Inspect the model output for every grid point (one prediction per row of `grid`).
lr.predict(grid)

array([[198.615916  ],
       [198.26446037],
       [197.91300474],
       ...,
       [327.34574624],
       [326.99429061],
       [326.64283498]])

In [36]:
# Number of observations available in the cleaned data.
cleaner_data['total_rainfall_mm'].shape

(36,)

In [37]:
# Observed periods: rainfall (x), temperature (y), cycle counts (z).
trace1 = go.Scatter3d(
    x=cleaner_data['total_rainfall_mm'],
    y=cleaner_data['avg_temp_c'],
    z=cleaner_data['cycle_counts'],
    mode='markers',
    marker={'size': 5},
    name='data',
)

# Model output at every grid point, drawn as small translucent markers.
trace2 = go.Scatter3d(
    x=grid[:, 0],
    y=grid[:, 1],
    z=lr.predict(grid).flatten(),
    mode='markers',
    marker={
        'size': 1,
        'color': 'rgb(255,255,102)',
        'opacity': 0.6,
    },
    name='prediction',
)

# A single hypothetical input, held in 1-element arrays so the two values can
# be stacked into one (1, 2) row for predict().
new_rainfall = np.array([80.0])
new_temperature = np.array([10.0])

trace3 = go.Scatter3d(
    x=new_rainfall,
    y=new_temperature,
    z=lr.predict(np.vstack((new_rainfall, new_temperature)).T).flatten(),
    name='new prediction',
    mode='markers',
    marker={'size': 5, 'color': 'red'},
)

data = [trace1, trace2, trace3]

layout = go.Layout(
    margin={'t': 50, 'b': 50},
    scene={
        'xaxis': {'title': 'total rainfall (mm)'},
        'yaxis': {'title': 'average temperature (C)'},
        'zaxis': {'title': 'cycle counts'},
    },
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [38]:
# Helper defined outside this notebook; presumably packages the 3D
# data-plus-prediction figure built by hand in the previous cell -- confirm.
plot_predictions(clean_data_df, grid, lr)

In [39]:
# Helper defined outside this notebook; presumably builds the same kind of
# (rainfall, temperature) grid as the manual meshgrid cells -- confirm.
grid2 = create_grid()

In [40]:
# Same plot as before, this time using the grid produced by create_grid().
plot_predictions(clean_data_df, grid2, lr)

In [41]:
from sklearn.metrics import r2_score

In [42]:
# R^2 of the fitted model on the same rows it was trained on. Despite the
# variable name this is a score, not an error: 1.0 would be a perfect fit.
training_error = r2_score(Y, lr.predict(X))

In [43]:
# Report the training-set R^2 (labelled "error" in the message, but it is a score).
print(f'Training error (r2 score): {training_error}')

Training error (r2 score): 0.6514488442532727


Save the trained model.

In [44]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package and fall back to the legacy location only
# on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [45]:
# Persist the fitted regressor to 'model.pkl' in the current working directory.
joblib.dump(lr, 'model.pkl')

['model.pkl']

Reload the saved model.

In [46]:
# Reload from the same relative path that joblib.dump() wrote to above, so this
# cell round-trips the model that was just saved instead of depending on a
# hardcoded absolute directory matching the notebook's working directory.
lr2 = joblib.load('model.pkl')

In [47]:
# Rebuild the prediction grid with the external create_grid() helper
# (repeats the earlier create_grid() cell).
grid2 = create_grid()

In [48]:
# Plot with the reloaded model `lr2`; the figure should match the one drawn with `lr`.
plot_predictions(clean_data_df, grid2, lr2)