# MAP 536 - Python for Data Science - Predicting Cyclist Traffic in Paris

## Prediction

### Adding weather data

Import all necessary packages.

In [47]:

import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit


Load datasets & set target

In [109]:
# Load training and testing datasets
train_data = pd.read_parquet(Path("data") / "train.parquet")
test_data = pd.read_parquet(Path("data") / "test.parquet")

# Counter and site descriptor columns removed from the training frame.
# test_data keeps all of its columns in this cell.
train_columns_to_drop = [
    'counter_name',
    'site_name',
    'counter_id',
    'counter_installation_date',
    'counter_technical_id',
    'site_id',
]

# Reassign instead of using inplace=True: the result is the same frame content,
# but the cell no longer mutates an object in place.
train_data = train_data.drop(columns=train_columns_to_drop)
train_data.tail()

Unnamed: 0,bike_count,date,latitude,longitude,log_bike_count
928450,51.0,2021-08-08 18:00:00,48.83977,2.30198,3.951244
928453,1.0,2021-08-09 02:00:00,48.83977,2.30198,0.693147
928456,61.0,2021-08-09 08:00:00,48.83977,2.30198,4.127134
928459,44.0,2021-08-09 10:00:00,48.83977,2.30198,3.806662
928462,83.0,2021-08-09 17:00:00,48.83977,2.30198,4.430817


In [112]:
# Columns removed from the raw hourly weather export.
# Each label is listed once (the original list repeated 'sealevelpressure').
weather_columns_to_drop = [
    'name',
    'dew',
    'precipprob',
    'preciptype',
    'uvindex',
    'icon',
    'stations',
    'sealevelpressure',
    'winddir',
    'conditions',
    'severerisk',
]

weather_data = pd.read_csv(Path("data") / "hourly-weather-data.csv")
weather_data = weather_data.drop(columns=weather_columns_to_drop)

# Summary of remaining columns / null counts, then a preview of the last rows.
weather_data.info()
weather_data.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20423 entries, 0 to 20422
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   datetime        20423 non-null  object 
 1   temp            20423 non-null  float64
 2   feelslike       20423 non-null  float64
 3   humidity        20423 non-null  float64
 4   precip          20423 non-null  float64
 5   snow            20419 non-null  float64
 6   snowdepth       20419 non-null  float64
 7   windgust        20267 non-null  float64
 8   windspeed       20423 non-null  float64
 9   cloudcover      20423 non-null  float64
 10  visibility      20419 non-null  float64
 11  solarradiation  20423 non-null  float64
 12  solarenergy     20423 non-null  float64
dtypes: float64(12), object(1)
memory usage: 2.0+ MB


Unnamed: 0,datetime,temp,feelslike,humidity,precip,snow,snowdepth,windgust,windspeed,cloudcover,visibility,solarradiation,solarenergy
20418,2022-04-30T19:00:00,16.4,16.4,41.4,0.0,0.0,0.0,42.2,16.1,24.2,22.0,366.0,1.3
20419,2022-04-30T20:00:00,14.8,14.8,42.28,0.0,0.0,0.0,42.9,18.5,15.8,21.7,164.0,0.6
20420,2022-04-30T21:00:00,13.2,13.2,49.26,0.0,0.0,0.0,35.3,15.9,15.8,23.5,23.0,0.1
20421,2022-04-30T22:00:00,12.3,12.3,52.47,0.0,0.0,0.0,31.9,12.7,32.6,22.7,0.0,0.0
20422,2022-04-30T23:00:00,11.6,11.6,52.26,0.0,0.0,0.0,36.7,17.8,46.7,24.6,0.0,0.0


In [118]:
# Convert both timestamp columns to pandas datetimes; they are the keys
# used when the bike counts are joined with the weather readings.
train_data = train_data.assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
weather_data = weather_data.assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'])
)
weather_data.head()


Unnamed: 0,datetime,temp,feelslike,humidity,precip,snow,snowdepth,windgust,windspeed,cloudcover,visibility,solarradiation,solarenergy
0,2020-01-01 00:00:00,1.0,-1.8,92.21,0.0,0.0,0.0,14.8,8.8,21.3,3.9,0.0,0.0
1,2020-01-01 01:00:00,0.3,-1.4,93.99,0.0,0.0,0.0,14.5,5.3,87.5,1.7,0.0,0.0
2,2020-01-01 02:00:00,-0.7,-3.8,95.71,0.0,0.0,0.0,16.9,8.9,87.0,0.9,0.0,0.0
3,2020-01-01 03:00:00,-0.5,-3.4,97.66,0.0,0.0,0.0,19.9,8.2,80.0,0.9,0.0,0.0
4,2020-01-01 04:00:00,-0.1,-2.1,97.36,0.0,0.0,0.0,21.8,5.8,97.2,0.1,0.0,0.0


In [116]:
# Preview the last five rows of the training frame.
train_data.tail(n=5)

Unnamed: 0,bike_count,date,latitude,longitude,log_bike_count
928450,51.0,2021-08-08 18:00:00,48.83977,2.30198,3.951244
928453,1.0,2021-08-09 02:00:00,48.83977,2.30198,0.693147
928456,61.0,2021-08-09 08:00:00,48.83977,2.30198,4.127134
928459,44.0,2021-08-09 10:00:00,48.83977,2.30198,3.806662
928462,83.0,2021-08-09 17:00:00,48.83977,2.30198,4.430817


In [119]:
# Inner join of bike counts with weather readings on their timestamps:
# only rows whose 'date' has a matching weather 'datetime' are kept, and
# both key columns ('date' and 'datetime') remain in the result.
merged_train_data = train_data.merge(
    weather_data,
    how='inner',
    left_on='date',
    right_on='datetime',
)
merged_train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455163 entries, 0 to 455162
Data columns (total 18 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   bike_count      455163 non-null  float64       
 1   date            455163 non-null  datetime64[us]
 2   latitude        455163 non-null  float64       
 3   longitude       455163 non-null  float64       
 4   log_bike_count  455163 non-null  float64       
 5   datetime        455163 non-null  datetime64[ns]
 6   temp            455163 non-null  float64       
 7   feelslike       455163 non-null  float64       
 8   humidity        455163 non-null  float64       
 9   precip          455163 non-null  float64       
 10  snow            454997 non-null  float64       
 11  snowdepth       454997 non-null  float64       
 12  windgust        448063 non-null  float64       
 13  windspeed       455163 non-null  float64       
 14  cloudcover      455163 non-null  flo

In [120]:
# Preview the first five rows of the merged frame.
merged_train_data.head(n=5)

Unnamed: 0,bike_count,date,latitude,longitude,log_bike_count,datetime,temp,feelslike,humidity,precip,snow,snowdepth,windgust,windspeed,cloudcover,visibility,solarradiation,solarenergy
0,0.0,2020-09-01 02:00:00,48.846028,2.375429,0.0,2020-09-01 02:00:00,13.3,13.3,77.67,0.0,0.0,0.0,13.6,7.0,10.0,26.4,0.0,0.0
1,2.0,2020-09-01 02:00:00,48.846028,2.375429,1.098612,2020-09-01 02:00:00,13.3,13.3,77.67,0.0,0.0,0.0,13.6,7.0,10.0,26.4,0.0,0.0
2,5.0,2020-09-01 02:00:00,48.83436,2.377,1.791759,2020-09-01 02:00:00,13.3,13.3,77.67,0.0,0.0,0.0,13.6,7.0,10.0,26.4,0.0,0.0
3,1.0,2020-09-01 02:00:00,48.83436,2.377,0.693147,2020-09-01 02:00:00,13.3,13.3,77.67,0.0,0.0,0.0,13.6,7.0,10.0,26.4,0.0,0.0
4,0.0,2020-09-01 02:00:00,48.85372,2.35702,0.0,2020-09-01 02:00:00,13.3,13.3,77.67,0.0,0.0,0.0,13.6,7.0,10.0,26.4,0.0,0.0
