In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import ydata_profiling
from skrub import TableReport
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from vacances_scolaires_france import SchoolHolidayDates
from datetime import date
from jours_feries_france import JoursFeries

In [2]:
data = pd.read_parquet(Path("data") / "train.parquet")
data.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.693147
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1.609438
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2.302585


In [3]:
external_conditions = pd.read_csv('data/external_data.csv')
external_conditions

Unnamed: 0,numer_sta,date,pmer,tend,cod_tend,dd,ff,t,td,u,...,hnuage1,nnuage2,ctype2,hnuage2,nnuage3,ctype3,hnuage3,nnuage4,ctype4,hnuage4
0,7149,2021-01-01 00:00:00,100810,80,1,270,1.8,272.75,272.15,96,...,600.0,,,,,,,,,
1,7149,2021-01-01 03:00:00,100920,110,3,300,1.7,271.25,270.95,98,...,1500.0,2.0,3.0,3000.0,,,,,,
2,7149,2021-01-01 06:00:00,100950,30,3,290,2.6,271.95,271.65,98,...,480.0,4.0,6.0,2000.0,6.0,3.0,3000.0,,,
3,7149,2021-01-01 09:00:00,101100,150,2,280,1.7,272.45,272.05,97,...,1740.0,3.0,3.0,2800.0,,,,,,
4,7149,2021-01-01 12:00:00,101110,30,0,50,1.0,276.95,274.15,82,...,330.0,4.0,6.0,570.0,7.0,6.0,810.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,7149,2020-09-30 09:00:00,101540,-30,8,230,4.4,289.95,286.85,82,...,400.0,7.0,6.0,2200.0,,,,,,
3318,7149,2020-09-30 12:00:00,101320,-210,8,190,4.9,292.05,285.55,66,...,870.0,7.0,6.0,1900.0,,,,,,
3319,7149,2020-09-30 15:00:00,101140,-180,7,190,4.1,291.55,286.45,72,...,820.0,7.0,6.0,2200.0,,,,,,
3320,7149,2020-09-30 18:00:00,101020,-130,6,190,2.7,290.15,285.25,73,...,2160.0,,,,,,,,,


In [4]:
test_data = pd.read_parquet(Path("data") / "final_test.parquet")
test_data

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 01:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 13:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 17:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 19:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 22:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
...,...,...,...,...,...,...,...,...,...,...
51435,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18 11:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980
51436,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18 15:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980
51437,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18 17:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980
51438,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18 18:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980


In [5]:
# Dictionary mapping column names to their practical explanations
column_name_mapping = {
    'numer_sta': 'Station Number',
    'date': 'Date and Time',
    'pmer': 'Sea Level Pressure (hPa)',
    'tend': 'Pressure Tendency (hPa/3h)',
    'cod_tend': 'Pressure Tendency Code',
    'dd': 'Wind Direction (°)',
    'ff': 'Wind Speed (m/s)',
    't': 'Air Temperature (°C)',
    'td': 'Dew Point Temperature (°C)',
    'u': 'Relative Humidity (%)',
    'vv': 'Visibility (m)',
    'ww': 'Present Weather Code',
    'w1': 'Past Weather Code 1',
    'w2': 'Past Weather Code 2',
    'n': 'Total Cloud Cover (oktas)',
    'nbas': 'Cloud Base Height (m)',
    'hbas': 'Lowest Cloud Base Height (m)',
    'cl': 'Low Cloud Type',
    'cm': 'Medium Cloud Type',
    'ch': 'High Cloud Type',
    'pres': 'Station Level Pressure (hPa)',
    'niv_bar': 'Barometer Altitude (m)',
    'geop': 'Geopotential Height (m)',
    'tend24': '24h Pressure Tendency (hPa)',
    'tn12': '12h Minimum Temperature (°C)',
    'tn24': '24h Minimum Temperature (°C)',
    'tx12': '12h Maximum Temperature (°C)',
    'tx24': '24h Maximum Temperature (°C)',
    'tminsol': 'Minimum Soil Temperature (°C)',
    'sw': 'Sunshine Duration (hours)',
    'tw': 'Wet Bulb Temperature (°C)',
    'raf10': '10min Max Wind Gust (m/s)',
    'rafper': 'Max Wind Gust (m/s)',
    'per': 'Measurement Period Duration',
    'etat_sol': 'Ground State',
    'ht_neige': 'Snow Height (cm)',
    'ssfrai': 'New Snow Depth (cm)',
    'perssfrai': 'New Snowfall Duration (hours)',
    'rr1': 'Rainfall (1h, mm)',
    'rr3': 'Rainfall (3h, mm)',
    'rr6': 'Rainfall (6h, mm)',
    'rr12': 'Rainfall (12h, mm)',
    'rr24': 'Rainfall (24h, mm)',
    'phenspe1': 'Special Weather Phenomenon 1',
    'phenspe2': 'Special Weather Phenomenon 2',
    'phenspe3': 'Special Weather Phenomenon 3',
    'phenspe4': 'Special Weather Phenomenon 4',
    'nnuage1': 'Layer 1 Cloud Cover (oktas)',
    'ctype1': 'Layer 1 Cloud Type',
    'hnuage1': 'Layer 1 Cloud Base Height (m)',
    'nnuage2': 'Layer 2 Cloud Cover (oktas)',
    'ctype2': 'Layer 2 Cloud Type',
    'hnuage2': 'Layer 2 Cloud Base Height (m)',
    'nnuage3': 'Layer 3 Cloud Cover (oktas)',
    'ctype3': 'Layer 3 Cloud Type',
    'hnuage3': 'Layer 3 Cloud Base Height (m)',
    'nnuage4': 'Layer 4 Cloud Cover (oktas)',
    'ctype4': 'Layer 4 Cloud Type',
    'hnuage4': 'Layer 4 Cloud Base Height (m)',
}

# Rename columns in the DataFrame
external_conditions = external_conditions.rename(columns=column_name_mapping)


In [39]:
threshold = len(external_conditions) * 0.8
external_conditions = external_conditions.loc[:, external_conditions.isnull().sum() < threshold]
TableReport(external_conditions)

Processing column   1 / 42

Processing column  42 / 42


Unnamed: 0_level_0,Station Number,Date and Time,Sea Level Pressure (hPa),Pressure Tendency (hPa/3h),Pressure Tendency Code,Wind Direction (°),Wind Speed (m/s),Air Temperature (°C),Dew Point Temperature (°C),Relative Humidity (%),Visibility (m),Present Weather Code,Past Weather Code 1,Past Weather Code 2,Total Cloud Cover (oktas),Cloud Base Height (m),Lowest Cloud Base Height (m),Low Cloud Type,Medium Cloud Type,High Cloud Type,Station Level Pressure (hPa),24h Pressure Tendency (hPa),12h Minimum Temperature (°C),12h Maximum Temperature (°C),10min Max Wind Gust (m/s),Max Wind Gust (m/s),Measurement Period Duration,Ground State,Snow Height (cm),New Snow Depth (cm),New Snowfall Duration (hours),"Rainfall (1h, mm)","Rainfall (3h, mm)","Rainfall (6h, mm)","Rainfall (12h, mm)","Rainfall (24h, mm)",Layer 1 Cloud Cover (oktas),Layer 1 Cloud Type,Layer 1 Cloud Base Height (m),Layer 2 Cloud Cover (oktas),Layer 2 Cloud Type,Layer 2 Cloud Base Height (m)
Unnamed: 0_level_1,Station Number,Date and Time,Sea Level Pressure (hPa),Pressure Tendency (hPa/3h),Pressure Tendency Code,Wind Direction (°),Wind Speed (m/s),Air Temperature (°C),Dew Point Temperature (°C),Relative Humidity (%),Visibility (m),Present Weather Code,Past Weather Code 1,Past Weather Code 2,Total Cloud Cover (oktas),Cloud Base Height (m),Lowest Cloud Base Height (m),Low Cloud Type,Medium Cloud Type,High Cloud Type,Station Level Pressure (hPa),24h Pressure Tendency (hPa),12h Minimum Temperature (°C),12h Maximum Temperature (°C),10min Max Wind Gust (m/s),Max Wind Gust (m/s),Measurement Period Duration,Ground State,Snow Height (cm),New Snow Depth (cm),New Snowfall Duration (hours),"Rainfall (1h, mm)","Rainfall (3h, mm)","Rainfall (6h, mm)","Rainfall (12h, mm)","Rainfall (24h, mm)",Layer 1 Cloud Cover (oktas),Layer 1 Cloud Type,Layer 1 Cloud Base Height (m),Layer 2 Cloud Cover (oktas),Layer 2 Cloud Type,Layer 2 Cloud Base Height (m)
0.0,7149.0,2021-01-01 00:00:00,100810.0,80.0,1.0,270.0,1.8,272.75,272.15,96.0,990.0,2.0,0.0,0.0,10.0,1.0,800.0,35.0,20.0,10.0,99680.0,470.0,,,2.5,2.5,-10.0,1.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,2.0,1.0,6.0,600.0,,,
1.0,7149.0,2021-01-01 03:00:00,100920.0,110.0,3.0,300.0,1.7,271.25,270.95,98.0,210.0,40.0,0.0,0.0,25.0,1.0,1750.0,35.0,23.0,10.0,99790.0,750.0,,,2.2,2.2,-10.0,1.0,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,1.2,1.0,6.0,1500.0,2.0,3.0,3000.0
2.0,7149.0,2021-01-01 06:00:00,100950.0,30.0,3.0,290.0,2.6,271.95,271.65,98.0,3660.0,3.0,1.0,1.0,90.0,5.0,450.0,35.0,27.0,,99820.0,900.0,270.75,275.25,3.2,3.2,-10.0,1.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,1.0,3.0,6.0,480.0,4.0,6.0,2000.0
3.0,7149.0,2021-01-01 09:00:00,101100.0,150.0,2.0,280.0,1.7,272.45,272.05,97.0,3500.0,10.0,1.0,1.0,50.0,1.0,1750.0,35.0,23.0,,99970.0,860.0,,,2.3,2.3,-10.0,13.0,0.01,0.01,-30.0,0.0,0.2,0.2,0.2,0.2,1.0,6.0,1740.0,3.0,3.0,2800.0
4.0,7149.0,2021-01-01 12:00:00,101110.0,30.0,0.0,50.0,1.0,276.95,274.15,82.0,8000.0,2.0,2.0,2.0,90.0,7.0,450.0,38.0,,,100000.0,790.0,,,2.5,4.4,-10.0,11.0,-0.01,0.0,-60.0,0.0,0.0,0.2,0.2,0.2,1.0,8.0,330.0,4.0,6.0,570.0
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3317.0,7149.0,2020-09-30 09:00:00,101540.0,-30.0,8.0,230.0,4.4,289.95,286.85,82.0,18000.0,3.0,2.0,2.0,90.0,7.0,450.0,35.0,,,100480.0,-330.0,,,7.1,7.1,-10.0,0.0,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,2.2,5.0,6.0,400.0,7.0,6.0,2200.0
3318.0,7149.0,2020-09-30 12:00:00,101320.0,-210.0,8.0,190.0,4.9,292.05,285.55,66.0,25000.0,1.0,2.0,2.0,90.0,7.0,800.0,38.0,,,100270.0,-560.0,,,7.2,7.2,-10.0,0.0,0.0,0.0,-60.0,0.0,0.2,0.2,0.2,1.6,2.0,8.0,870.0,7.0,6.0,1900.0
3319.0,7149.0,2020-09-30 15:00:00,101140.0,-180.0,7.0,190.0,4.1,291.55,286.45,72.0,25000.0,1.0,2.0,2.0,90.0,7.0,800.0,38.0,,,100090.0,-650.0,,,6.4,8.4,-10.0,0.0,0.0,0.0,-30.0,0.0,0.0,0.2,0.2,0.2,2.0,8.0,820.0,7.0,6.0,2200.0
3320.0,7149.0,2020-09-30 18:00:00,101020.0,-130.0,6.0,190.0,2.7,290.15,285.25,73.0,40820.0,3.0,2.0,2.0,100.0,8.0,2250.0,35.0,,,99960.0,-790.0,287.85,292.45,4.3,5.2,-10.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.2,0.2,8.0,6.0,2160.0,,,

Column,Column name,dtype,Null values,Unique values,Mean,Std,Min,Median,Max
0,Station Number,Int64DType,0 (0.0%),,7150.0,0.0,,,
1,Date and Time,DateTime64DType,0 (0.0%),,,,2020-09-01T00:00:00,,2021-10-21T12:00:00
2,Sea Level Pressure (hPa),Int64DType,0 (0.0%),,102000.0,924.0,97260,101820.0,103920
3,Pressure Tendency (hPa/3h),Int64DType,0 (0.0%),,-0.238,122.0,-750,0.0,720
4,Pressure Tendency Code,Int64DType,0 (0.0%),,4.27,2.71,0,3.0,8
5,Wind Direction (°),Int64DType,0 (0.0%),,184.0,104.0,0,200.0,360
6,Wind Speed (m/s),Float64DType,0 (0.0%),,3.65,2.0,0.00,3.4,14.6
7,Air Temperature (°C),Float64DType,0 (0.0%),,286.0,6.85,268.,286.0,307.
8,Dew Point Temperature (°C),Float64DType,0 (0.0%),,281.0,5.63,261.,281.0,293.
9,Relative Humidity (%),Int64DType,0 (0.0%),,74.8,17.0,24,79.0,100

Column 1,Column 2,Cramér's V
12h Minimum Temperature (°C),12h Maximum Temperature (°C),1.0
Lowest Cloud Base Height (m),Layer 1 Cloud Base Height (m),0.992
Sea Level Pressure (hPa),Station Level Pressure (hPa),0.965
Wind Speed (m/s),10min Max Wind Gust (m/s),0.731
Past Weather Code 1,Past Weather Code 2,0.723
New Snow Depth (cm),New Snowfall Duration (hours),0.707
Low Cloud Type,New Snowfall Duration (hours),0.678
10min Max Wind Gust (m/s),Max Wind Gust (m/s),0.662
"Rainfall (3h, mm)","Rainfall (6h, mm)",0.603
"Rainfall (12h, mm)","Rainfall (24h, mm)",0.594


In [40]:
TableReport(data)

Processing column  12 / 12


Unnamed: 0_level_0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count
Unnamed: 0_level_1,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count
48321.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48324.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.6931471805599453
48327.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48330.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1.6094379124341005
48333.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2.302585092994046
,,,,,,,,,,,,
929175.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,445.0,2021-09-09 06:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198,6.100318952020064
929178.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,145.0,2021-09-09 10:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198,4.983606621708336
929181.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,218.0,2021-09-09 15:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198,5.389071729816501
929184.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,21.0,2021-09-09 22:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198,3.091042453358316

Column,Column name,dtype,Null values,Unique values,Mean,Std,Min,Median,Max
0,counter_id,CategoricalDtype,0 (0.0%),56 (< 0.1%),,,,,
1,counter_name,CategoricalDtype,0 (0.0%),56 (< 0.1%),,,,,
2,site_id,Int64DType,0 (0.0%),,105000000.0,32100000.0,100007049,100056226.0,300014702
3,site_name,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
4,bike_count,Float64DType,0 (0.0%),,60.2,87.6,0.00,29.0,1.30e+03
5,date,DateTime64DType,0 (0.0%),,,,2020-09-01T01:00:00,,2021-09-09T23:00:00
6,counter_installation_date,DateTime64DType,0 (0.0%),,,,2013-01-18T00:00:00,,2020-11-29T00:00:00
7,coordinates,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
8,counter_technical_id,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
9,latitude,Float64DType,0 (0.0%),,48.9,0.0186,48.8,48.9,48.9

Column 1,Column 2,Cramér's V
counter_id,counter_name,1.0
coordinates,counter_technical_id,1.0
site_name,coordinates,0.943
site_name,counter_technical_id,0.943
site_id,coordinates,0.763
site_id,counter_technical_id,0.763
site_id,site_name,0.737
site_id,counter_installation_date,0.659
counter_technical_id,longitude,0.646
coordinates,longitude,0.646


In [41]:
TableReport(test_data)

Processing column  10 / 10


Unnamed: 0_level_0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude
Unnamed: 0_level_1,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude
0.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,2021-09-10 01:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
1.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,2021-09-10 13:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
2.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,2021-09-10 17:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
3.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,2021-09-10 19:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
4.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,2021-09-10 22:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
,,,,,,,,,,
51435.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,2021-10-18 11:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198
51436.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,2021-10-18 15:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198
51437.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,2021-10-18 17:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198
51438.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,2021-10-18 18:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198

Column,Column name,dtype,Null values,Unique values,Mean,Std,Min,Median,Max
0,counter_id,CategoricalDtype,0 (0.0%),56 (0.1%),,,,,
1,counter_name,CategoricalDtype,0 (0.0%),56 (0.1%),,,,,
2,site_id,Int64DType,0 (0.0%),,107000000.0,37400000.0,100007049,100056327.0,300014702
3,site_name,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
4,date,DateTime64DType,0 (0.0%),,,,2021-09-10T01:00:00,,2021-10-18T21:00:00
5,counter_installation_date,DateTime64DType,0 (0.0%),,,,2013-01-18T00:00:00,,2020-11-29T00:00:00
6,coordinates,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
7,counter_technical_id,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
8,latitude,Float64DType,0 (0.0%),,48.9,0.0186,48.8,48.9,48.9
9,longitude,Float64DType,0 (0.0%),,2.34,0.0383,2.27,2.35,2.41

Column 1,Column 2,Cramér's V
site_name,coordinates,1.0
site_name,counter_technical_id,1.0
coordinates,counter_technical_id,1.0
counter_id,counter_name,0.943
counter_name,counter_technical_id,0.694
counter_name,coordinates,0.694
counter_name,site_name,0.694
site_id,counter_technical_id,0.693
site_id,site_name,0.693
site_id,coordinates,0.693


In [32]:
# Convert 'Date and Time' column in external_conditions to datetime
external_conditions['Date and Time'] = pd.to_datetime(external_conditions['Date and Time'])

# Merge the dataframes
merged_data = pd.merge(data, external_conditions, left_on='date', right_on='Date and Time', how='left')
test_merged_data = pd.merge(test_data, external_conditions, left_on='date', right_on='Date and Time', how='left')

# Display the merged dataframe
test_merged_data

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,...,"Rainfall (3h, mm)","Rainfall (6h, mm)","Rainfall (12h, mm)","Rainfall (24h, mm)",Layer 1 Cloud Cover (oktas),Layer 1 Cloud Type,Layer 1 Cloud Base Height (m),Layer 2 Cloud Cover (oktas),Layer 2 Cloud Type,Layer 2 Cloud Base Height (m)
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 01:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,...,,,,,,,,,,
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 13:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,...,,,,,,,,,,
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 17:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,...,,,,,,,,,,
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 19:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,...,,,,,,,,,,
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 22:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51435,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18 11:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,...,,,,,,,,,,
51436,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18 15:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,...,0.0,0.0,0.0,0.0,5.0,6.0,720.0,4.0,0.0,6400.0
51437,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18 17:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,...,,,,,,,,,,
51438,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18 18:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,...,-0.1,-0.1,-0.1,-0.1,4.0,6.0,900.0,7.0,4.0,3000.0


In [8]:
d = SchoolHolidayDates()

In [33]:
# Ensure "Date and Time" is in datetime format
merged_data["Date and Time"] = pd.to_datetime(merged_data["Date and Time"], errors="coerce")

# Check for missing or invalid datetime entries
if merged_data["Date and Time"].isnull().any():
    print("Warning: Missing or invalid datetime entries found.")
    # Handle missing values if needed
    merged_data = merged_data.dropna(subset=["Date and Time"])

# Extract date and time features
merged_data["measurement_date"] = merged_data["Date and Time"].dt.date
merged_data["measurement_year"] = merged_data["Date and Time"].dt.year
merged_data["measurement_month"] = merged_data["Date and Time"].dt.month
merged_data["measurement_day_of_week"] = merged_data["Date and Time"].dt.dayofweek
merged_data["measurement_day"] = merged_data["Date and Time"].dt.day
merged_data["measurement_hour"] = merged_data["Date and Time"].dt.hour

# Determine if the day is a weekend
merged_data["measurement_is_weekend"] = np.where(
    merged_data["measurement_day_of_week"] >= 5, 1, 0
)

# Handle school holidays
unique_dates = merged_data["measurement_date"].unique()

# Example holiday mapping function
d = JoursFeries()
try:
    dict_school_holidays = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates}
    merged_data["is_school_holiday"] = merged_data["measurement_date"].map(
        dict_school_holidays
    )
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    merged_data["is_school_holiday"] = 0  # Fallback to default value

# Handle public holidays
f = JoursFeries()
try:
    dict_public_holidays = {
        date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates
    }
    merged_data["is_public_holiday"] = merged_data["measurement_date"].map(
        dict_public_holidays
    )
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    merged_data["is_public_holiday"] = 0  # Fallback to default value

# Extract additional date and time features for the counter
merged_data["counter_year"] = merged_data["Date and Time"].dt.year
merged_data["counter_month"] = merged_data["Date and Time"].dt.month
merged_data["counter_day"] = merged_data["Date and Time"].dt.day
merged_data["counter_hour"] = merged_data["Date and Time"].dt.hour

# Check the first rows for a specific date
print(merged_data[merged_data["measurement_date"].astype("str") == "2021-01-01"].head())


Error with school holidays mapping: 'JoursFeries' object has no attribute 'is_holiday_for_zone'
               counter_id              counter_name    site_id  \
2546  100007049-102007049  28 boulevard Diderot E-O  100007049   
2549  100007049-102007049  28 boulevard Diderot E-O  100007049   
2555  100007049-102007049  28 boulevard Diderot E-O  100007049   
2745  100007049-102007049  28 boulevard Diderot E-O  100007049   
3517  100007049-102007049  28 boulevard Diderot E-O  100007049   

                 site_name  bike_count                date  \
2546  28 boulevard Diderot         1.0 2021-01-01 09:00:00   
2549  28 boulevard Diderot         0.0 2021-01-01 15:00:00   
2555  28 boulevard Diderot         1.0 2021-01-01 21:00:00   
2745  28 boulevard Diderot         2.0 2021-01-01 18:00:00   
3517  28 boulevard Diderot         0.0 2021-01-01 00:00:00   

     counter_installation_date         coordinates counter_technical_id  \
2546                2013-01-18  48.846028,2.375429         

In [34]:
# Ensure "Date and Time" is in datetime format
test_merged_data["Date and Time"] = pd.to_datetime(test_merged_data["Date and Time"], errors="coerce")

# Extract date and time features
test_merged_data["measurement_date"] = test_merged_data["Date and Time"].dt.date
test_merged_data["measurement_year"] = test_merged_data["Date and Time"].dt.year
test_merged_data["measurement_month"] = test_merged_data["Date and Time"].dt.month
test_merged_data["measurement_day_of_week"] = test_merged_data["Date and Time"].dt.dayofweek
test_merged_data["measurement_day"] = test_merged_data["Date and Time"].dt.day
test_merged_data["measurement_hour"] = test_merged_data["Date and Time"].dt.hour

# Determine if the day is a weekend
test_merged_data["measurement_is_weekend"] = np.where(
    test_merged_data["measurement_day_of_week"] >= 5, 1, 0
)

# Handle school holidays
unique_dates = test_merged_data["measurement_date"].unique()

# Example holiday mapping function
d = SchoolHolidayDates()
try:
    dict_school_holidays = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates}
    test_merged_data["is_school_holiday"] = test_merged_data["measurement_date"].map(
        dict_school_holidays
    )
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    test_merged_data["is_school_holiday"] = 0  # Fallback to default value

# Handle public holidays
f = JoursFeries()
try:
    dict_public_holidays = {
        date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates
    }
    test_merged_data["is_public_holiday"] = test_merged_data["measurement_date"].map(
        dict_public_holidays
    )
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    test_merged_data["is_public_holiday"] = 0  # Fallback to default value

# Extract additional date and time features for the counter
test_merged_data["counter_year"] = test_merged_data["Date and Time"].dt.year
test_merged_data["counter_month"] = test_merged_data["Date and Time"].dt.month
test_merged_data["counter_day"] = test_merged_data["Date and Time"].dt.day
test_merged_data["counter_hour"] = test_merged_data["Date and Time"].dt.hour

# Check the first rows for a specific date (if needed)
print(test_merged_data[test_merged_data["measurement_date"].astype("str") == "2021-01-01"].head())


Error with school holidays mapping: date should be a datetime.date
Empty DataFrame
Columns: [counter_id, counter_name, site_id, site_name, date, counter_installation_date, coordinates, counter_technical_id, latitude, longitude, Station Number, Date and Time, Sea Level Pressure (hPa), Pressure Tendency (hPa/3h), Pressure Tendency Code, Wind Direction (°), Wind Speed (m/s), Air Temperature (°C), Dew Point Temperature (°C), Relative Humidity (%), Visibility (m), Present Weather Code, Past Weather Code 1, Past Weather Code 2, Total Cloud Cover (oktas), Cloud Base Height (m), Lowest Cloud Base Height (m), Low Cloud Type, Medium Cloud Type, High Cloud Type, Station Level Pressure (hPa), 24h Pressure Tendency (hPa), 12h Minimum Temperature (°C), 12h Maximum Temperature (°C), 10min Max Wind Gust (m/s), Max Wind Gust (m/s), Measurement Period Duration, Ground State, Snow Height (cm), New Snow Depth (cm), New Snowfall Duration (hours), Rainfall (1h, mm), Rainfall (3h, mm), Rainfall (6h, mm), Rai

In [35]:
TableReport(test_merged_data)

Processing column  65 / 65


Unnamed: 0_level_0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,Station Number,Date and Time,Sea Level Pressure (hPa),Pressure Tendency (hPa/3h),Pressure Tendency Code,Wind Direction (°),Wind Speed (m/s),Air Temperature (°C),Dew Point Temperature (°C),Relative Humidity (%),Visibility (m),Present Weather Code,Past Weather Code 1,Past Weather Code 2,Total Cloud Cover (oktas),Cloud Base Height (m),Lowest Cloud Base Height (m),Low Cloud Type,Medium Cloud Type,High Cloud Type,Station Level Pressure (hPa),24h Pressure Tendency (hPa),12h Minimum Temperature (°C),12h Maximum Temperature (°C),10min Max Wind Gust (m/s),Max Wind Gust (m/s),Measurement Period Duration,Ground State,Snow Height (cm),New Snow Depth (cm),New Snowfall Duration (hours),"Rainfall (1h, mm)","Rainfall (3h, mm)","Rainfall (6h, mm)","Rainfall (12h, mm)","Rainfall (24h, mm)",Layer 1 Cloud Cover (oktas),Layer 1 Cloud Type,Layer 1 Cloud Base Height (m),Layer 2 Cloud Cover (oktas),Layer 2 Cloud Type,Layer 2 Cloud Base Height (m),measurement_date,measurement_year,measurement_month,measurement_day_of_week,measurement_day,measurement_hour,measurement_is_weekend,is_school_holiday,is_public_holiday,counter_year,counter_month,counter_day,counter_hour
Unnamed: 0_level_1,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,Station Number,Date and Time,Sea Level Pressure (hPa),Pressure Tendency (hPa/3h),Pressure Tendency Code,Wind Direction (°),Wind Speed (m/s),Air Temperature (°C),Dew Point Temperature (°C),Relative Humidity (%),Visibility (m),Present Weather Code,Past Weather Code 1,Past Weather Code 2,Total Cloud Cover (oktas),Cloud Base Height (m),Lowest Cloud Base Height (m),Low Cloud Type,Medium Cloud Type,High Cloud Type,Station Level Pressure (hPa),24h Pressure Tendency (hPa),12h Minimum Temperature (°C),12h Maximum Temperature (°C),10min Max Wind Gust (m/s),Max Wind Gust (m/s),Measurement Period Duration,Ground State,Snow Height (cm),New Snow Depth (cm),New Snowfall Duration (hours),"Rainfall (1h, mm)","Rainfall (3h, mm)","Rainfall (6h, mm)","Rainfall (12h, mm)","Rainfall (24h, mm)",Layer 1 Cloud Cover (oktas),Layer 1 Cloud Type,Layer 1 Cloud Base Height (m),Layer 2 Cloud Cover (oktas),Layer 2 Cloud Type,Layer 2 Cloud Base Height (m),measurement_date,measurement_year,measurement_month,measurement_day_of_week,measurement_day,measurement_hour,measurement_is_weekend,is_school_holiday,is_public_holiday,counter_year,counter_month,counter_day,counter_hour
0.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,2021-09-10 01:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,False,,,,
1.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,2021-09-10 13:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,False,,,,
2.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,2021-09-10 17:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,False,,,,
3.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,2021-09-10 19:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,False,,,,
4.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,2021-09-10 22:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,False,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
51435.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,2021-10-18 11:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,False,,,,
51436.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,2021-10-18 15:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198,7149.0,2021-10-18 15:00:00,102100.0,-40.0,5.0,200.0,5.2,291.55,286.65,73.0,35350.0,3.0,2.0,2.0,90.0,2.0,800.0,35.0,20.0,12.0,101040.0,150.0,,,7.7,8.1,-10.0,0.0,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,0.0,5.0,6.0,720.0,4.0,0.0,6400.0,2021-10-18,2021.0,10.0,0.0,18.0,15.0,0.0,0.0,False,2021.0,10.0,18.0,15.0
51437.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,2021-10-18 17:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,False,,,,
51438.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,2021-10-18 18:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198,7149.0,2021-10-18 18:00:00,102150.0,40.0,3.0,220.0,1.9,290.05,287.15,83.0,20000.0,61.0,6.0,2.0,100.0,4.0,800.0,35.0,22.0,,101080.0,120.0,278.75,291.75,2.7,5.1,-10.0,0.0,0.0,0.0,-60.0,-0.1,-0.1,-0.1,-0.1,-0.1,4.0,6.0,900.0,7.0,4.0,3000.0,2021-10-18,2021.0,10.0,0.0,18.0,18.0,0.0,0.0,False,2021.0,10.0,18.0,18.0

Column,Column name,dtype,Null values,Unique values,Mean,Std,Min,Median,Max
0,counter_id,CategoricalDtype,0 (0.0%),56 (0.1%),,,,,
1,counter_name,CategoricalDtype,0 (0.0%),56 (0.1%),,,,,
2,site_id,Int64DType,0 (0.0%),,107000000.0,37400000.0,100007049,100056327.0,300014702
3,site_name,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
4,date,DateTime64DType,0 (0.0%),,,,2021-09-10T01:00:00,,2021-10-18T21:00:00
5,counter_installation_date,DateTime64DType,0 (0.0%),,,,2013-01-18T00:00:00,,2020-11-29T00:00:00
6,coordinates,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
7,counter_technical_id,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
8,latitude,Float64DType,0 (0.0%),,48.9,0.0186,48.8,48.9,48.9
9,longitude,Float64DType,0 (0.0%),,2.34,0.0383,2.27,2.35,2.41

Column 1,Column 2,Cramér's V
counter_id,counter_name,1.0
Dew Point Temperature (°C),10min Max Wind Gust (m/s),1.0
Dew Point Temperature (°C),Past Weather Code 1,1.0
Dew Point Temperature (°C),Past Weather Code 2,1.0
Dew Point Temperature (°C),Cloud Base Height (m),1.0
Measurement Period Duration,counter_year,1.0
Measurement Period Duration,counter_month,1.0
Measurement Period Duration,counter_day,1.0
Measurement Period Duration,counter_hour,1.0
Sea Level Pressure (hPa),"Rainfall (1h, mm)",1.0


Decide to remove site id, site name and counter id to just keep counter name to reduce complexity and the data as they all provide more or less the same information. Counter is more precise as we will be able to calculate the number of times a counter is used in a given site.

## Model training with Elastic Net (To find the best features)

Elastic net can handle multicolinearity and shrinks the less important features to zero. It is a combination of L1 and L2 regularization. It is a linear regression model trained with L1 and L2 prior as regularizer. This combination allows for learning a sparse model where few of the weights are non-zero like Lasso, while still maintaining the regularization properties of Ridge.

In [None]:
# Define the features and target variable
X = merged_data.drop(columns=[
                            'bike_count', 'log_bike_count',
                            'counter_id', 'site_id', 'site_name', 'counter_technical_id',
                            'coordinates',
                            'Station Number', 'Measurement Period Duration',
                            'date', 'Date and Time', 'counter_installation_date',
                    ])



y = merged_data['log_bike_count']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer with OneHotEncoder for 'counter_name' and SimpleImputer for numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['counter_name'])
    ])

# Create a pipeline with the preprocessor, standard scaler, and ElasticNet regression
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features

elasticnet_feature_importance = pd.Series(elasticnet_coefficients, index=feature_names).sort_values(ascending=False)

ElasticNet model score: 0.3180003902204265
ElasticNet model score: 0.3180003902204265
ElasticNet model score: 0.3180003902204265
ElasticNet model score: 0.3180003902204265


In [13]:
# Print the feature importances
print(elasticnet_feature_importance)

Air Temperature (°C)                                 0.296738
counter_name_Totem 73 boulevard de Sébastopol S-N    0.146830
counter_name_Totem 73 boulevard de Sébastopol N-S    0.121242
counter_name_Totem 64 Rue de Rivoli O-E              0.101817
counter_name_67 boulevard Voltaire SE-NO             0.079850
                                                       ...   
New Snowfall Duration (hours)                       -0.087188
counter_name_Face au 40 quai D'Issy NE-SO           -0.139416
counter_name_Face au 40 quai D'Issy SO-NE           -0.166139
counter_name_28 boulevard Diderot E-O               -0.227536
Relative Humidity (%)                               -0.382548
Length: 99, dtype: float64


In [14]:
# Filter features with non-zero importance
non_zero_features = elasticnet_feature_importance[elasticnet_feature_importance != 0].index.tolist()
# Keep only the non-zero features in the merged dataset
# Keep all the variables apart from the columns which are derived from a one hot encoder
non_zero_features = [feature for feature in non_zero_features if not feature.startswith('counter_name_')]
merged_data_filtered = merged_data[['counter_name', 'bike_count', 'log_bike_count'] + non_zero_features]
test_merged_data_filtered = test_merged_data[['counter_name'] + non_zero_features]


# Display the new dataframe
merged_data_filtered

Unnamed: 0,counter_name,bike_count,log_bike_count,Air Temperature (°C),Total Cloud Cover (oktas),Low Cloud Type,10min Max Wind Gust (m/s),Max Wind Gust (m/s),latitude,Pressure Tendency (hPa/3h),24h Pressure Tendency (hPa),Present Weather Code,Layer 2 Cloud Cover (oktas),Visibility (m),Layer 1 Cloud Cover (oktas),measurement_is_weekend,New Snowfall Duration (hours),Relative Humidity (%)
1,28 boulevard Diderot E-O,1.0,0.693147,283.95,0.0,30.0,1.5,1.5,48.846028,-60.0,0.0,2.0,,25000.0,,0,-30.0,88.0
3,28 boulevard Diderot E-O,4.0,1.609438,293.65,60.0,38.0,7.5,7.5,48.846028,-110.0,-260.0,3.0,5.0,30000.0,2.0,0,-30.0,41.0
4,28 boulevard Diderot E-O,9.0,2.302585,292.15,90.0,38.0,6.5,6.5,48.846028,10.0,-210.0,2.0,7.0,30000.0,1.0,0,-60.0,47.0
12,28 boulevard Diderot E-O,7.0,2.079442,292.75,,,3.7,5.1,48.846028,60.0,220.0,0.0,,49310.0,,0,,44.0
17,28 boulevard Diderot E-O,12.0,2.564949,300.35,75.0,35.0,7.7,7.9,48.846028,-50.0,140.0,3.0,5.0,30000.0,3.0,0,-30.0,36.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496866,254 rue de Vaugirard SO-NE,211.0,5.356586,302.95,40.0,30.0,11.4,12.5,48.839770,-220.0,-400.0,3.0,,12670.0,3.0,0,-30.0,30.0
496867,254 rue de Vaugirard SO-NE,156.0,5.056246,293.55,90.0,39.0,4.9,12.1,48.839770,60.0,150.0,25.0,2.0,48430.0,4.0,0,-60.0,86.0
496870,254 rue de Vaugirard SO-NE,440.0,6.089045,290.45,40.0,30.0,3.7,3.7,48.839770,-90.0,-800.0,3.0,,19860.0,3.0,0,-60.0,70.0
496876,254 rue de Vaugirard SO-NE,445.0,6.100319,292.85,100.0,32.0,4.3,5.4,48.839770,-10.0,-230.0,10.0,,5000.0,7.0,0,-60.0,95.0


In [15]:
# Subtract 273 from all values in the "Air Temperature (°C)" column
merged_data_filtered.loc[:,'Air Temperature (°C)'] -= 273
test_merged_data_filtered.loc[:,'Air Temperature (°C)'] -= 273
merged_data_filtered
test_merged_data_filtered

Unnamed: 0,counter_name,Air Temperature (°C),Total Cloud Cover (oktas),Low Cloud Type,10min Max Wind Gust (m/s),Max Wind Gust (m/s),latitude,Pressure Tendency (hPa/3h),24h Pressure Tendency (hPa),Present Weather Code,Layer 2 Cloud Cover (oktas),Visibility (m),Layer 1 Cloud Cover (oktas),measurement_is_weekend,New Snowfall Duration (hours),Relative Humidity (%)
5,28 boulevard Diderot E-O,16.65,60.0,35.0,3.4,3.7,48.846028,120.0,610.0,3.0,5.0,18000.0,1.0,1,-60.0,93.0
7,28 boulevard Diderot E-O,15.45,60.0,35.0,2.7,3.7,48.846028,10.0,590.0,3.0,5.0,18000.0,2.0,1,-30.0,89.0
9,28 boulevard Diderot E-O,13.45,10.0,35.0,3.1,3.4,48.846028,60.0,600.0,3.0,1.0,19070.0,1.0,1,-60.0,95.0
11,28 boulevard Diderot E-O,21.45,75.0,38.0,6.4,7.8,48.846028,-40.0,560.0,3.0,5.0,30000.0,4.0,1,-60.0,57.0
13,28 boulevard Diderot E-O,19.95,75.0,38.0,2.8,4.0,48.846028,-60.0,340.0,1.0,3.0,25000.0,1.0,1,-60.0,62.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51428,254 rue de Vaugirard SO-NE,15.25,90.0,30.0,5.7,6.1,48.839770,-40.0,-160.0,2.0,,25000.0,7.0,1,-60.0,58.0
51429,254 rue de Vaugirard SO-NE,15.35,100.0,38.0,5.6,5.6,48.839770,0.0,60.0,61.0,7.0,10000.0,1.0,0,-30.0,95.0
51434,254 rue de Vaugirard SO-NE,5.75,50.0,30.0,1.3,1.5,48.839770,70.0,160.0,3.0,,10000.0,4.0,0,-60.0,95.0
51436,254 rue de Vaugirard SO-NE,18.55,90.0,35.0,7.7,8.1,48.839770,-40.0,150.0,3.0,4.0,35350.0,5.0,0,-30.0,73.0


In [16]:
from xgboost import XGBRegressor

# Define the features and target variable
X = merged_data_filtered.drop(columns=[
                            'bike_count', 'log_bike_count',
                    ])

y = merged_data_filtered['log_bike_count']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer with OneHotEncoder for 'counter_name' and SimpleImputer for numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['counter_name'])
    ])

# Create a pipeline with the preprocessor, standard scaler (with_mean=False), and XGBRegressor
xgboostpipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False)),
    ('regressor', XGBRegressor())
])

# Fit the XGBRegressor pipeline on the training data
xgboostpipeline.fit(X_train, y_train)

# Print the score of the XGBRegressor model on the test data
print(f"XGBRegressor model score: {xgboostpipeline.score(X_test, y_test)}")

# Output information about the XGBRegressor model
xgboost_feature_importances = xgboostpipeline.named_steps['regressor'].feature_importances_

# Get feature names after preprocessing
feature_names = (xgboostpipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 xgboostpipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features


XGBRegressor model score: 0.7835752659752271


In [19]:
# Predict the log_bike_count for the test_merged_data_filtered dataframe
y_pred = xgboostpipeline.predict(test_merged_data_filtered)

# Display the dataframe with predictions
y_pred

array([ 0.504079  , -0.31362563,  0.7296307 , ...,  3.3853843 ,
        3.7569485 ,  2.8063936 ], dtype=float32)

In [31]:
submission = pd.DataFrame({
    'log_bike_count': y_pred
}).reset_index(drop=True)
submission.index.name = 'Id'

submission.to_csv('/Users/felix/Downloads/test.csv')

