In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import ydata_profiling
from skrub import TableReport
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from vacances_scolaires_france import SchoolHolidayDates
from datetime import date
from jours_feries_france import JoursFeries

In [2]:
import utils

In [3]:
# Load the training set from the local data/ folder and render it.
train_path = Path("data", "train.parquet")
data = pd.read_parquet(train_path)
data

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.000000
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.693147
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.000000
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1.609438
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2.302585
...,...,...,...,...,...,...,...,...,...,...,...,...
929175,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,445.0,2021-09-09 06:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,6.100319
929178,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,145.0,2021-09-09 10:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,4.983607
929181,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,218.0,2021-09-09 15:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,5.389072
929184,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,21.0,2021-09-09 22:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,3.091042


In [4]:
# Load the final test set (same layout as the training set, minus the targets,
# per the rendered columns) from the local data/ folder and render it.
test_path = Path("data", "final_test.parquet")
test_data = pd.read_parquet(test_path)
test_data

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 01:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 13:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 17:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 19:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 22:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
...,...,...,...,...,...,...,...,...,...,...
51435,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18 11:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980
51436,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18 15:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980
51437,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18 17:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980
51438,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18 18:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980


In [5]:
# Load the external weather observations; the path is built with Path like the
# other load cells and resolves to the same relative file.
external_path = Path("data") / "external_data.csv"
external_conditions = pd.read_csv(external_path)
external_conditions

Unnamed: 0,numer_sta,date,pmer,tend,cod_tend,dd,ff,t,td,u,...,hnuage1,nnuage2,ctype2,hnuage2,nnuage3,ctype3,hnuage3,nnuage4,ctype4,hnuage4
0,7149,2021-01-01 00:00:00,100810,80,1,270,1.8,272.75,272.15,96,...,600.0,,,,,,,,,
1,7149,2021-01-01 03:00:00,100920,110,3,300,1.7,271.25,270.95,98,...,1500.0,2.0,3.0,3000.0,,,,,,
2,7149,2021-01-01 06:00:00,100950,30,3,290,2.6,271.95,271.65,98,...,480.0,4.0,6.0,2000.0,6.0,3.0,3000.0,,,
3,7149,2021-01-01 09:00:00,101100,150,2,280,1.7,272.45,272.05,97,...,1740.0,3.0,3.0,2800.0,,,,,,
4,7149,2021-01-01 12:00:00,101110,30,0,50,1.0,276.95,274.15,82,...,330.0,4.0,6.0,570.0,7.0,6.0,810.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,7149,2020-09-30 09:00:00,101540,-30,8,230,4.4,289.95,286.85,82,...,400.0,7.0,6.0,2200.0,,,,,,
3318,7149,2020-09-30 12:00:00,101320,-210,8,190,4.9,292.05,285.55,66,...,870.0,7.0,6.0,1900.0,,,,,,
3319,7149,2020-09-30 15:00:00,101140,-180,7,190,4.1,291.55,286.45,72,...,820.0,7.0,6.0,2200.0,,,,,,
3320,7149,2020-09-30 18:00:00,101020,-130,6,190,2.7,290.15,285.25,73,...,2160.0,,,,,,,,,


In [6]:
# Order the weather rows by `date`, then keep only the first row seen for each
# distinct `date` value (drop_duplicates keeps the first occurrence by default).
external_conditions = (
    external_conditions
    .sort_values(by='date')
    .drop_duplicates(subset='date')
)

# Hand the de-duplicated table to the project helper.
# NOTE(review): judging by the warnings this cell emits, the helper presumably
# reindexes to an hourly date range and fills gaps from the closest non-NaN
# value — confirm in utils._fill_dataframe.
filled_external_conditions = utils._fill_dataframe(external_conditions)

  date_range = pd.date_range(start=df['date'].min(), end=df['date'].max(), freq='H')
  closest_value = non_nan_values.iloc[(non_nan_values - value).abs().argmin()]


In [7]:
# Pass the filled weather table through the project's column-renaming helper.
# NOTE(review): presumably maps the raw short codes (pmer, tend, dd, ...) to the
# descriptive labels seen in later outputs (e.g. "Sea Level Pressure (hPa)") —
# confirm in utils._column_rename.
# The same name is rebound to the helper's return value, so re-running this cell
# feeds the already-processed table back into the helper.
filled_external_conditions = utils._column_rename(filled_external_conditions)

In [9]:
# Combine the weather table with the train and test frames via the project helper;
# it returns one merged frame per input dataset (train first, then test).
# NOTE(review): join key and join type are not visible here — presumably a merge on
# the timestamp; verify in utils that the row count and row order of `test_data`
# are preserved, since predictions are later written out positionally.
merged_data, test_merged_data = utils._merge_data_with_external_data(filled_external_conditions, data, test_data)

In [None]:
# Derive calendar features from the merged training frame via the project helper.
# NOTE(review): in this notebook `merged_data_with_dates` is only consumed by the
# TableReport cell; the modelling cells read `merged_data` instead, so the derived
# columns reach the models only if the helper also mutates its argument in place —
# confirm. `test_merged_data` is never passed through this helper.
merged_data_with_dates = utils._process_datetime_features(merged_data)

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,...,measurement_day_of_week,measurement_day,measurement_hour,measurement_is_weekend,is_school_holiday,is_public_holiday,counter_year,counter_month,counter_day,counter_hour
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,1,1,2,0,False,False,2020,9,1,2
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,1,1,3,0,False,False,2020,9,1,3
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,1,1,4,0,False,False,2020,9,1,4
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,1,1,15,0,False,False,2020,9,1,15
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,1,1,18,0,False,False,2020,9,1,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496822,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,445.0,2021-09-09 06:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,3,9,6,0,False,False,2021,9,9,6
496823,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,145.0,2021-09-09 10:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,3,9,10,0,False,False,2021,9,9,10
496824,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,218.0,2021-09-09 15:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,3,9,15,0,False,False,2021,9,9,15
496825,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,21.0,2021-09-09 22:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,3,9,22,0,False,False,2021,9,9,22


In [11]:
# Render skrub's interactive report for the date-enriched training frame; as the
# cell's last expression it is displayed inline (sample rows, per-column
# statistics and column associations, as shown in the output below).
TableReport(merged_data_with_dates)

Processing column  74 / 74


Unnamed: 0_level_0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count,Date and Time,Station Number,Sea Level Pressure (hPa),Pressure Tendency (hPa/3h),Pressure Tendency Code,Wind Direction (°),Wind Speed (m/s),Air Temperature (°C),Dew Point Temperature (°C),Relative Humidity (%),Visibility (m),Present Weather Code,Past Weather Code 1,Past Weather Code 2,Total Cloud Cover (oktas),Cloud Base Height (m),Lowest Cloud Base Height (m),Low Cloud Type,Medium Cloud Type,High Cloud Type,Station Level Pressure (hPa),24h Pressure Tendency (hPa),12h Minimum Temperature (°C),12h Maximum Temperature (°C),Minimum Soil Temperature (°C),10min Max Wind Gust (m/s),Max Wind Gust (m/s),Measurement Period Duration,Ground State,Snow Height (cm),New Snow Depth (cm),New Snowfall Duration (hours),"Rainfall (1h, mm)","Rainfall (3h, mm)","Rainfall (6h, mm)","Rainfall (12h, mm)","Rainfall (24h, mm)",Layer 1 Cloud Cover (oktas),Layer 1 Cloud Type,Layer 1 Cloud Base Height (m),Layer 2 Cloud Cover (oktas),Layer 2 Cloud Type,Layer 2 Cloud Base Height (m),Layer 3 Cloud Cover (oktas),Layer 3 Cloud Type,Layer 3 Cloud Base Height (m),Layer 4 Cloud Cover (oktas),Layer 4 Cloud Type,Layer 4 Cloud Base Height (m),measurement_date,measurement_year,measurement_month,measurement_day_of_week,measurement_day,measurement_hour,measurement_is_weekend,is_school_holiday,is_public_holiday,counter_year,counter_month,counter_day,counter_hour
Unnamed: 0_level_1,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count,Date and Time,Station Number,Sea Level Pressure (hPa),Pressure Tendency (hPa/3h),Pressure Tendency Code,Wind Direction (°),Wind Speed (m/s),Air Temperature (°C),Dew Point Temperature (°C),Relative Humidity (%),Visibility (m),Present Weather Code,Past Weather Code 1,Past Weather Code 2,Total Cloud Cover (oktas),Cloud Base Height (m),Lowest Cloud Base Height (m),Low Cloud Type,Medium Cloud Type,High Cloud Type,Station Level Pressure (hPa),24h Pressure Tendency (hPa),12h Minimum Temperature (°C),12h Maximum Temperature (°C),Minimum Soil Temperature (°C),10min Max Wind Gust (m/s),Max Wind Gust (m/s),Measurement Period Duration,Ground State,Snow Height (cm),New Snow Depth (cm),New Snowfall Duration (hours),"Rainfall (1h, mm)","Rainfall (3h, mm)","Rainfall (6h, mm)","Rainfall (12h, mm)","Rainfall (24h, mm)",Layer 1 Cloud Cover (oktas),Layer 1 Cloud Type,Layer 1 Cloud Base Height (m),Layer 2 Cloud Cover (oktas),Layer 2 Cloud Type,Layer 2 Cloud Base Height (m),Layer 3 Cloud Cover (oktas),Layer 3 Cloud Type,Layer 3 Cloud Base Height (m),Layer 4 Cloud Cover (oktas),Layer 4 Cloud Type,Layer 4 Cloud Base Height (m),measurement_date,measurement_year,measurement_month,measurement_day_of_week,measurement_day,measurement_hour,measurement_is_weekend,is_school_holiday,is_public_holiday,counter_year,counter_month,counter_day,counter_hour
0.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0,2020-09-01 02:00:00,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,283.65,291.45,284.15,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,1.0,0.0,8000.0,4.0,9.0,7800.0,2020-09-01,2020.0,9.0,1.0,1.0,2.0,0.0,False,False,2020.0,9.0,1.0,2.0
1.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.6931471805599453,2020-09-01 03:00:00,7149.0,101990.0,-60.0,6.0,290.0,1.1,283.95,282.05,88.0,25000.0,2.0,0.0,0.0,0.0,0.0,800.0,30.0,20.0,10.0,100900.0,0.0,283.65,291.45,284.15,1.5,1.5,-10.0,0.0,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,0.0,3.0,8.0,810.0,6.0,6.0,1200.0,1.0,0.0,8000.0,4.0,9.0,7800.0,2020-09-01,2020.0,9.0,1.0,1.0,3.0,0.0,False,False,2020.0,9.0,1.0,3.0
2.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0,2020-09-01 04:00:00,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,283.65,291.45,284.15,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,1.0,0.0,8000.0,4.0,9.0,7800.0,2020-09-01,2020.0,9.0,1.0,1.0,4.0,0.0,False,False,2020.0,9.0,1.0,4.0
3.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1.6094379124341005,2020-09-01 15:00:00,7149.0,101740.0,-110.0,6.0,40.0,4.0,293.65,279.95,41.0,30000.0,3.0,2.0,2.0,60.0,5.0,1750.0,38.0,20.0,10.0,100690.0,-260.0,283.65,291.45,284.15,7.5,7.5,-10.0,0.0,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,0.0,2.0,8.0,1700.0,5.0,6.0,2300.0,1.0,0.0,8000.0,4.0,9.0,7800.0,2020-09-01,2020.0,9.0,1.0,1.0,15.0,0.0,False,False,2020.0,9.0,1.0,15.0
4.0,100007049-102007049,28 boulevard Diderot E-O,100007049.0,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18 00:00:00,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2.302585092994046,2020-09-01 18:00:00,7149.0,101760.0,10.0,3.0,20.0,3.0,292.15,280.55,47.0,30000.0,2.0,2.0,2.0,90.0,7.0,1750.0,38.0,23.0,11.0,100700.0,-210.0,284.35,295.45,284.15,6.5,6.5,-10.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,1.0,8.0,1700.0,7.0,6.0,2400.0,1.0,0.0,8000.0,4.0,9.0,7800.0,2020-09-01,2020.0,9.0,1.0,1.0,18.0,0.0,False,False,2020.0,9.0,1.0,18.0
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
496822.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,445.0,2021-09-09 06:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198,6.100318952020064,2021-09-09 06:00:00,7149.0,101130.0,-10.0,5.0,240.0,2.3,292.85,292.05,95.0,5000.0,10.0,2.0,2.0,100.0,8.0,150.0,32.0,23.0,11.0,100080.0,-230.0,292.85,299.85,284.15,4.3,5.4,-10.0,1.0,0.0,0.0,-60.0,0.0,0.0,-0.1,1.2,1.2,7.0,7.0,120.0,6.0,6.0,1200.0,1.0,0.0,8000.0,4.0,9.0,7800.0,2021-09-09,2021.0,9.0,3.0,9.0,6.0,0.0,False,False,2021.0,9.0,9.0,6.0
496823.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,145.0,2021-09-09 10:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198,4.983606621708336,2021-09-09 10:00:00,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,283.65,291.45,284.15,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,1.0,0.0,8000.0,4.0,9.0,7800.0,2021-09-09,2021.0,9.0,3.0,9.0,10.0,0.0,False,False,2021.0,9.0,9.0,10.0
496824.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,218.0,2021-09-09 15:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198,5.389071729816501,2021-09-09 15:00:00,7149.0,101070.0,-70.0,6.0,230.0,5.5,299.35,288.45,51.0,20000.0,2.0,2.0,2.0,75.0,5.0,1250.0,38.0,20.0,11.0,100050.0,80.0,283.65,291.45,284.15,8.5,9.7,-10.0,0.0,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,1.2,3.0,8.0,1380.0,4.0,6.0,1560.0,4.0,0.0,7500.0,4.0,9.0,7800.0,2021-09-09,2021.0,9.0,3.0,9.0,15.0,0.0,False,False,2021.0,9.0,9.0,15.0
496825.0,300014702-353245971,254 rue de Vaugirard SO-NE,300014702.0,254 rue de Vaugirard,21.0,2021-09-09 22:00:00,2020-11-29 00:00:00,"48.83977,2.30198",Y2H20114504,48.83977,2.30198,3.091042453358316,2021-09-09 22:00:00,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,283.65,291.45,284.15,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,1.0,0.0,8000.0,4.0,9.0,7800.0,2021-09-09,2021.0,9.0,3.0,9.0,22.0,0.0,False,False,2021.0,9.0,9.0,22.0

Column,Column name,dtype,Null values,Unique values,Mean,Std,Min,Median,Max
0,counter_id,CategoricalDtype,0 (0.0%),56 (< 0.1%),,,,,
1,counter_name,CategoricalDtype,0 (0.0%),56 (< 0.1%),,,,,
2,site_id,Int64DType,0 (0.0%),,105000000.0,32100000.0,100007049,100056226.0,300014702
3,site_name,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
4,bike_count,Float64DType,0 (0.0%),,60.2,87.6,0.00,29.0,1.30e+03
5,date,DateTime64DType,0 (0.0%),,,,2020-09-01T01:00:00,,2021-09-09T23:00:00
6,counter_installation_date,DateTime64DType,0 (0.0%),,,,2013-01-18T00:00:00,,2020-11-29T00:00:00
7,coordinates,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
8,counter_technical_id,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
9,latitude,Float64DType,0 (0.0%),,48.9,0.0186,48.8,48.9,48.9

Column 1,Column 2,Cramér's V
measurement_day,counter_day,1.0
site_name,counter_technical_id,1.0
site_name,coordinates,1.0
date,Date and Time,1.0
coordinates,counter_technical_id,1.0
measurement_hour,counter_hour,1.0
measurement_day_of_week,measurement_is_weekend,1.0
measurement_month,counter_month,1.0
measurement_year,counter_year,1.0
Layer 4 Cloud Cover (oktas),Layer 4 Cloud Base Height (m),0.966


We decided to drop `site_id`, `site_name` and `counter_id` and keep only `counter_name`: these columns carry more or less the same information, so keeping a single one reduces both the complexity and the size of the data. The counter is the more precise level, since it lets us compute how often each individual counter at a given site is used.

## Model training with Elastic Net (To find the best features)

Elastic Net can handle multicollinearity and shrinks the coefficients of the less important features to zero. It is a linear regression model trained with both L1 and L2 penalties as regularizers. This combination allows for learning a sparse model in which few of the weights are non-zero, like Lasso, while still maintaining the regularization properties of Ridge.

In [None]:
# Feature selection with ElasticNet.
#
# A single regularised linear model is fitted on `log_bike_count`; its
# coefficients are then used by the following cells to decide which features to
# keep. The pipeline is built, fitted and scored exactly once.
#
# NOTE(review): this cell reads `merged_data`, not `merged_data_with_dates`. The
# date-derived columns are only available here if
# utils._process_datetime_features modified `merged_data` in place — confirm.

# Targets, identifiers and raw date columns that must not be used as predictors.
columns_to_drop = [
    'bike_count', 'log_bike_count',
    'counter_id', 'site_id', 'site_name', 'counter_technical_id',
    'coordinates',
    'Station Number', 'Measurement Period Duration',
    'date', 'Date and Time', 'counter_installation_date',
]
X = merged_data.drop(columns=columns_to_drop)
y = merged_data['log_bike_count']

# Random 80/20 hold-out split with a fixed seed so the score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numeric columns are mean-imputed and `counter_name` is one-hot encoded.
# Any other column (neither float64/int64 nor `counter_name`) is dropped, because
# ColumnTransformer defaults to remainder='drop'.
numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['counter_name'])
    ])

# Preprocess -> standardise -> ElasticNet (equal mix of L1 and L2 penalties).
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])
elasticnet_pipeline.fit(X_train, y_train)

# Pipeline.score delegates to the regressor, i.e. R^2 on the held-out split.
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# One coefficient per transformed column, in ColumnTransformer output order:
# numeric columns first, then the one-hot columns of `counter_name`.
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

fitted_preprocessor = elasticnet_pipeline.named_steps['preprocessor']
one_hot_names = (fitted_preprocessor.named_transformers_['cat']
                 .get_feature_names_out(['counter_name']).tolist())
feature_names = list(numeric_columns) + one_hot_names

# Signed coefficients labelled by feature, largest first. A coefficient of
# exactly zero means ElasticNet discarded the feature.
elasticnet_feature_importance = (
    pd.Series(elasticnet_coefficients, index=feature_names)
    .sort_values(ascending=False)
)

In [None]:
# Print the ElasticNet coefficients labelled by feature name. The Series was sorted
# in descending order of the signed coefficient, so strongly negative features
# appear at the bottom and zeroed-out features in the middle.
print(elasticnet_feature_importance)

In [None]:
# Names of the features whose ElasticNet coefficient is not exactly zero.
non_zero_features = elasticnet_feature_importance[elasticnet_feature_importance != 0].index.tolist()

# The one-hot columns (`counter_name_<value>`) only exist inside the pipeline, not
# in the data frames, so they are removed from the list; the raw `counter_name`
# column is re-added explicitly below instead.
non_zero_features = [feature for feature in non_zero_features if not feature.startswith('counter_name_')]

# Restrict both frames to the retained features (plus the targets for the training
# frame). Explicit copies are taken because the next cell modifies these frames;
# without .copy() pandas flags that assignment with a SettingWithCopyWarning.
merged_data_filtered = merged_data[['counter_name', 'bike_count', 'log_bike_count'] + non_zero_features].copy()
test_merged_data_filtered = test_merged_data[['counter_name'] + non_zero_features].copy()

# Display the new dataframe
merged_data_filtered

In [None]:
# Convert the air temperature from Kelvin to degrees Celsius.
# The column is labelled "(°C)" but the values shown in the outputs above
# (e.g. 286.25) are in Kelvin.
TEMPERATURE_COLUMN = 'Air Temperature (°C)'
KELVIN_OFFSET = 273.15  # 0 °C expressed in Kelvin

# The feature is only present if ElasticNet kept it, hence the guard.
# The converted values are derived from the untouched source frames rather than
# with an in-place `-=`, so re-running this cell does not subtract the offset twice.
if TEMPERATURE_COLUMN in merged_data_filtered.columns:
    merged_data_filtered.loc[:, TEMPERATURE_COLUMN] = merged_data[TEMPERATURE_COLUMN] - KELVIN_OFFSET
    test_merged_data_filtered.loc[:, TEMPERATURE_COLUMN] = test_merged_data[TEMPERATURE_COLUMN] - KELVIN_OFFSET

# Only the last expression of a cell is rendered.
test_merged_data_filtered

In [None]:
# Gradient-boosted model trained on the features retained by ElasticNet.
# NOTE(review): this import would normally live in the notebook's first (imports) cell.
from xgboost import XGBRegressor

# Predictors are everything except the two target columns.
X = merged_data_filtered.drop(columns=['bike_count', 'log_bike_count'])
y = merged_data_filtered['log_bike_count']

# Same random 80/20 hold-out split (fixed seed) as in the ElasticNet cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numeric columns are mean-imputed and `counter_name` is one-hot encoded.
# Any other column is dropped (ColumnTransformer defaults to remainder='drop').
numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['counter_name'])
    ])

# with_mean=False lets the scaler accept a sparse matrix, which the
# ColumnTransformer may emit because of the one-hot block.
xgboostpipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False)),
    ('regressor', XGBRegressor())
])
xgboostpipeline.fit(X_train, y_train)

# Pipeline.score delegates to the regressor, i.e. R^2 on the held-out split.
print(f"XGBRegressor model score: {xgboostpipeline.score(X_test, y_test)}")

# One importance per transformed column, in ColumnTransformer output order:
# numeric columns first, then the one-hot columns of `counter_name`.
xgboost_feature_importances = xgboostpipeline.named_steps['regressor'].feature_importances_

fitted_preprocessor = xgboostpipeline.named_steps['preprocessor']
one_hot_names = (fitted_preprocessor.named_transformers_['cat']
                 .get_feature_names_out(['counter_name']).tolist())
feature_names = list(numeric_columns) + one_hot_names

# Pair importances with their feature names (as done for the ElasticNet
# coefficients) and show the most important ones.
xgboost_feature_ranking = (
    pd.Series(xgboost_feature_importances, index=feature_names)
    .sort_values(ascending=False)
)
xgboost_feature_ranking.head(20)


In [None]:
# Predict log_bike_count for every row of the filtered test frame. The fitted
# pipeline applies the same imputation, one-hot encoding and scaling as in training.
# NOTE(review): predictions come back in the row order of
# `test_merged_data_filtered`; the submission cell relies on that order matching
# the expected Ids — confirm the merge helper preserves it.
y_pred = xgboostpipeline.predict(test_merged_data_filtered)

# Render the array of predictions (last expression of the cell).
y_pred

In [None]:
# Build the submission table: one predicted log_bike_count per test row, with the
# default 0..n-1 positional index exported under the name `Id`.
submission = pd.DataFrame({'log_bike_count': y_pred})
submission.index.name = 'Id'

# A submission with a different number of rows than the test set is invalid;
# fail here rather than writing a silently wrong file.
assert len(submission) == len(test_data), (
    f"expected {len(test_data)} predictions, got {len(submission)}"
)

# Written relative to the notebook's working directory instead of a user-specific
# absolute path, so the cell runs on any machine.
SUBMISSION_PATH = Path("submission.csv")
submission.to_csv(SUBMISSION_PATH)

