In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import ydata_profiling
from skrub import TableReport
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from vacances_scolaires_france import SchoolHolidayDates
from datetime import date
from jours_feries_france import JoursFeries
import utils

In [2]:
data = pd.read_parquet("data/train.parquet")
# Sort by date first, so that time based cross-validation would produce correct results
data = data.sort_values(["date", "counter_name"])

data_test = pd.read_parquet("data/final_test.parquet")
# Sort by date first, so that time based cross-validation would produce correct results
data_test = data_test.sort_values(["date", "counter_name"])


In [5]:
external_conditions = pd.read_csv('data/external_data.csv')
external_conditions['date'] = pd.to_datetime(external_conditions['date'])

In [6]:
# Drop columns with more than 40% NaN values
threshold = len(external_conditions) * 0.4
external_conditions = external_conditions.dropna(thresh=threshold, axis=1)

# Step 1: Sort the `external_conditions` DataFrame by the `date` column
external_conditions = external_conditions.sort_values(by='date')

# Drop columns with more than 40% NaN values
threshold = len(external_conditions) * 0.4
external_conditions = external_conditions.dropna(thresh=threshold, axis=1)

# Step 2: Remove duplicate entries based on the `date` column
external_conditions = external_conditions.drop_duplicates(subset='date')

# Step 3: Convert the 'date' column to datetime
external_conditions['date'] = pd.to_datetime(external_conditions['date'])

# Step 4: Create a complete date range from the minimum to the maximum date in the DataFrame
date_range = pd.date_range(start=external_conditions['date'].min(), end=external_conditions['date'].max(), freq='H')

# Step 5: Create a DataFrame from the date_range
date_range_df = pd.DataFrame(date_range, columns=['date'])

# Step 6: Merge the date_range DataFrame with the external_conditions DataFrame on the 'date' column
full_external_conditions = pd.merge(date_range_df, external_conditions, on='date', how='left')

# Fonction qui fait ce qu'on voulait faire avec ffill et bfill mais a la place prends la valeur la plus proche
def fill_closest_value_all_columns(df):
    """Fill NaN values with the closest value for all numeric columns in the DataFrame."""
    filled_df = df.copy()
    
    for column in filled_df.columns:
        if filled_df[column].dtype.kind in 'biufc':  # Numeric columns
            non_nan_values = filled_df[column].dropna()
            
            def find_closest(value):
                if pd.isna(value):
                    closest_value = non_nan_values.iloc[(non_nan_values - value).abs().argmin()]
                    return closest_value
                return value
            
            filled_df[column] = filled_df[column].apply(find_closest)
    
    return filled_df

# Apply the function to the DataFrame
filled_external_conditions = fill_closest_value_all_columns(full_external_conditions)

  date_range = pd.date_range(start=external_conditions['date'].min(), end=external_conditions['date'].max(), freq='H')
  closest_value = non_nan_values.iloc[(non_nan_values - value).abs().argmin()]


In [7]:
# Merge the DataFrames
merged_conditions = pd.merge(data, filled_external_conditions, on='date', how='left')

merged_conditions = utils._column_rename(merged_conditions)


merged_conditions_test = pd.merge(data_test, filled_external_conditions, on='date', how='left')

merged_conditions_test = utils._column_rename(merged_conditions_test)

In [8]:
# Ensure "date" is in datetime format
merged_conditions["date"] = pd.to_datetime(merged_conditions["date"], errors="coerce")

# Drop rows with invalid datetime entries
df = merged_conditions.dropna(subset=["date"])

# Extract date and time features
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["weekday"] = df["date"].dt.dayofweek
df["day"] = df["date"].dt.day
df["hour"] = df["date"].dt.hour
df["is_weekend"] = (df["weekday"] >= 5).astype(int)

# Handle school and public holidays
unique_dates = df["date"].dt.date.unique()
d = SchoolHolidayDates()
f = JoursFeries()

try:
    dict_school_holidays = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates}
    df["is_school_holiday"] = df["date"].dt.date.map(dict_school_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    df["is_school_holiday"] = 0

try:
    dict_public_holidays = {date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates}
    df["is_public_holiday"] = df["date"].dt.date.map(dict_public_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    df["is_public_holiday"] = 0

# Ensure "date" is in datetime format
merged_conditions_test["date"] = pd.to_datetime(merged_conditions_test["date"], errors="coerce")

# Drop rows with invalid datetime entries
df_test = merged_conditions_test.dropna(subset=["date"])

# Extract date and time features
df_test["year"] = df_test["date"].dt.year
df_test["month"] = df_test["date"].dt.month
df_test["weekday"] = df_test["date"].dt.dayofweek
df_test["day"] = df_test["date"].dt.day
df_test["hour"] = df_test["date"].dt.hour
df_test["is_weekend"] = (df_test["weekday"] >= 5).astype(int)

# Handle school and public holidays
unique_dates = df_test["date"].dt.date.unique()
d = SchoolHolidayDates()
f = JoursFeries()

try:
    dict_school_holidays = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates}
    df_test["is_school_holiday"] = df_test["date"].dt.date.map(dict_school_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    df_test["is_school_holiday"] = 0

try:
    dict_public_holidays = {date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates}
    df_test["is_public_holiday"] = df_test["date"].dt.date.map(dict_public_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    df_test["is_public_holiday"] = 0

In [9]:
TableReport(df)

Processing column  59 / 59


Unnamed: 0_level_0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count,Station Number,Sea Level Pressure (hPa),Pressure Tendency (hPa/3h),Pressure Tendency Code,Wind Direction (°),Wind Speed (m/s),Air Temperature (°C),Dew Point Temperature (°C),Relative Humidity (%),Visibility (m),Present Weather Code,Past Weather Code 1,Past Weather Code 2,Total Cloud Cover (oktas),Cloud Base Height (m),Lowest Cloud Base Height (m),Low Cloud Type,Medium Cloud Type,High Cloud Type,Station Level Pressure (hPa),24h Pressure Tendency (hPa),10min Max Wind Gust (m/s),Max Wind Gust (m/s),Measurement Period Duration,Ground State,Snow Height (cm),New Snow Depth (cm),New Snowfall Duration (hours),"Rainfall (1h, mm)","Rainfall (3h, mm)","Rainfall (6h, mm)","Rainfall (12h, mm)","Rainfall (24h, mm)",Layer 1 Cloud Cover (oktas),Layer 1 Cloud Type,Layer 1 Cloud Base Height (m),Layer 2 Cloud Cover (oktas),Layer 2 Cloud Type,Layer 2 Cloud Base Height (m),year,month,weekday,day,hour,is_weekend,is_school_holiday,is_public_holiday
Unnamed: 0_level_1,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count,Station Number,Sea Level Pressure (hPa),Pressure Tendency (hPa/3h),Pressure Tendency Code,Wind Direction (°),Wind Speed (m/s),Air Temperature (°C),Dew Point Temperature (°C),Relative Humidity (%),Visibility (m),Present Weather Code,Past Weather Code 1,Past Weather Code 2,Total Cloud Cover (oktas),Cloud Base Height (m),Lowest Cloud Base Height (m),Low Cloud Type,Medium Cloud Type,High Cloud Type,Station Level Pressure (hPa),24h Pressure Tendency (hPa),10min Max Wind Gust (m/s),Max Wind Gust (m/s),Measurement Period Duration,Ground State,Snow Height (cm),New Snow Depth (cm),New Snowfall Duration (hours),"Rainfall (1h, mm)","Rainfall (3h, mm)","Rainfall (6h, mm)","Rainfall (12h, mm)","Rainfall (24h, mm)",Layer 1 Cloud Cover (oktas),Layer 1 Cloud Type,Layer 1 Cloud Base Height (m),Layer 2 Cloud Cover (oktas),Layer 2 Cloud Type,Layer 2 Cloud Base Height (m),year,month,weekday,day,hour,is_weekend,is_school_holiday,is_public_holiday
0.0,100049407-353255860,152 boulevard du Montparnasse E-O,100049407.0,152 boulevard du Montparnasse,4.0,2020-09-01 01:00:00,2018-12-07 00:00:00,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,1.6094379124341005,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,2020.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0
1.0,100049407-353255859,152 boulevard du Montparnasse O-E,100049407.0,152 boulevard du Montparnasse,3.0,2020-09-01 01:00:00,2018-12-07 00:00:00,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,1.3862943611198906,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,2020.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0
2.0,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719.0,18 quai de l'Hôtel de Ville,0.0,2020-09-01 01:00:00,2017-07-12 00:00:00,"48.85372,2.35702",Y2H19027732,48.85372,2.35702,0.0,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,2020.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0
3.0,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719.0,18 quai de l'Hôtel de Ville,1.0,2020-09-01 01:00:00,2017-07-12 00:00:00,"48.85372,2.35702",Y2H19027732,48.85372,2.35702,0.6931471805599453,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,2020.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0
4.0,100063175-353277233,20 Avenue de Clichy NO-SE,100063175.0,20 Avenue de Clichy,7.0,2020-09-01 01:00:00,2020-07-22 00:00:00,"48.88529,2.32666",Y2H20073268,48.88529,2.32666,2.079441541679836,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,2020.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
496822.0,100057329-103057329,Totem 85 quai d'Austerlitz SE-NO,100057329.0,Totem 85 quai d'Austerlitz,42.0,2021-09-09 23:00:00,2020-02-18 00:00:00,"48.84201,2.36729",YTH19111508,48.84201,2.36729,3.7612001156935615,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,2021.0,9.0,3.0,9.0,23.0,0.0,0.0,0.0
496823.0,100057380-104057380,Totem Cours la Reine E-O,100057380.0,Totem Cours la Reine,22.0,2021-09-09 23:00:00,2020-02-11 00:00:00,"48.86462,2.31444",YTH19111509,48.86462,2.31444,3.1354942159291497,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,2021.0,9.0,3.0,9.0,23.0,0.0,0.0,0.0
496824.0,100057380-103057380,Totem Cours la Reine O-E,100057380.0,Totem Cours la Reine,32.0,2021-09-09 23:00:00,2020-02-11 00:00:00,"48.86462,2.31444",YTH19111509,48.86462,2.31444,3.49650756146648,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,2021.0,9.0,3.0,9.0,23.0,0.0,0.0,0.0
496825.0,100042374-110042374,Voie Georges Pompidou NE-SO,100042374.0,Voie Georges Pompidou,9.0,2021-09-09 23:00:00,2017-12-15 00:00:00,"48.8484,2.27586",Y2H21025335,48.8484,2.27586,2.302585092994046,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,2021.0,9.0,3.0,9.0,23.0,0.0,0.0,0.0

Column,Column name,dtype,Null values,Unique values,Mean,Std,Min,Median,Max
0,counter_id,CategoricalDtype,0 (0.0%),56 (< 0.1%),,,,,
1,counter_name,CategoricalDtype,0 (0.0%),56 (< 0.1%),,,,,
2,site_id,Int64DType,0 (0.0%),,105000000.0,32100000.0,100007049,100056226.0,300014702
3,site_name,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
4,bike_count,Float64DType,0 (0.0%),,60.2,87.6,0.00,29.0,1.30e+03
5,date,DateTime64DType,0 (0.0%),,,,2020-09-01T01:00:00,,2021-09-09T23:00:00
6,counter_installation_date,DateTime64DType,0 (0.0%),,,,2013-01-18T00:00:00,,2020-11-29T00:00:00
7,coordinates,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
8,counter_technical_id,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
9,latitude,Float64DType,0 (0.0%),,48.9,0.0186,48.8,48.9,48.9

Column 1,Column 2,Cramér's V
coordinates,counter_technical_id,1.0
weekday,is_weekend,1.0
site_name,counter_technical_id,1.0
site_name,coordinates,1.0
Sea Level Pressure (hPa),Station Level Pressure (hPa),0.98
Lowest Cloud Base Height (m),Layer 1 Cloud Base Height (m),0.947
counter_id,counter_name,0.943
Ground State,Snow Height (cm),0.919
year,month,0.828
Ground State,New Snow Depth (cm),0.816


In [10]:
TableReport(df_test)

Processing column  57 / 57


Unnamed: 0_level_0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,Station Number,Sea Level Pressure (hPa),Pressure Tendency (hPa/3h),Pressure Tendency Code,Wind Direction (°),Wind Speed (m/s),Air Temperature (°C),Dew Point Temperature (°C),Relative Humidity (%),Visibility (m),Present Weather Code,Past Weather Code 1,Past Weather Code 2,Total Cloud Cover (oktas),Cloud Base Height (m),Lowest Cloud Base Height (m),Low Cloud Type,Medium Cloud Type,High Cloud Type,Station Level Pressure (hPa),24h Pressure Tendency (hPa),10min Max Wind Gust (m/s),Max Wind Gust (m/s),Measurement Period Duration,Ground State,Snow Height (cm),New Snow Depth (cm),New Snowfall Duration (hours),"Rainfall (1h, mm)","Rainfall (3h, mm)","Rainfall (6h, mm)","Rainfall (12h, mm)","Rainfall (24h, mm)",Layer 1 Cloud Cover (oktas),Layer 1 Cloud Type,Layer 1 Cloud Base Height (m),Layer 2 Cloud Cover (oktas),Layer 2 Cloud Type,Layer 2 Cloud Base Height (m),year,month,weekday,day,hour,is_weekend,is_school_holiday,is_public_holiday
Unnamed: 0_level_1,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,Station Number,Sea Level Pressure (hPa),Pressure Tendency (hPa/3h),Pressure Tendency Code,Wind Direction (°),Wind Speed (m/s),Air Temperature (°C),Dew Point Temperature (°C),Relative Humidity (%),Visibility (m),Present Weather Code,Past Weather Code 1,Past Weather Code 2,Total Cloud Cover (oktas),Cloud Base Height (m),Lowest Cloud Base Height (m),Low Cloud Type,Medium Cloud Type,High Cloud Type,Station Level Pressure (hPa),24h Pressure Tendency (hPa),10min Max Wind Gust (m/s),Max Wind Gust (m/s),Measurement Period Duration,Ground State,Snow Height (cm),New Snow Depth (cm),New Snowfall Duration (hours),"Rainfall (1h, mm)","Rainfall (3h, mm)","Rainfall (6h, mm)","Rainfall (12h, mm)","Rainfall (24h, mm)",Layer 1 Cloud Cover (oktas),Layer 1 Cloud Type,Layer 1 Cloud Base Height (m),Layer 2 Cloud Cover (oktas),Layer 2 Cloud Type,Layer 2 Cloud Base Height (m),year,month,weekday,day,hour,is_weekend,is_school_holiday,is_public_holiday
0.0,100049407-353255860,152 boulevard du Montparnasse E-O,100049407.0,152 boulevard du Montparnasse,2021-09-10 01:00:00,2018-12-07 00:00:00,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,2021.0,9.0,4.0,10.0,1.0,0.0,0.0,0.0
1.0,100049407-353255859,152 boulevard du Montparnasse O-E,100049407.0,152 boulevard du Montparnasse,2021-09-10 01:00:00,2018-12-07 00:00:00,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,2021.0,9.0,4.0,10.0,1.0,0.0,0.0,0.0
2.0,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719.0,18 quai de l'Hôtel de Ville,2021-09-10 01:00:00,2017-07-12 00:00:00,"48.85372,2.35702",Y2H19027732,48.85372,2.35702,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,2021.0,9.0,4.0,10.0,1.0,0.0,0.0,0.0
3.0,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719.0,18 quai de l'Hôtel de Ville,2021-09-10 01:00:00,2017-07-12 00:00:00,"48.85372,2.35702",Y2H19027732,48.85372,2.35702,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,2021.0,9.0,4.0,10.0,1.0,0.0,0.0,0.0
4.0,100063175-353277233,20 Avenue de Clichy NO-SE,100063175.0,20 Avenue de Clichy,2021-09-10 01:00:00,2020-07-22 00:00:00,"48.88529,2.32666",Y2H20073268,48.88529,2.32666,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,2021.0,9.0,4.0,10.0,1.0,0.0,0.0,0.0
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
51435.0,100057329-103057329,Totem 85 quai d'Austerlitz SE-NO,100057329.0,Totem 85 quai d'Austerlitz,2021-10-18 21:00:00,2020-02-18 00:00:00,"48.84201,2.36729",YTH19111508,48.84201,2.36729,7149.0,102160.0,0.0,0.0,170.0,3.6,288.35,287.55,95.0,10000.0,61.0,6.0,6.0,100.0,7.0,450.0,38.0,23.0,11.0,101080.0,60.0,5.6,5.6,-10.0,2.0,0.0,0.0,-30.0,0.8,1.8,1.8,1.8,1.8,1.0,8.0,480.0,7.0,6.0,1440.0,2021.0,10.0,0.0,18.0,21.0,0.0,0.0,0.0
51436.0,100057380-104057380,Totem Cours la Reine E-O,100057380.0,Totem Cours la Reine,2021-10-18 21:00:00,2020-02-11 00:00:00,"48.86462,2.31444",YTH19111509,48.86462,2.31444,7149.0,102160.0,0.0,0.0,170.0,3.6,288.35,287.55,95.0,10000.0,61.0,6.0,6.0,100.0,7.0,450.0,38.0,23.0,11.0,101080.0,60.0,5.6,5.6,-10.0,2.0,0.0,0.0,-30.0,0.8,1.8,1.8,1.8,1.8,1.0,8.0,480.0,7.0,6.0,1440.0,2021.0,10.0,0.0,18.0,21.0,0.0,0.0,0.0
51437.0,100057380-103057380,Totem Cours la Reine O-E,100057380.0,Totem Cours la Reine,2021-10-18 21:00:00,2020-02-11 00:00:00,"48.86462,2.31444",YTH19111509,48.86462,2.31444,7149.0,102160.0,0.0,0.0,170.0,3.6,288.35,287.55,95.0,10000.0,61.0,6.0,6.0,100.0,7.0,450.0,38.0,23.0,11.0,101080.0,60.0,5.6,5.6,-10.0,2.0,0.0,0.0,-30.0,0.8,1.8,1.8,1.8,1.8,1.0,8.0,480.0,7.0,6.0,1440.0,2021.0,10.0,0.0,18.0,21.0,0.0,0.0,0.0
51438.0,100042374-110042374,Voie Georges Pompidou NE-SO,100042374.0,Voie Georges Pompidou,2021-10-18 21:00:00,2017-12-15 00:00:00,"48.8484,2.27586",Y2H21025335,48.8484,2.27586,7149.0,102160.0,0.0,0.0,170.0,3.6,288.35,287.55,95.0,10000.0,61.0,6.0,6.0,100.0,7.0,450.0,38.0,23.0,11.0,101080.0,60.0,5.6,5.6,-10.0,2.0,0.0,0.0,-30.0,0.8,1.8,1.8,1.8,1.8,1.0,8.0,480.0,7.0,6.0,1440.0,2021.0,10.0,0.0,18.0,21.0,0.0,0.0,0.0

Column,Column name,dtype,Null values,Unique values,Mean,Std,Min,Median,Max
0,counter_id,CategoricalDtype,0 (0.0%),56 (0.1%),,,,,
1,counter_name,CategoricalDtype,0 (0.0%),56 (0.1%),,,,,
2,site_id,Int64DType,0 (0.0%),,107000000.0,37400000.0,100007049,100056327.0,300014702
3,site_name,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
4,date,DateTime64DType,0 (0.0%),,,,2021-09-10T01:00:00,,2021-10-18T21:00:00
5,counter_installation_date,DateTime64DType,0 (0.0%),,,,2013-01-18T00:00:00,,2020-11-29T00:00:00
6,coordinates,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
7,counter_technical_id,CategoricalDtype,0 (0.0%),30 (< 0.1%),,,,,
8,latitude,Float64DType,0 (0.0%),,48.9,0.0186,48.8,48.9,48.9
9,longitude,Float64DType,0 (0.0%),,2.34,0.0383,2.27,2.35,2.41

Column 1,Column 2,Cramér's V
counter_id,counter_name,1.0
site_name,counter_technical_id,1.0
site_name,coordinates,1.0
coordinates,counter_technical_id,1.0
weekday,is_weekend,1.0
Sea Level Pressure (hPa),Station Level Pressure (hPa),0.992
Past Weather Code 1,Past Weather Code 2,0.931
Lowest Cloud Base Height (m),Layer 1 Cloud Base Height (m),0.929
Ground State,"Rainfall (3h, mm)",0.789
Ground State,"Rainfall (24h, mm)",0.765


Decide to remove site id, site name and counter id to just keep counter name to reduce complexity and the data as they all provide more or less the same information. Counter is more precise as we will be able to calculate the number of times a counter is used in a given site.

## Model training with Elastic Net (To find the best features)

Elastic net can handle multicolinearity and shrinks the less important features to zero. It is a combination of L1 and L2 regularization. It is a linear regression model trained with L1 and L2 prior as regularizer. This combination allows for learning a sparse model where few of the weights are non-zero like Lasso, while still maintaining the regularization properties of Ridge.

In [None]:
# Define the features and target variable
X = merged_data.drop(columns=[
                            'bike_count', 'log_bike_count',
                            'counter_id', 'site_id', 'site_name', 'counter_technical_id',
                            'coordinates',
                            'Station Number', 'Measurement Period Duration',
                            'date', 'Date and Time', 'counter_installation_date',
                    ])



y = merged_data['log_bike_count']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer with OneHotEncoder for 'counter_name' and SimpleImputer for numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['counter_name'])
    ])

# Create a pipeline with the preprocessor, standard scaler, and ElasticNet regression
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features

elasticnet_feature_importance = pd.Series(elasticnet_coefficients, index=feature_names).sort_values(ascending=False)

In [None]:
# Print the feature importances
print(elasticnet_feature_importance)

In [None]:
# Filter features with non-zero importance
non_zero_features = elasticnet_feature_importance[elasticnet_feature_importance != 0].index.tolist()
# Keep only the non-zero features in the merged dataset
# Keep all the variables apart from the columns which are derived from a one hot encoder
non_zero_features = [feature for feature in non_zero_features if not feature.startswith('counter_name_')]
merged_data_filtered = merged_data[['counter_name', 'bike_count', 'log_bike_count'] + non_zero_features]
test_merged_data_filtered = test_merged_data[['counter_name'] + non_zero_features]


# Display the new dataframe
merged_data_filtered

In [None]:
# Subtract 273 from all values in the "Air Temperature (°C)" column
merged_data_filtered.loc[:,'Air Temperature (°C)'] -= 273
test_merged_data_filtered.loc[:,'Air Temperature (°C)'] -= 273
merged_data_filtered
test_merged_data_filtered

In [None]:
from xgboost import XGBRegressor

# Define the features and target variable
X = merged_data_filtered.drop(columns=[
                            'bike_count', 'log_bike_count',
                    ])

y = merged_data_filtered['log_bike_count']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer with OneHotEncoder for 'counter_name' and SimpleImputer for numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['counter_name'])
    ])

# Create a pipeline with the preprocessor, standard scaler (with_mean=False), and XGBRegressor
xgboostpipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False)),
    ('regressor', XGBRegressor())
])

# Fit the XGBRegressor pipeline on the training data
xgboostpipeline.fit(X_train, y_train)

# Print the score of the XGBRegressor model on the test data
print(f"XGBRegressor model score: {xgboostpipeline.score(X_test, y_test)}")

# Output information about the XGBRegressor model
xgboost_feature_importances = xgboostpipeline.named_steps['regressor'].feature_importances_

# Get feature names after preprocessing
feature_names = (xgboostpipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 xgboostpipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features


In [None]:
# Predict the log_bike_count for the test_merged_data_filtered dataframe
y_pred = xgboostpipeline.predict(test_merged_data_filtered)

# Display the dataframe with predictions
y_pred

In [None]:
submission = pd.DataFrame({
    'log_bike_count': y_pred
}).reset_index(drop=True)
submission.index.name = 'Id'

submission.to_csv('/Users/felix/Downloads/test.csv')

