In [8]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import ydata_profiling
from skrub import TableReport
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from vacances_scolaires_france import SchoolHolidayDates
from datetime import date
from jours_feries_france import JoursFeries

In [9]:
data = pd.read_parquet(Path("data") / "train.parquet")
data.shape

(496827, 12)

In [10]:
external_conditions = pd.read_csv('data/external_data.csv')
external_conditions

Unnamed: 0,numer_sta,date,pmer,tend,cod_tend,dd,ff,t,td,u,...,hnuage1,nnuage2,ctype2,hnuage2,nnuage3,ctype3,hnuage3,nnuage4,ctype4,hnuage4
0,7149,2021-01-01 00:00:00,100810,80,1,270,1.8,272.75,272.15,96,...,600.0,,,,,,,,,
1,7149,2021-01-01 03:00:00,100920,110,3,300,1.7,271.25,270.95,98,...,1500.0,2.0,3.0,3000.0,,,,,,
2,7149,2021-01-01 06:00:00,100950,30,3,290,2.6,271.95,271.65,98,...,480.0,4.0,6.0,2000.0,6.0,3.0,3000.0,,,
3,7149,2021-01-01 09:00:00,101100,150,2,280,1.7,272.45,272.05,97,...,1740.0,3.0,3.0,2800.0,,,,,,
4,7149,2021-01-01 12:00:00,101110,30,0,50,1.0,276.95,274.15,82,...,330.0,4.0,6.0,570.0,7.0,6.0,810.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,7149,2020-09-30 09:00:00,101540,-30,8,230,4.4,289.95,286.85,82,...,400.0,7.0,6.0,2200.0,,,,,,
3318,7149,2020-09-30 12:00:00,101320,-210,8,190,4.9,292.05,285.55,66,...,870.0,7.0,6.0,1900.0,,,,,,
3319,7149,2020-09-30 15:00:00,101140,-180,7,190,4.1,291.55,286.45,72,...,820.0,7.0,6.0,2200.0,,,,,,
3320,7149,2020-09-30 18:00:00,101020,-130,6,190,2.7,290.15,285.25,73,...,2160.0,,,,,,,,,


In [11]:
# Step 1: Sort the `external_conditions` DataFrame by the `date` column
external_conditions = external_conditions.sort_values(by='date')

# Step 2: Remove duplicate entries based on the `date` column
external_conditions = external_conditions.drop_duplicates(subset='date')

# Step 3: Convert the 'date' column to datetime
external_conditions['date'] = pd.to_datetime(external_conditions['date'])

# Step 4: Create a complete date range from the minimum to the maximum date in the DataFrame
date_range = pd.date_range(start=external_conditions['date'].min(), end=external_conditions['date'].max(), freq='H')

# Step 5: Create a DataFrame from the date_range
date_range_df = pd.DataFrame(date_range, columns=['date'])

# Step 6: Merge the date_range DataFrame with the external_conditions DataFrame on the 'date' column
full_external_conditions = pd.merge(date_range_df, external_conditions, on='date', how='left')

# Fonction qui fait ce qu'on voulait faire avec ffill et bfill mais a la place prends la valeur la plus proche
def fill_closest_value_all_columns(df):
    """Fill NaN values with the closest value for all numeric columns in the DataFrame."""
    filled_df = df.copy()
    
    for column in filled_df.columns:
        if filled_df[column].dtype.kind in 'biufc':  # Numeric columns
            non_nan_values = filled_df[column].dropna()
            
            def find_closest(value):
                if pd.isna(value):
                    closest_value = non_nan_values.iloc[(non_nan_values - value).abs().argmin()]
                    return closest_value
                return value
            
            filled_df[column] = filled_df[column].apply(find_closest)
    
    return filled_df

# Apply the function to the DataFrame
filled_external_conditions = fill_closest_value_all_columns(full_external_conditions)


  date_range = pd.date_range(start=external_conditions['date'].min(), end=external_conditions['date'].max(), freq='H')
  closest_value = non_nan_values.iloc[(non_nan_values - value).abs().argmin()]


ValueError: attempt to get argmin of an empty sequence

In [None]:
TableReport(filled_external_conditions)

Processing column  49 / 49


Unnamed: 0_level_0,date,numer_sta,pmer,tend,cod_tend,dd,ff,t,td,u,vv,ww,w1,w2,n,nbas,hbas,cl,cm,ch,pres,tend24,tn12,tx12,tminsol,raf10,rafper,per,etat_sol,ht_neige,ssfrai,perssfrai,rr1,rr3,rr6,rr12,rr24,nnuage1,ctype1,hnuage1,nnuage2,ctype2,hnuage2,nnuage3,ctype3,hnuage3,nnuage4,ctype4,hnuage4
Unnamed: 0_level_1,date,numer_sta,pmer,tend,cod_tend,dd,ff,t,td,u,vv,ww,w1,w2,n,nbas,hbas,cl,cm,ch,pres,tend24,tn12,tx12,tminsol,raf10,rafper,per,etat_sol,ht_neige,ssfrai,perssfrai,rr1,rr3,rr6,rr12,rr24,nnuage1,ctype1,hnuage1,nnuage2,ctype2,hnuage2,nnuage3,ctype3,hnuage3,nnuage4,ctype4,hnuage4
0.0,2020-09-01 00:00:00,7149.0,102050.0,-10.0,8.0,340.0,1.6,285.75,282.55,81.0,30000.0,1.0,0.0,0.0,0.0,0.0,800.0,30.0,20.0,10.0,100960.0,120.0,283.65,291.45,284.15,2.4,3.1,-10.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,3.0,8.0,810.0,6.0,6.0,1200.0,1.0,0.0,8000.0,4.0,9.0,7800.0
1.0,2020-09-01 01:00:00,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,283.65,291.45,284.15,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,1.0,0.0,8000.0,4.0,9.0,7800.0
2.0,2020-09-01 02:00:00,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,283.65,291.45,284.15,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,1.0,0.0,8000.0,4.0,9.0,7800.0
3.0,2020-09-01 03:00:00,7149.0,101990.0,-60.0,6.0,290.0,1.1,283.95,282.05,88.0,25000.0,2.0,0.0,0.0,0.0,0.0,800.0,30.0,20.0,10.0,100900.0,0.0,283.65,291.45,284.15,1.5,1.5,-10.0,0.0,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,0.0,3.0,8.0,810.0,6.0,6.0,1200.0,1.0,0.0,8000.0,4.0,9.0,7800.0
4.0,2020-09-01 04:00:00,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,283.65,291.45,284.15,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,1.0,0.0,8000.0,4.0,9.0,7800.0
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9968.0,2021-10-21 08:00:00,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,283.65,291.45,284.15,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,1.0,0.0,8000.0,4.0,9.0,7800.0
9969.0,2021-10-21 09:00:00,7149.0,101230.0,230.0,1.0,240.0,7.0,286.05,280.75,70.0,30000.0,1.0,0.0,0.0,25.0,1.0,800.0,31.0,23.0,11.0,100150.0,70.0,283.65,291.45,284.15,11.1,12.5,-10.0,0.0,0.0,0.0,-30.0,0.0,0.0,-0.1,2.0,2.0,1.0,8.0,700.0,1.0,3.0,4000.0,1.0,0.0,8000.0,4.0,9.0,7800.0
9970.0,2021-10-21 10:00:00,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,283.65,291.45,284.15,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,1.0,0.0,8000.0,4.0,9.0,7800.0
9971.0,2021-10-21 11:00:00,7149.0,101160.0,-60.0,8.0,240.0,4.6,286.25,282.65,79.0,7000.0,61.0,6.0,6.0,90.0,7.0,800.0,38.0,23.0,11.0,100090.0,160.0,283.65,291.45,284.15,7.9,13.2,-10.0,1.0,0.0,0.0,-60.0,0.6,0.6,0.6,0.6,2.6,3.0,8.0,810.0,6.0,6.0,1200.0,1.0,0.0,8000.0,4.0,9.0,7800.0

Column,Column name,dtype,Null values,Unique values,Mean,Std,Min,Median,Max
0,date,DateTime64DType,0 (0.0%),,,,2020-09-01T00:00:00,,2021-10-21T12:00:00
1,numer_sta,Float64DType,0 (0.0%),,7150.0,0.0,,,
2,pmer,Float64DType,0 (0.0%),,101000.0,587.0,9.73e+04,101000.0,1.04e+05
3,tend,Float64DType,0 (0.0%),,-40.1,75.9,-750.,-60.0,720.
4,cod_tend,Float64DType,0 (0.0%),,6.76,2.35,0.00,8.0,8.00
5,dd,Float64DType,0 (0.0%),,221.0,65.3,0.00,240.0,360.
6,ff,Float64DType,0 (0.0%),,4.29,1.24,0.00,4.6,14.6
7,t,Float64DType,0 (0.0%),,286.0,3.96,268.,286.0,307.
8,td,Float64DType,0 (0.0%),,282.0,3.34,261.,283.0,293.
9,u,Float64DType,0 (0.0%),,77.6,10.0,24.0,79.0,100.

Column 1,Column 2,Cramér's V
pmer,pres,0.97
nnuage4,hnuage4,0.876
hbas,hnuage1,0.821
ctype4,hnuage4,0.75
etat_sol,ssfrai,0.75
w1,w2,0.747
ff,raf10,0.71
ht_neige,ssfrai,0.707
cl,ctype1,0.651
ww,w1,0.648


In [None]:
test_data = pd.read_parquet(Path("data") / "final_test.parquet")
test_data

In [None]:
# Dictionary mapping column names to their practical explanations
column_name_mapping = {
    'numer_sta': 'Station Number',
    'date': 'Date and Time',
    'pmer': 'Sea Level Pressure (hPa)',
    'tend': 'Pressure Tendency (hPa/3h)',
    'cod_tend': 'Pressure Tendency Code',
    'dd': 'Wind Direction (°)',
    'ff': 'Wind Speed (m/s)',
    't': 'Air Temperature (°C)',
    'td': 'Dew Point Temperature (°C)',
    'u': 'Relative Humidity (%)',
    'vv': 'Visibility (m)',
    'ww': 'Present Weather Code',
    'w1': 'Past Weather Code 1',
    'w2': 'Past Weather Code 2',
    'n': 'Total Cloud Cover (oktas)',
    'nbas': 'Cloud Base Height (m)',
    'hbas': 'Lowest Cloud Base Height (m)',
    'cl': 'Low Cloud Type',
    'cm': 'Medium Cloud Type',
    'ch': 'High Cloud Type',
    'pres': 'Station Level Pressure (hPa)',
    'niv_bar': 'Barometer Altitude (m)',
    'geop': 'Geopotential Height (m)',
    'tend24': '24h Pressure Tendency (hPa)',
    'tn12': '12h Minimum Temperature (°C)',
    'tn24': '24h Minimum Temperature (°C)',
    'tx12': '12h Maximum Temperature (°C)',
    'tx24': '24h Maximum Temperature (°C)',
    'tminsol': 'Minimum Soil Temperature (°C)',
    'sw': 'Sunshine Duration (hours)',
    'tw': 'Wet Bulb Temperature (°C)',
    'raf10': '10min Max Wind Gust (m/s)',
    'rafper': 'Max Wind Gust (m/s)',
    'per': 'Measurement Period Duration',
    'etat_sol': 'Ground State',
    'ht_neige': 'Snow Height (cm)',
    'ssfrai': 'New Snow Depth (cm)',
    'perssfrai': 'New Snowfall Duration (hours)',
    'rr1': 'Rainfall (1h, mm)',
    'rr3': 'Rainfall (3h, mm)',
    'rr6': 'Rainfall (6h, mm)',
    'rr12': 'Rainfall (12h, mm)',
    'rr24': 'Rainfall (24h, mm)',
    'phenspe1': 'Special Weather Phenomenon 1',
    'phenspe2': 'Special Weather Phenomenon 2',
    'phenspe3': 'Special Weather Phenomenon 3',
    'phenspe4': 'Special Weather Phenomenon 4',
    'nnuage1': 'Layer 1 Cloud Cover (oktas)',
    'ctype1': 'Layer 1 Cloud Type',
    'hnuage1': 'Layer 1 Cloud Base Height (m)',
    'nnuage2': 'Layer 2 Cloud Cover (oktas)',
    'ctype2': 'Layer 2 Cloud Type',
    'hnuage2': 'Layer 2 Cloud Base Height (m)',
    'nnuage3': 'Layer 3 Cloud Cover (oktas)',
    'ctype3': 'Layer 3 Cloud Type',
    'hnuage3': 'Layer 3 Cloud Base Height (m)',
    'nnuage4': 'Layer 4 Cloud Cover (oktas)',
    'ctype4': 'Layer 4 Cloud Type',
    'hnuage4': 'Layer 4 Cloud Base Height (m)',
}

# Rename columns in the DataFrame
external_conditions = external_conditions.rename(columns=column_name_mapping)


In [None]:
threshold = len(external_conditions) * 0.8
external_conditions = external_conditions.loc[:, external_conditions.isnull().sum() < threshold]
TableReport(external_conditions)

In [None]:
TableReport(data)

In [None]:
TableReport(test_data)

In [None]:
# Convert 'Date and Time' column in external_conditions to datetime
external_conditions['Date and Time'] = pd.to_datetime(external_conditions['Date and Time'])

# Merge the dataframes
merged_data = pd.merge(data, external_conditions, left_on='date', right_on='Date and Time', how='left')
test_merged_data = pd.merge(test_data, external_conditions, left_on='date', right_on='Date and Time', how='left')

# Display the merged dataframe
test_merged_data

In [None]:
d = SchoolHolidayDates()

In [None]:
# Ensure "Date and Time" is in datetime format
merged_data["Date and Time"] = pd.to_datetime(merged_data["Date and Time"], errors="coerce")

# Check for missing or invalid datetime entries
if merged_data["Date and Time"].isnull().any():
    print("Warning: Missing or invalid datetime entries found.")
    # Handle missing values if needed
    merged_data = merged_data.dropna(subset=["Date and Time"])

# Extract date and time features
merged_data["measurement_date"] = merged_data["Date and Time"].dt.date
merged_data["measurement_year"] = merged_data["Date and Time"].dt.year
merged_data["measurement_month"] = merged_data["Date and Time"].dt.month
merged_data["measurement_day_of_week"] = merged_data["Date and Time"].dt.dayofweek
merged_data["measurement_day"] = merged_data["Date and Time"].dt.day
merged_data["measurement_hour"] = merged_data["Date and Time"].dt.hour

# Determine if the day is a weekend
merged_data["measurement_is_weekend"] = np.where(
    merged_data["measurement_day_of_week"] >= 5, 1, 0
)

# Handle school holidays
unique_dates = merged_data["measurement_date"].unique()

# Example holiday mapping function
d = JoursFeries()
try:
    dict_school_holidays = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates}
    merged_data["is_school_holiday"] = merged_data["measurement_date"].map(
        dict_school_holidays
    )
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    merged_data["is_school_holiday"] = 0  # Fallback to default value

# Handle public holidays
f = JoursFeries()
try:
    dict_public_holidays = {
        date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates
    }
    merged_data["is_public_holiday"] = merged_data["measurement_date"].map(
        dict_public_holidays
    )
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    merged_data["is_public_holiday"] = 0  # Fallback to default value

# Extract additional date and time features for the counter
merged_data["counter_year"] = merged_data["Date and Time"].dt.year
merged_data["counter_month"] = merged_data["Date and Time"].dt.month
merged_data["counter_day"] = merged_data["Date and Time"].dt.day
merged_data["counter_hour"] = merged_data["Date and Time"].dt.hour

# Check the first rows for a specific date
print(merged_data[merged_data["measurement_date"].astype("str") == "2021-01-01"].head())



In [None]:
# Ensure "Date and Time" is in datetime format
test_merged_data["Date and Time"] = pd.to_datetime(test_merged_data["Date and Time"], errors="coerce")

# Extract date and time features
test_merged_data["measurement_date"] = test_merged_data["Date and Time"].dt.date
test_merged_data["measurement_year"] = test_merged_data["Date and Time"].dt.year
test_merged_data["measurement_month"] = test_merged_data["Date and Time"].dt.month
test_merged_data["measurement_day_of_week"] = test_merged_data["Date and Time"].dt.dayofweek
test_merged_data["measurement_day"] = test_merged_data["Date and Time"].dt.day
test_merged_data["measurement_hour"] = test_merged_data["Date and Time"].dt.hour

# Determine if the day is a weekend
test_merged_data["measurement_is_weekend"] = np.where(
    test_merged_data["measurement_day_of_week"] >= 5, 1, 0
)

# Handle school holidays
unique_dates = test_merged_data["measurement_date"].unique()

# Example holiday mapping function
d = SchoolHolidayDates()
try:
    dict_school_holidays = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates}
    test_merged_data["is_school_holiday"] = test_merged_data["measurement_date"].map(
        dict_school_holidays
    )
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    test_merged_data["is_school_holiday"] = 0  # Fallback to default value

# Handle public holidays
f = JoursFeries()
try:
    dict_public_holidays = {
        date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates
    }
    test_merged_data["is_public_holiday"] = test_merged_data["measurement_date"].map(
        dict_public_holidays
    )
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    test_merged_data["is_public_holiday"] = 0  # Fallback to default value

# Extract additional date and time features for the counter
test_merged_data["counter_year"] = test_merged_data["Date and Time"].dt.year
test_merged_data["counter_month"] = test_merged_data["Date and Time"].dt.month
test_merged_data["counter_day"] = test_merged_data["Date and Time"].dt.day
test_merged_data["counter_hour"] = test_merged_data["Date and Time"].dt.hour

# Check the first rows for a specific date (if needed)
print(test_merged_data[test_merged_data["measurement_date"].astype("str") == "2021-01-01"].head())


In [None]:
TableReport(test_merged_data)

Decide to remove site id, site name and counter id to just keep counter name to reduce complexity and the data as they all provide more or less the same information. Counter is more precise as we will be able to calculate the number of times a counter is used in a given site.

## Model training with Elastic Net (To find the best features)

Elastic net can handle multicolinearity and shrinks the less important features to zero. It is a combination of L1 and L2 regularization. It is a linear regression model trained with L1 and L2 prior as regularizer. This combination allows for learning a sparse model where few of the weights are non-zero like Lasso, while still maintaining the regularization properties of Ridge.

In [None]:
# Define the features and target variable
X = merged_data.drop(columns=[
                            'bike_count', 'log_bike_count',
                            'counter_id', 'site_id', 'site_name', 'counter_technical_id',
                            'coordinates',
                            'Station Number', 'Measurement Period Duration',
                            'date', 'Date and Time', 'counter_installation_date',
                    ])



y = merged_data['log_bike_count']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer with OneHotEncoder for 'counter_name' and SimpleImputer for numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['counter_name'])
    ])

# Create a pipeline with the preprocessor, standard scaler, and ElasticNet regression
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features

elasticnet_feature_importance = pd.Series(elasticnet_coefficients, index=feature_names).sort_values(ascending=False)

In [None]:
# Print the feature importances
print(elasticnet_feature_importance)

In [None]:
# Filter features with non-zero importance
non_zero_features = elasticnet_feature_importance[elasticnet_feature_importance != 0].index.tolist()
# Keep only the non-zero features in the merged dataset
# Keep all the variables apart from the columns which are derived from a one hot encoder
non_zero_features = [feature for feature in non_zero_features if not feature.startswith('counter_name_')]
merged_data_filtered = merged_data[['counter_name', 'bike_count', 'log_bike_count'] + non_zero_features]
test_merged_data_filtered = test_merged_data[['counter_name'] + non_zero_features]


# Display the new dataframe
merged_data_filtered

In [None]:
# Subtract 273 from all values in the "Air Temperature (°C)" column
merged_data_filtered.loc[:,'Air Temperature (°C)'] -= 273
test_merged_data_filtered.loc[:,'Air Temperature (°C)'] -= 273
merged_data_filtered
test_merged_data_filtered

In [None]:
from xgboost import XGBRegressor

# Define the features and target variable
X = merged_data_filtered.drop(columns=[
                            'bike_count', 'log_bike_count',
                    ])

y = merged_data_filtered['log_bike_count']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer with OneHotEncoder for 'counter_name' and SimpleImputer for numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['counter_name'])
    ])

# Create a pipeline with the preprocessor, standard scaler (with_mean=False), and XGBRegressor
xgboostpipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False)),
    ('regressor', XGBRegressor())
])

# Fit the XGBRegressor pipeline on the training data
xgboostpipeline.fit(X_train, y_train)

# Print the score of the XGBRegressor model on the test data
print(f"XGBRegressor model score: {xgboostpipeline.score(X_test, y_test)}")

# Output information about the XGBRegressor model
xgboost_feature_importances = xgboostpipeline.named_steps['regressor'].feature_importances_

# Get feature names after preprocessing
feature_names = (xgboostpipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 xgboostpipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features


In [None]:
# Predict the log_bike_count for the test_merged_data_filtered dataframe
y_pred = xgboostpipeline.predict(test_merged_data_filtered)

# Display the dataframe with predictions
y_pred

In [None]:
submission = pd.DataFrame({
    'log_bike_count': y_pred
}).reset_index(drop=True)
submission.index.name = 'Id'

submission.to_csv('/Users/felix/Downloads/test.csv')

