# Automatic Data eng with functions and models

In [14]:

import utils
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy.sparse import hstack
import pandas as pd
from skrub import TableReport
from jours_feries_france import JoursFeries
from vacances_scolaires_france import SchoolHolidayDates
import numpy as np
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [4]:
data = pd.read_parquet("data/train.parquet")
data = data.sort_values(["date", "counter_name"])
data_test = pd.read_parquet("data/final_test.parquet")
data_test = data_test.sort_values(["date", "counter_name"])

In [5]:
external_conditions = pd.read_csv("data/external_data.csv")
external_conditions["date"] = pd.to_datetime(external_conditions["date"])

In [8]:
# Step 1: Sort the `external_conditions` DataFrame by the `date` column
external_conditions = external_conditions.sort_values(by="date")

# Drop columns with more than 40% NaN values
threshold = len(external_conditions) * 0.4
external_conditions = external_conditions.dropna(thresh=threshold, axis=1)

# Step 2: Remove duplicate entries based on the `date` column
external_conditions = external_conditions.drop_duplicates(subset="date")

# Step 3: Convert the 'date' column to datetime
external_conditions["date"] = pd.to_datetime(external_conditions["date"])

# Step 4: Create a complete date range from the minimum to the maximum date in the DataFrame
date_range = pd.date_range(
    start=external_conditions["date"].min(),
    end=external_conditions["date"].max(),
    freq="H",
)

# Step 5: Create a DataFrame from the date_range
date_range_df = pd.DataFrame(date_range, columns=["date"])

# Step 6: Merge the date_range DataFrame with the external_conditions DataFrame on the 'date' column
full_external_conditions = pd.merge(
    date_range_df, external_conditions, on="date", how="left"
)


# Fonction qui fait ce qu'on voulait faire avec ffill et bfill mais a la place prends la valeur la plus proche
def fill_closest_value_all_columns(df):
    """Fill NaN values with the closest value for all numeric columns in the DataFrame."""
    filled_df = df.copy()

    for column in filled_df.columns:
        if filled_df[column].dtype.kind in "biufc":  # Numeric columns
            non_nan_values = filled_df[column].dropna()

            def find_closest(value):
                if pd.isna(value):
                    closest_value = non_nan_values.iloc[
                        (non_nan_values - value).abs().argmin()
                    ]
                    return closest_value
                return value

            filled_df[column] = filled_df[column].apply(find_closest)

    return filled_df


# Apply the function to the DataFrame
filled_external_conditions = fill_closest_value_all_columns(full_external_conditions)

  date_range = pd.date_range(
  (non_nan_values - value).abs().argmin()


In [15]:
# Merge the DataFrames
merged_conditions = pd.merge(data, filled_external_conditions, on="date", how="left")

merged_conditions = utils._column_rename(merged_conditions)


merged_conditions_test = pd.merge(
    data_test, filled_external_conditions, on="date", how="left"
)

merged_conditions_test = utils._column_rename(merged_conditions_test)

In [16]:
# Ensure "date" is in datetime format
merged_conditions["date"] = pd.to_datetime(merged_conditions["date"], errors="coerce")

# Drop rows with invalid datetime entries
df = merged_conditions.dropna(subset=["date"])

# Extract date and time features
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["weekday"] = df["date"].dt.dayofweek
df["day"] = df["date"].dt.day
df["hour"] = df["date"].dt.hour
df["is_weekend"] = (df["weekday"] >= 5).astype(int)

# Handle school and public holidays
unique_dates = df["date"].dt.date.unique()
d = SchoolHolidayDates()
f = JoursFeries()

try:
    dict_school_holidays = {
        date: d.is_holiday_for_zone(date, "C") for date in unique_dates
    }
    df["is_school_holiday"] = (
        df["date"].dt.date.map(dict_school_holidays).fillna(0).astype(int)
    )
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    df["is_school_holiday"] = 0

try:
    dict_public_holidays = {
        date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates
    }
    df["is_public_holiday"] = (
        df["date"].dt.date.map(dict_public_holidays).fillna(0).astype(int)
    )
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    df["is_public_holiday"] = 0

# Ensure "date" is in datetime format
merged_conditions_test["date"] = pd.to_datetime(
    merged_conditions_test["date"], errors="coerce"
)

# Drop rows with invalid datetime entries
df_test = merged_conditions_test.dropna(subset=["date"])

# Extract date and time features
df_test["year"] = df_test["date"].dt.year
df_test["month"] = df_test["date"].dt.month
df_test["weekday"] = df_test["date"].dt.dayofweek
df_test["day"] = df_test["date"].dt.day
df_test["hour"] = df_test["date"].dt.hour
df_test["is_weekend"] = (df_test["weekday"] >= 5).astype(int)

# Handle school and public holidays
unique_dates = df_test["date"].dt.date.unique()
d = SchoolHolidayDates()
f = JoursFeries()

try:
    dict_school_holidays = {
        date: d.is_holiday_for_zone(date, "C") for date in unique_dates
    }
    df_test["is_school_holiday"] = (
        df_test["date"].dt.date.map(dict_school_holidays).fillna(0).astype(int)
    )
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    df_test["is_school_holiday"] = 0

try:
    dict_public_holidays = {
        date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates
    }
    df_test["is_public_holiday"] = (
        df_test["date"].dt.date.map(dict_public_holidays).fillna(0).astype(int)
    )
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    df_test["is_public_holiday"] = 0

In [17]:
start_date_Monpar = "2021-01-25"
end_date_Monpar = "2021-02-23"
start_date_Clichy_NO_SE = "2021-04-09"
end_date_Clichy = "2021-07-20"
start_date_Clichy_SE_NO = "2021-03-23"
start_date_Pompidou = "2021-03-13"
end_date_Pompidou = "2021-04-01"

df["road_work_Monpar_O_E"] = np.where(
    (data["date"] >= start_date_Monpar)
    & (data["date"] <= end_date_Monpar)
    & (data["counter_name"] == "152 boulevard du Montparnasse O-E"),
    1,
    0,
)
df["road_work_Monpar_E_O"] = np.where(
    (df["date"] >= start_date_Monpar)
    & (df["date"] <= end_date_Monpar)
    & (df["counter_name"] == "152 boulevard du Montparnasse E-O"),
    1,
    0,
)
df["road_work_Clichy_NO_SE"] = np.where(
    (df["date"] >= start_date_Clichy_NO_SE)
    & (df["date"] <= end_date_Clichy)
    & (df["counter_name"] == "20 Avenue de Clichy NO-SE"),
    1,
    0,
)
df["road_work_Clichy_SE_NO"] = np.where(
    (df["date"] >= start_date_Clichy_SE_NO)
    & (df["date"] <= end_date_Clichy)
    & (df["counter_name"] == "20 Avenue de Clichy SE-NO"),
    1,
    0,
)
df["road_work_Pompidou_NE_SO"] = np.where(
    (df["date"] >= start_date_Pompidou)
    & (df["date"] <= end_date_Pompidou)
    & (df["counter_name"] == "Voie Georges Pompidou NE-SO"),
    1,
    0,
)
df["road_work_Pompidou_SO_NE"] = np.where(
    (df["date"] >= start_date_Pompidou)
    & (df["date"] <= end_date_Pompidou)
    & (df["counter_name"] == "Voie Georges Pompidou SO-NE"),
    1,
    0,
)

df["road_work"] = (
    df["road_work_Monpar_E_O"]
    + df["road_work_Monpar_O_E"]
    + df["road_work_Clichy_NO_SE"]
    + df["road_work_Clichy_SE_NO"]
    + df["road_work_Pompidou_NE_SO"]
    + df["road_work_Pompidou_SO_NE"]
)
df.drop(
    [
        "road_work_Monpar_E_O",
        "road_work_Monpar_O_E",
        "road_work_Clichy_NO_SE",
        "road_work_Clichy_SE_NO",
        "road_work_Pompidou_NE_SO",
        "road_work_Pompidou_SO_NE",
    ],
    axis=1,
    inplace=True,
)

df["log_bike_count"][
    (df["date"] >= start_date_Monpar)
    & (df["date"] <= end_date_Monpar)
    & (df["counter_name"] == "152 boulevard du Montparnasse E-O")
] = 0
df["log_bike_count"][
    (df["date"] >= start_date_Monpar)
    & (df["date"] <= end_date_Monpar)
    & (df["counter_name"] == "152 boulevard du Montparnasse O-E")
] = 0
df["log_bike_count"][
    (df["date"] >= start_date_Clichy_NO_SE)
    & (df["date"] <= end_date_Clichy)
    & (df["counter_name"] == "20 Avenue de Clichy NO-SE")
] = 0
df["log_bike_count"][
    (df["date"] >= start_date_Clichy_SE_NO)
    & (df["date"] <= end_date_Clichy)
    & (df["counter_name"] == "20 Avenue de Clichy SE-NO")
] = 0
df["log_bike_count"][
    (df["date"] >= start_date_Pompidou)
    & (df["date"] <= end_date_Pompidou)
    & (df["counter_name"] == "Voie Georges Pompidou NE-SO")
] = 0
df["log_bike_count"][
    (df["date"] >= start_date_Pompidou)
    & (df["date"] <= end_date_Pompidou)
    & (df["counter_name"] == "Voie Georges Pompidou SO-NE")
] = 0

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["log_bike_count"][
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["log_bike_count"][
You are setting val

In [19]:
df.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,...,Layer_2_Cloud_Base_Height_(m),year,month,weekday,day,hour,is_weekend,is_school_holiday,is_public_holiday,road_work
0,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,4.0,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,...,1200.0,2020,9,1,1,1,0,0,0,0
1,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,3.0,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,...,1200.0,2020,9,1,1,1,0,0,0,0
2,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719,18 quai de l'Hôtel de Ville,0.0,2020-09-01 01:00:00,2017-07-12,"48.85372,2.35702",Y2H19027732,48.85372,...,1200.0,2020,9,1,1,1,0,0,0,0
3,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719,18 quai de l'Hôtel de Ville,1.0,2020-09-01 01:00:00,2017-07-12,"48.85372,2.35702",Y2H19027732,48.85372,...,1200.0,2020,9,1,1,1,0,0,0,0
4,100063175-353277233,20 Avenue de Clichy NO-SE,100063175,20 Avenue de Clichy,7.0,2020-09-01 01:00:00,2020-07-22,"48.88529,2.32666",Y2H20073268,48.88529,...,1200.0,2020,9,1,1,1,0,0,0,0


In [34]:
X_boruta = df.copy()

In [36]:
X_boruta.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,...,Layer_2_Cloud_Base_Height_(m),year,month,weekday,day,hour,is_weekend,is_school_holiday,is_public_holiday,road_work
0,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,4.0,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,...,1200.0,2020,9,1,1,1,0,0,0,0
1,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,3.0,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,...,1200.0,2020,9,1,1,1,0,0,0,0
2,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719,18 quai de l'Hôtel de Ville,0.0,2020-09-01 01:00:00,2017-07-12,"48.85372,2.35702",Y2H19027732,48.85372,...,1200.0,2020,9,1,1,1,0,0,0,0
3,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719,18 quai de l'Hôtel de Ville,1.0,2020-09-01 01:00:00,2017-07-12,"48.85372,2.35702",Y2H19027732,48.85372,...,1200.0,2020,9,1,1,1,0,0,0,0
4,100063175-353277233,20 Avenue de Clichy NO-SE,100063175,20 Avenue de Clichy,7.0,2020-09-01 01:00:00,2020-07-22,"48.88529,2.32666",Y2H20073268,48.88529,...,1200.0,2020,9,1,1,1,0,0,0,0


In [30]:
#TableReport(X_boruta)

In [37]:
# Check data types
non_numeric_columns = X_boruta.select_dtypes(exclude=["number"]).columns
print("Non-numeric columns:", non_numeric_columns)

Non-numeric columns: Index(['counter_id', 'counter_name', 'site_name', 'date',
       'counter_installation_date', 'coordinates', 'counter_technical_id'],
      dtype='object')


In [38]:
for col in non_numeric_columns:
    le = LabelEncoder()
    X_boruta[col] = le.fit_transform(X_boruta[col])

In [45]:
y = X_boruta["log_bike_count"]
X_boruta = X_boruta.drop(["log_bike_count", "bike_count"], axis=1)

KeyError: 'log_bike_count'

In [46]:
X_boruta.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,...,Layer_2_Cloud_Base_Height_(m),year,month,weekday,day,hour,is_weekend,is_school_holiday,is_public_holiday,road_work
0,20,0,100049407,0,0,7,7,13,48.840801,2.333233,...,1200.0,2020,9,1,1,1,0,0,0,0
1,19,1,100049407,0,0,7,7,13,48.840801,2.333233,...,1200.0,2020,9,1,1,1,0,0,0,0
2,5,2,100036719,1,0,1,15,7,48.85372,2.35702,...,1200.0,2020,9,1,1,1,0,0,0,0
3,4,3,100036719,1,0,1,15,7,48.85372,2.35702,...,1200.0,2020,9,1,1,1,0,0,0,0
4,52,4,100063175,2,0,20,26,23,48.88529,2.32666,...,1200.0,2020,9,1,1,1,0,0,0,0


In [52]:
# BorutaPy accepts numpy arrays only, hence the .values attribute
X_boruta_values = X_boruta.values
y_values = y.values
y_values = y.ravel()

# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators="auto", verbose=2, random_state=42)

  y_values = y.ravel()


In [53]:
# find all relevant features
feat_selector.fit(X_boruta_values, y_values)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	15
Tentative: 	1
Rejected: 	42
Iteration: 	9 / 100
Confirmed: 	15
Tentative: 	1
Rejected: 	42
Iteration: 	10 / 100
Confirmed: 	15
Tentative: 	1
Rejected: 	42
Iteration: 	11 / 100
Confirmed: 	15
Tentative: 	1
Rejected: 	42
Iteration: 	12 / 100
Confirmed: 	15
Tentative: 	1
Rejected: 	42
Iteration: 	13 / 100
Confirmed: 	15
Tentative: 	1
Rejected: 	42
Iteration: 	14 / 100
Confirmed: 	15
Tentative: 	1
Rejected: 	42
Iteration: 	15 / 100
Confirmed: 	15
Tentative: 	1
Rejected: 	42
Iteration: 	16 / 100
Confirmed: 	15
Tentative: 	1
Reject

In [54]:
# check selected features - first 5 features are selected
feat_selector.support_

array([ True,  True,  True,  True,  True, False,  True,  True,  True,
        True, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True, False,  True,
        True, False, False,  True])

In [55]:
feat_selector.ranking_

array([ 1,  1,  1,  1,  1,  2,  1,  1,  1,  1, 42, 17, 19, 11, 33, 27,  1,
       42, 17, 22, 38,  8, 10, 24, 30, 15,  8, 13, 32, 24, 22, 29, 38, 42,
       40, 11, 33, 28,  4, 35, 21, 13,  5, 38, 36, 15,  3,  6,  8, 42,  1,
        1, 19,  1,  1, 26, 30,  1])

In [63]:
X_boruta.columns

Index(['counter_id', 'counter_name', 'site_id', 'site_name', 'date',
       'counter_installation_date', 'coordinates', 'counter_technical_id',
       'latitude', 'longitude', 'Station_Number', 'Sea_Level_Pressure_(hPa)',
       'Pressure_Tendency_(hPa/3h)', 'Pressure_Tendency_Code',
       'Wind_Direction_(°)', 'Wind_Speed_(m/s)', 'Air_Temperature_(°C)',
       'Dew_Point_Temperature_(°C)', 'Relative_Humidity_(%)', 'Visibility_(m)',
       'Present_Weather_Code', 'Past_Weather_Code_1', 'Past_Weather_Code_2',
       'Total_Cloud_Cover_(oktas)', 'Cloud_Base_Height_(m)',
       'Lowest_Cloud_Base_Height_(m)', 'Low_Cloud_Type', 'Medium_Cloud_Type',
       'High_Cloud_Type', 'Station_Level_Pressure_(hPa)',
       '24h_Pressure_Tendency_(hPa)', '10min_Max_Wind_Gust_(m/s)',
       'Max_Wind_Gust_(m/s)', 'Measurement_Period_Duration', 'Ground_State',
       'Snow_Height_(cm)', 'New_Snow_Depth_(cm)',
       'New_Snowfall_Duration_(hours)', 'Rainfall_(1h,_mm)',
       'Rainfall_(3h,_mm)', 'Rain

In [56]:
X_boruta.columns[feat_selector.support_].tolist()

['counter_id',
 'counter_name',
 'site_id',
 'site_name',
 'date',
 'coordinates',
 'counter_technical_id',
 'latitude',
 'longitude',
 'Air_Temperature_(°C)',
 'month',
 'weekday',
 'hour',
 'is_weekend',
 'road_work']

In [None]:
def selecting_features_with_correlations(
    df: pd.DataFrame, features, target: str | None
) -> list:
    """
    Select feature based on correlations.
    Keep features that are correlated with the target.
    Remove features that are correlated with other features.

    Args:
        df (pd.DataFrame): Dataframe with features and target.
        features (List[str] | None): List of features.
        target (str | None): Target column.

    Returns:
        list: Selected features with correlations.
    """
    # Remove features that not correlated with the target
    correlation_with_label = (
        df[features].corr(method="spearman")[target].drop(target)
    )
    threshold_label = 0.05
    selected_features = correlation_with_label[
        correlation_with_label.abs() > threshold_label
    ].index
    correlation_matrix = df[selected_features].corr()
    threshold_features = 0.95
    to_drop = set()
    # Remove features that are correlated with other features
    # Keep the one that is more correlated with the target
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > threshold_features:
                feature_to_drop = (
                    correlation_matrix.columns[i]
                    if abs(correlation_with_label[correlation_matrix.columns[i]])
                    < abs(correlation_with_label[correlation_matrix.columns[j]])
                    else correlation_matrix.columns[j]
                )
                to_drop.add(feature_to_drop)
    selected_features = [f for f in selected_features if f not in to_drop]
    return selected_features

In [112]:
X_corr = X_boruta.merge(y, left_index=True, right_index=True)
features = X_corr.columns
features = list(features)

In [113]:
corr_features = selecting_features_with_correlations(X_corr, features, "log_bike_count")

In [114]:
print(corr_features)

['counter_id', 'counter_name', 'site_id', 'counter_technical_id', 'latitude', 'Air_Temperature_(°C)', 'Dew_Point_Temperature_(°C)', 'Relative_Humidity_(%)', 'Lowest_Cloud_Base_Height_(m)', 'Low_Cloud_Type', '10min_Max_Wind_Gust_(m/s)', 'Ground_State', 'Layer_1_Cloud_Cover_(oktas)', 'Layer_1_Cloud_Type', 'Layer_2_Cloud_Cover_(oktas)', 'Layer_2_Cloud_Base_Height_(m)', 'weekday', 'hour', 'is_weekend', 'is_public_holiday', 'road_work']


In [115]:
def selecting_features_with_random_columns(df: pd.DataFrame, features, target):
    """
    Select features that have less importance than random ones.

    Args:
        df (pd.DataFrame): Dataframe with features and target.
        features (List[str]): List of features.
        target (str): Target column.

    Returns:
        List[str]: Selected features with random columns.
    """
    # Create 5 random columns
    for i in range(5):
        df[f"random_{i}"] = np.random.uniform(
            low=0,
            high=1,
            size=df.shape[0],
        )
    # Combine original features with random features
    all_features = features + [f"random_{i}" for i in range(5)]

    # Initialize and fit the RandomForestClassifier
    rf_classifier = RandomForestRegressor(
        n_estimators=100,
        random_state=42,
    )
    rf_classifier.fit(df[all_features].values, df[target].values)
    # Get feature importances
    feature_importances = rf_classifier.feature_importances_
    feature_importances_df = pd.DataFrame(
        {"feature": all_features, "importance": feature_importances}
    )
    # Select features with importance greater than the maximum random importance
    max_random_importance = feature_importances_df[
        feature_importances_df["feature"].str.contains("random")
    ]["importance"].max()
    selected_features = feature_importances_df[
        feature_importances_df["importance"] > max_random_importance
    ]["feature"].tolist()
    selected_features = [
        feature for feature in selected_features if not feature.startswith("random")
    ]
    return selected_features

In [116]:
X_rand = X_boruta.merge(y, left_index=True, right_index=True)
features = X_boruta.columns
features = list(features)

In [117]:
rand_features = selecting_features_with_random_columns(X_corr, features, "log_bike_count")

In [118]:
rand_features

['counter_id',
 'counter_name',
 'date',
 'coordinates',
 'counter_technical_id',
 'latitude',
 'longitude',
 'Air_Temperature_(°C)',
 'month',
 'weekday',
 'day',
 'hour',
 'is_weekend',
 'road_work']