In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from skrub import TableReport
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from vacances_scolaires_france import SchoolHolidayDates
from datetime import date
from jours_feries_france import JoursFeries
import utils

In [2]:
data = pd.read_parquet("data/train.parquet")
# Sort by date first, so that time based cross-validation would produce correct results
data = data.sort_values(["date", "counter_name"])

data_test = pd.read_parquet("data/final_test.parquet")
# Sort by date first, so that time based cross-validation would produce correct results
data_test = data_test.sort_values(["date", "counter_name"])


In [3]:
external_conditions = pd.read_csv('data/external_data.csv')
external_conditions['date'] = pd.to_datetime(external_conditions['date'])

In [4]:
# Drop columns with more than 40% NaN values
threshold = len(external_conditions) * 0.4
external_conditions = external_conditions.dropna(thresh=threshold, axis=1)

# Step 1: Sort the `external_conditions` DataFrame by the `date` column
external_conditions = external_conditions.sort_values(by='date')

# Drop columns with more than 40% NaN values
threshold = len(external_conditions) * 0.4
external_conditions = external_conditions.dropna(thresh=threshold, axis=1)

# Step 2: Remove duplicate entries based on the `date` column
external_conditions = external_conditions.drop_duplicates(subset='date')

# Step 3: Convert the 'date' column to datetime
external_conditions['date'] = pd.to_datetime(external_conditions['date'])

# Step 4: Create a complete date range from the minimum to the maximum date in the DataFrame
date_range = pd.date_range(start=external_conditions['date'].min(), end=external_conditions['date'].max(), freq='H')

# Step 5: Create a DataFrame from the date_range
date_range_df = pd.DataFrame(date_range, columns=['date'])

# Step 6: Merge the date_range DataFrame with the external_conditions DataFrame on the 'date' column
full_external_conditions = pd.merge(date_range_df, external_conditions, on='date', how='left')

# Fonction qui fait ce qu'on voulait faire avec ffill et bfill mais a la place prends la valeur la plus proche
def fill_closest_value_all_columns(df):
    """Fill NaN values with the closest value for all numeric columns in the DataFrame."""
    filled_df = df.copy()
    
    for column in filled_df.columns:
        if filled_df[column].dtype.kind in 'biufc':  # Numeric columns
            non_nan_values = filled_df[column].dropna()
            
            def find_closest(value):
                if pd.isna(value):
                    closest_value = non_nan_values.iloc[(non_nan_values - value).abs().argmin()]
                    return closest_value
                return value
            
            filled_df[column] = filled_df[column].apply(find_closest)
    
    return filled_df

# Apply the function to the DataFrame
filled_external_conditions = fill_closest_value_all_columns(full_external_conditions)

  date_range = pd.date_range(start=external_conditions['date'].min(), end=external_conditions['date'].max(), freq='H')
  closest_value = non_nan_values.iloc[(non_nan_values - value).abs().argmin()]


In [5]:
# Merge the DataFrames
merged_conditions = pd.merge(data, filled_external_conditions, on='date', how='left')

merged_conditions = utils._column_rename(merged_conditions)


merged_conditions_test = pd.merge(data_test, filled_external_conditions, on='date', how='left')

merged_conditions_test = utils._column_rename(merged_conditions_test)

In [6]:
# Ensure "date" is in datetime format
merged_conditions["date"] = pd.to_datetime(merged_conditions["date"], errors="coerce")

# Drop rows with invalid datetime entries
df = merged_conditions.dropna(subset=["date"])

# Extract date and time features
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["weekday"] = df["date"].dt.dayofweek
df["day"] = df["date"].dt.day
df["hour"] = df["date"].dt.hour
df["is_weekend"] = (df["weekday"] >= 5).astype(int)

# Handle school and public holidays
unique_dates = df["date"].dt.date.unique()
d = SchoolHolidayDates()
f = JoursFeries()

try:
    dict_school_holidays = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates}
    df["is_school_holiday"] = df["date"].dt.date.map(dict_school_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    df["is_school_holiday"] = 0

try:
    dict_public_holidays = {date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates}
    df["is_public_holiday"] = df["date"].dt.date.map(dict_public_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    df["is_public_holiday"] = 0

# Ensure "date" is in datetime format
merged_conditions_test["date"] = pd.to_datetime(merged_conditions_test["date"], errors="coerce")

# Drop rows with invalid datetime entries
df_test = merged_conditions_test.dropna(subset=["date"])

# Extract date and time features
df_test["year"] = df_test["date"].dt.year
df_test["month"] = df_test["date"].dt.month
df_test["weekday"] = df_test["date"].dt.dayofweek
df_test["day"] = df_test["date"].dt.day
df_test["hour"] = df_test["date"].dt.hour
df_test["is_weekend"] = (df_test["weekday"] >= 5).astype(int)

# Handle school and public holidays
unique_dates = df_test["date"].dt.date.unique()
d = SchoolHolidayDates()
f = JoursFeries()

try:
    dict_school_holidays = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates}
    df_test["is_school_holiday"] = df_test["date"].dt.date.map(dict_school_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    df_test["is_school_holiday"] = 0

try:
    dict_public_holidays = {date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates}
    df_test["is_public_holiday"] = df_test["date"].dt.date.map(dict_public_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    df_test["is_public_holiday"] = 0

In [7]:
# Drop the columns from the df and df_test dataframes using the columns_to_drop list from the utils file
columns_to_drop = utils.columns_to_drop
df = df.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)

In [8]:
TableReport(df)

Processing column  22 / 22


Unnamed: 0_level_0,counter_name,bike_count,latitude,longitude,log_bike_count,Pressure_Tendency_(hPa/3h),Wind_Speed_(m/s),Air_Temperature_(°C),Relative_Humidity_(%),Visibility_(m),Total_Cloud_Cover_(oktas),Snow_Height_(cm),"Rainfall_(3h,_mm)","Rainfall_(12h,_mm)",year,month,weekday,day,hour,is_weekend,is_school_holiday,is_public_holiday
Unnamed: 0_level_1,counter_name,bike_count,latitude,longitude,log_bike_count,Pressure_Tendency_(hPa/3h),Wind_Speed_(m/s),Air_Temperature_(°C),Relative_Humidity_(%),Visibility_(m),Total_Cloud_Cover_(oktas),Snow_Height_(cm),"Rainfall_(3h,_mm)","Rainfall_(12h,_mm)",year,month,weekday,day,hour,is_weekend,is_school_holiday,is_public_holiday
0.0,152 boulevard du Montparnasse E-O,4.0,48.840801,2.333233,1.6094379124341005,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2020.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0
1.0,152 boulevard du Montparnasse O-E,3.0,48.840801,2.333233,1.3862943611198906,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2020.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0
2.0,18 quai de l'Hôtel de Ville NO-SE,0.0,48.85372,2.35702,0.0,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2020.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0
3.0,18 quai de l'Hôtel de Ville SE-NO,1.0,48.85372,2.35702,0.6931471805599453,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2020.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0
4.0,20 Avenue de Clichy NO-SE,7.0,48.88529,2.32666,2.079441541679836,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2020.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0
,,,,,,,,,,,,,,,,,,,,,,
496822.0,Totem 85 quai d'Austerlitz SE-NO,42.0,48.84201,2.36729,3.7612001156935615,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,3.0,9.0,23.0,0.0,0.0,0.0
496823.0,Totem Cours la Reine E-O,22.0,48.86462,2.31444,3.1354942159291497,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,3.0,9.0,23.0,0.0,0.0,0.0
496824.0,Totem Cours la Reine O-E,32.0,48.86462,2.31444,3.49650756146648,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,3.0,9.0,23.0,0.0,0.0,0.0
496825.0,Voie Georges Pompidou NE-SO,9.0,48.8484,2.27586,2.302585092994046,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,3.0,9.0,23.0,0.0,0.0,0.0

Column,Column name,dtype,Null values,Unique values,Mean,Std,Min,Median,Max
0,counter_name,CategoricalDtype,0 (0.0%),56 (< 0.1%),,,,,
1,bike_count,Float64DType,0 (0.0%),,60.2,87.6,0.0,29.0,1300.0
2,latitude,Float64DType,0 (0.0%),,48.9,0.0186,48.8,48.9,48.9
3,longitude,Float64DType,0 (0.0%),,2.35,0.038,2.27,2.35,2.41
4,log_bike_count,Float64DType,0 (0.0%),,3.08,1.66,0.0,3.4,7.17
5,Pressure_Tendency_(hPa/3h),Float64DType,0 (0.0%),,-40.1,75.2,-750.0,-60.0,620.0
6,Wind_Speed_(m/s),Float64DType,0 (0.0%),,4.31,1.22,0.0,4.6,12.7
7,Air_Temperature_(°C),Float64DType,0 (0.0%),,286.0,4.08,268.0,286.0,307.0
8,Relative_Humidity_(%),Float64DType,0 (0.0%),,77.5,10.2,24.0,79.0,100.0
9,Visibility_(m),Float64DType,0 (0.0%),,11300.0,8540.0,120.0,7000.0,60000.0

Column 1,Column 2,Cramér's V
weekday,is_weekend,1.0
year,month,0.832
month,is_school_holiday,0.709
bike_count,log_bike_count,0.584
latitude,longitude,0.562
counter_name,longitude,0.418
counter_name,latitude,0.397
Air_Temperature_(°C),Relative_Humidity_(%),0.37
"Rainfall_(3h,_mm)","Rainfall_(12h,_mm)",0.361
Air_Temperature_(°C),Snow_Height_(cm),0.34


In [9]:
TableReport(df_test)

Processing column  20 / 20


Unnamed: 0_level_0,counter_name,latitude,longitude,Pressure_Tendency_(hPa/3h),Wind_Speed_(m/s),Air_Temperature_(°C),Relative_Humidity_(%),Visibility_(m),Total_Cloud_Cover_(oktas),Snow_Height_(cm),"Rainfall_(3h,_mm)","Rainfall_(12h,_mm)",year,month,weekday,day,hour,is_weekend,is_school_holiday,is_public_holiday
Unnamed: 0_level_1,counter_name,latitude,longitude,Pressure_Tendency_(hPa/3h),Wind_Speed_(m/s),Air_Temperature_(°C),Relative_Humidity_(%),Visibility_(m),Total_Cloud_Cover_(oktas),Snow_Height_(cm),"Rainfall_(3h,_mm)","Rainfall_(12h,_mm)",year,month,weekday,day,hour,is_weekend,is_school_holiday,is_public_holiday
0.0,152 boulevard du Montparnasse E-O,48.840801,2.333233,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,4.0,10.0,1.0,0.0,0.0,0.0
1.0,152 boulevard du Montparnasse O-E,48.840801,2.333233,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,4.0,10.0,1.0,0.0,0.0,0.0
2.0,18 quai de l'Hôtel de Ville NO-SE,48.85372,2.35702,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,4.0,10.0,1.0,0.0,0.0,0.0
3.0,18 quai de l'Hôtel de Ville SE-NO,48.85372,2.35702,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,4.0,10.0,1.0,0.0,0.0,0.0
4.0,20 Avenue de Clichy NO-SE,48.88529,2.32666,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,4.0,10.0,1.0,0.0,0.0,0.0
,,,,,,,,,,,,,,,,,,,,
51435.0,Totem 85 quai d'Austerlitz SE-NO,48.84201,2.36729,0.0,3.6,288.35,95.0,10000.0,100.0,0.0,1.8,1.8,2021.0,10.0,0.0,18.0,21.0,0.0,0.0,0.0
51436.0,Totem Cours la Reine E-O,48.86462,2.31444,0.0,3.6,288.35,95.0,10000.0,100.0,0.0,1.8,1.8,2021.0,10.0,0.0,18.0,21.0,0.0,0.0,0.0
51437.0,Totem Cours la Reine O-E,48.86462,2.31444,0.0,3.6,288.35,95.0,10000.0,100.0,0.0,1.8,1.8,2021.0,10.0,0.0,18.0,21.0,0.0,0.0,0.0
51438.0,Voie Georges Pompidou NE-SO,48.8484,2.27586,0.0,3.6,288.35,95.0,10000.0,100.0,0.0,1.8,1.8,2021.0,10.0,0.0,18.0,21.0,0.0,0.0,0.0

Column,Column name,dtype,Null values,Unique values,Mean,Std,Min,Median,Max
0,counter_name,CategoricalDtype,0 (0.0%),56 (0.1%),,,,,
1,latitude,Float64DType,0 (0.0%),,48.9,0.0186,48.8,48.9,48.9
2,longitude,Float64DType,0 (0.0%),,2.34,0.0383,2.27,2.35,2.41
3,Pressure_Tendency_(hPa/3h),Float64DType,0 (0.0%),,-39.0,74.7,-410.0,-60.0,680.0
4,Wind_Speed_(m/s),Float64DType,0 (0.0%),,4.01,1.28,0.0,4.6,9.8
5,Air_Temperature_(°C),Float64DType,0 (0.0%),,287.0,2.64,277.0,286.0,299.0
6,Relative_Humidity_(%),Float64DType,0 (0.0%),,78.6,8.72,40.0,79.0,99.0
7,Visibility_(m),Float64DType,0 (0.0%),,11600.0,8900.0,200.0,7000.0,57500.0
8,Total_Cloud_Cover_(oktas),Float64DType,0 (0.0%),,80.1,24.2,0.0,90.0,100.0
9,Snow_Height_(cm),Float64DType,0 (0.0%),,0.0,0.0,,,

Column 1,Column 2,Cramér's V
weekday,is_weekend,1.0
month,day,0.711
latitude,longitude,0.562
counter_name,longitude,0.478
day,is_weekend,0.476
"Rainfall_(3h,_mm)","Rainfall_(12h,_mm)",0.47
Wind_Speed_(m/s),"Rainfall_(3h,_mm)",0.435
counter_name,latitude,0.411
Pressure_Tendency_(hPa/3h),Wind_Speed_(m/s),0.4
Visibility_(m),"Rainfall_(3h,_mm)",0.397


Decide to remove site id, site name and counter id to just keep counter name to reduce complexity and the data as they all provide more or less the same information. Counter is more precise as we will be able to calculate the number of times a counter is used in a given site.

## Model training with Elastic Net (To find the best features)

Elastic net can handle multicolinearity and shrinks the less important features to zero. It is a combination of L1 and L2 regularization. It is a linear regression model trained with L1 and L2 prior as regularizer. This combination allows for learning a sparse model where few of the weights are non-zero like Lasso, while still maintaining the regularization properties of Ridge.

In [None]:
# Define the features and target variable
X = merged_data.drop(columns=[
                            'bike_count', 'log_bike_count',
                            'counter_id', 'site_id', 'site_name', 'counter_technical_id',
                            'coordinates',
                            'Station Number', 'Measurement Period Duration',
                            'date', 'Date and Time', 'counter_installation_date',
                    ])



y = merged_data['log_bike_count']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer with OneHotEncoder for 'counter_name' and SimpleImputer for numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['counter_name'])
    ])

# Create a pipeline with the preprocessor, standard scaler, and ElasticNet regression
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])

# Fit the ElasticNet pipeline on the training data
elasticnet_pipeline.fit(X_train, y_train)

# Print the score of the ElasticNet model on the test data
print(f"ElasticNet model score: {elasticnet_pipeline.score(X_test, y_test)}")

# Output information about the ElasticNet model
elasticnet_coefficients = elasticnet_pipeline.named_steps['regressor'].coef_

# Get feature names after preprocessing
feature_names = (elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 elasticnet_pipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features

elasticnet_feature_importance = pd.Series(elasticnet_coefficients, index=feature_names).sort_values(ascending=False)

In [None]:
# Print the feature importances
print(elasticnet_feature_importance)

In [None]:
# Filter features with non-zero importance
non_zero_features = elasticnet_feature_importance[elasticnet_feature_importance != 0].index.tolist()
# Keep only the non-zero features in the merged dataset
# Keep all the variables apart from the columns which are derived from a one hot encoder
non_zero_features = [feature for feature in non_zero_features if not feature.startswith('counter_name_')]
merged_data_filtered = merged_data[['counter_name', 'bike_count', 'log_bike_count'] + non_zero_features]
test_merged_data_filtered = test_merged_data[['counter_name'] + non_zero_features]


# Display the new dataframe
merged_data_filtered

In [None]:
# Subtract 273 from all values in the "Air Temperature (°C)" column
merged_data_filtered.loc[:,'Air Temperature (°C)'] -= 273
test_merged_data_filtered.loc[:,'Air Temperature (°C)'] -= 273
merged_data_filtered
test_merged_data_filtered

In [None]:
from xgboost import XGBRegressor

# Define the features and target variable
X = merged_data_filtered.drop(columns=[
                            'bike_count', 'log_bike_count',
                    ])

y = merged_data_filtered['log_bike_count']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer with OneHotEncoder for 'counter_name' and SimpleImputer for numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['counter_name'])
    ])

# Create a pipeline with the preprocessor, standard scaler (with_mean=False), and XGBRegressor
xgboostpipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False)),
    ('regressor', XGBRegressor())
])

# Fit the XGBRegressor pipeline on the training data
xgboostpipeline.fit(X_train, y_train)

# Print the score of the XGBRegressor model on the test data
print(f"XGBRegressor model score: {xgboostpipeline.score(X_test, y_test)}")

# Output information about the XGBRegressor model
xgboost_feature_importances = xgboostpipeline.named_steps['regressor'].feature_importances_

# Get feature names after preprocessing
feature_names = (xgboostpipeline.named_steps['preprocessor']
                 .transformers_[0][2].tolist() +  # numerical features
                 xgboostpipeline.named_steps['preprocessor']
                 .transformers_[1][1].get_feature_names_out(['counter_name']).tolist())  # one-hot encoded features


In [None]:
# Predict the log_bike_count for the test_merged_data_filtered dataframe
y_pred = xgboostpipeline.predict(test_merged_data_filtered)

# Display the dataframe with predictions
y_pred

In [None]:
submission = pd.DataFrame({
    'log_bike_count': y_pred
}).reset_index(drop=True)
submission.index.name = 'Id'

submission.to_csv('/Users/felix/Downloads/test.csv')

