In [11]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from typing import Callable

In [12]:
# Load the training set from the local data/ folder.
train = pd.read_parquet(Path("data", "train.parquet"))

In [13]:
# Load the final test set from the local data/ folder.
test = pd.read_parquet(Path("data", "final_test.parquet"))

In [14]:
# Non-null count per column of the test set.
test.count()

counter_id                   51440
counter_name                 51440
site_id                      51440
site_name                    51440
date                         51440
counter_installation_date    51440
coordinates                  51440
counter_technical_id         51440
latitude                     51440
longitude                    51440
dtype: int64

In [15]:
# Earliest timestamp in the training set.
train["date"].min()

Timestamp('2020-09-01 01:00:00')

In [16]:
# Latest timestamp in the training set.
train["date"].max()

Timestamp('2021-09-09 23:00:00')

In [17]:
# Earliest timestamp in the test set (compare with the end of the training set).
test["date"].min()

Timestamp('2021-09-10 01:00:00')

In [18]:
def encode_dates(df: pd.DataFrame):
    """Return a copy of ``df`` with calendar features derived from its ``date`` column.

    Adds ``year``, ``month``, ``day``, ``weekday``, ``hour`` and the ISO ``week``
    number, in that order. The input frame is left untouched and the original
    ``date`` column is kept in the result.
    """
    encoded = df.copy()
    stamps = encoded["date"].dt

    # Plain datetime attributes map one-to-one onto same-named columns.
    for part in ("year", "month", "day", "weekday", "hour"):
        encoded[part] = getattr(stamps, part)

    # The week number comes from the ISO calendar rather than a direct attribute.
    encoded["week"] = stamps.isocalendar().week

    return encoded

In [19]:
df_train = encode_dates(df=train)

# 2020 window: from the first training record up to 2020-09-09 23:00.
period_start_2020 = df_train["date"].min()
period_end_2020 = pd.to_datetime("2020-09-09 23:00:00")
in_period_2020 = df_train["date"].between(period_start_2020, period_end_2020)

# 2021 window: from 2021-09-01 01:00 up to the last training record.
period_start_2021 = pd.to_datetime("2021-09-01 01:00:00")
period_end_2021 = df_train["date"].max()
in_period_2021 = df_train["date"].between(period_start_2021, period_end_2021)

df_period_2020 = df_train.loc[in_period_2020].copy()
df_period_2021 = df_train.loc[in_period_2021].copy()

# Only the counter identifier and the target are needed for the comparison.
compared_columns = ["counter_id", "log_bike_count"]
df_reduced_2020 = df_period_2020[compared_columns]
df_reduced_2021 = df_period_2021[compared_columns]


In [20]:
# 2021 mean of log_bike_count expressed as a percentage of the 2020 mean.
# This compares means of the log-transformed target, not of raw bike counts.
mean_log_2021 = df_reduced_2021["log_bike_count"].mean()
mean_log_2020 = df_reduced_2020["log_bike_count"].mean()
growth = mean_log_2021 * 100 / mean_log_2020
print(growth)

99.48693046012367


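Les chiffres sont en fait sensiblement les mêmes : sur la période du 1er au 9 septembre, la moyenne de `log_bike_count` en 2021 représente environ 99,5 % de celle de 2020. Il y a donc même légèrement moins de vélos en circulation début septembre 2021 que début septembre 2020.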

In [21]:
# Time span covered by the test set (these names are reused from the earlier cell).
test_dates = test["date"]
period_start_2021, period_end_2021 = test_dates.min(), test_dates.max()
print(period_start_2021, period_end_2021)

2021-09-10 01:00:00 2021-10-18 21:00:00


In [22]:
# Same calendar window as the test set, shifted back by one year so that it
# falls inside the training data. Derived from the test dates instead of
# hard-coded so it stays correct if the test file changes.
one_year = pd.DateOffset(years=1)
period_start_2020 = test["date"].min() - one_year
period_end_2020 = test["date"].max() - one_year

In [23]:
# Training rows whose timestamp lies in the 2020 window defined above (bounds inclusive).
in_window_2020 = train["date"].between(period_start_2020, period_end_2020)
train_period = train.loc[in_window_2020].copy()
train_period.count()

counter_id                   50381
counter_name                 50381
site_id                      50381
site_name                    50381
bike_count                   50381
date                         50381
counter_installation_date    50381
coordinates                  50381
counter_technical_id         50381
latitude                     50381
longitude                    50381
log_bike_count               50381
dtype: int64

In [39]:
def _slot_key(dates: pd.Series) -> pd.Series:
    """Format timestamps as a year-independent ``"MM-DD HH"`` calendar-slot key."""
    return dates.dt.strftime("%m-%d %H")


def prediction(df_train: pd.DataFrame, df_test: pd.DataFrame) -> pd.DataFrame:
    """Naive baseline: reuse the value observed in the same calendar slot.

    Each test row is matched to the training row of the same counter with the
    same month, day and hour (the year is ignored).

    Parameters
    ----------
    df_train : pd.DataFrame
        Must contain ``date`` (datetime), ``counter_id`` and ``log_bike_count``.
    df_test : pd.DataFrame
        Must contain ``date`` (datetime) and ``counter_id``.

    Returns
    -------
    pd.DataFrame
        Columns ``key``, ``counter_id`` and ``log_bike_count``, with exactly one
        row per row of ``df_test``, in the same order, on a fresh 0..n-1 index.
        Test rows without a matching training observation get ``0``.
    """
    history = df_train[["date", "counter_id", "log_bike_count"]].assign(
        key=_slot_key(df_train["date"])
    )
    # The key drops the year, so a slot can appear once per year of training
    # data. Keep only the most recent observation per (counter, slot);
    # otherwise the left merge below would emit several rows for one test row
    # and break the one-to-one alignment with the test set.
    history = (
        history.sort_values("date", kind="stable")
        .drop_duplicates(subset=["counter_id", "key"], keep="last")
        .loc[:, ["key", "counter_id", "log_bike_count"]]
    )

    targets = df_test[["counter_id"]].assign(key=_slot_key(df_test["date"]))
    targets = targets[["key", "counter_id"]]

    df_test_preds = pd.merge(
        left=targets,
        right=history,
        on=["counter_id", "key"],
        how="left",
        validate="many_to_one",  # fail loudly if the right side is not unique per key
    )

    # Unmatched test rows fall back to 0.
    df_test_preds["log_bike_count"] = df_test_preds["log_bike_count"].fillna(0)

    return df_test_preds

In [25]:
# Baseline predictions for the test set.
# NOTE(review): `prediction` is defined in a cell with a later execution count (39)
# than this one (25); re-run the notebook to confirm the outputs below are current.
df_test_preds = prediction(df_test=test, df_train=train)

In [26]:
# Preview the first ten predictions.
df_test_preds.head(10)

Unnamed: 0,key,counter_id,log_bike_count
0,09-10 01,100007049-102007049,0.693147
1,09-10 13,100007049-102007049,1.386294
2,09-10 17,100007049-102007049,2.079442
3,09-10 19,100007049-102007049,1.098612
4,09-10 22,100007049-102007049,0.0
5,09-11 00,100007049-102007049,0.0
6,09-11 01,100007049-102007049,0.0
7,09-11 03,100007049-102007049,0.693147
8,09-11 04,100007049-102007049,0.693147
9,09-11 06,100007049-102007049,1.609438


In [27]:
# Number of test rows left without a prediction.
# NOTE(review): `prediction` as defined in this notebook ends with `fillna(0)`, so a
# fresh run should give 0 here; the recorded non-zero value presumably comes from
# an earlier version of the function — confirm by re-running.
df_test_preds["log_bike_count"].isnull().sum()

np.int64(1866)

In [28]:
# Non-null count per column; a lower figure for log_bike_count means missing predictions.
df_test_preds.count()

key               51440
counter_id        51440
log_bike_count    49574
dtype: int64

In [29]:
# Order the rows by calendar slot, then back-fill every gap with the value of
# the next row in that order (which may belong to a different counter).
df_test_preds = df_test_preds.sort_values(by=["key"], ascending=True).bfill()
df_test_preds["log_bike_count"].isnull().sum()  # not the last expression, so never displayed
df_test_preds.count()

key               51440
counter_id        51440
log_bike_count    51440
dtype: int64

In [30]:
# After the sort, the index still carries each row's label from before sorting.
print(df_test_preds.head())

            key           counter_id  log_bike_count
0      09-10 01  100007049-102007049        0.693147
2771   09-10 01  100036718-103036718        1.609438
42536  09-10 01  100057380-104057380        0.693147
23011  09-10 01         100056223-SC        0.693147
23933  09-10 01  100056226-104056226        0.693147


In [31]:
# Keep only the prediction column; the row labels are exported under the name "Id".
df_submition = df_test_preds.loc[:, ["log_bike_count"]].copy().rename_axis("Id")
df_submition.head()

Unnamed: 0_level_0,log_bike_count
Id,Unnamed: 1_level_1
0,0.693147
2771,1.609438
42536,0.693147
23011,0.693147
23933,0.693147


In [32]:
# Write to the current user's Downloads folder rather than a machine-specific absolute path.
df_submition.to_csv(Path.home() / "Downloads" / "naive_baseline_2.csv", index=True)

In [40]:
def submition(
    df_train: pd.DataFrame,
    df_test: pd.DataFrame,
    prediction: Callable[[pd.DataFrame, pd.DataFrame], pd.DataFrame],
    csv_file_name: str,
    output_dir: "str | Path | None" = None,
) -> None:
    """Run a prediction function, validate its output and write a submission CSV.

    Parameters
    ----------
    df_train, df_test : pd.DataFrame
        Passed to ``prediction`` as the keyword arguments ``df_train`` and ``df_test``.
    prediction : callable
        Returns a DataFrame with a ``log_bike_count`` column holding one
        prediction per row of ``df_test``.
    csv_file_name : str
        File name without the ``.csv`` extension.
    output_dir : str or Path, optional
        Destination folder, created if it does not exist. Defaults to the
        current user's ``Downloads`` folder.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of test rows, or
        if any prediction is missing.
    """
    df_test_preds = prediction(df_test=df_test, df_train=df_train)

    # One prediction per test row is expected; derive the count from the data
    # rather than hard-coding the size of one particular test file.
    expected_rows = len(df_test)
    if df_test_preds.shape[0] != expected_rows:
        raise ValueError("The DataFrame does not have the right number of rows.")
    if df_test_preds["log_bike_count"].isnull().any():
        raise ValueError("The DataFrame contains missing values. Handle them before submission.")

    target_dir = Path.home() / "Downloads" if output_dir is None else Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    df_submition = df_test_preds[["log_bike_count"]].copy()
    # The index is written as the first CSV column under the header "Id".
    df_submition.index.name = "Id"
    df_submition.to_csv(target_dir / f"{csv_file_name}.csv", index=True)

In [41]:
# End-to-end run: compute the baseline predictions, validate them and write test_function.csv.
submition(df_train=train, df_test=test, prediction=prediction, csv_file_name="test_function")