In [66]:
import pandas as pd
from xgboost import XGBRegressor
import holidays
from pathlib import Path
import numpy as np

In [None]:
# Name of the column used as the regression target.
target_col = 'log_bike_count'

# Columns removed from the raw frame to build the feature matrix.
columns_to_drop = [
    'bike_count',
    'log_bike_count',
    'coordinates',
    'counter_id',
    'site_id',
    'counter_installation_date',
    # 'counter_technical_id' and 'site_name' are currently NOT dropped
    # (they were left commented out of this list).
]

def get_model_data(path='data/train.parquet'):
    """Load a parquet file and split it into features and target.

    Parameters
    ----------
    path : str, default 'data/train.parquet'
        Location of the parquet file to read.

    Returns
    -------
    X : pandas.DataFrame
        Rows sorted by ``date`` then ``counter_name``, without the columns
        listed in the module-level ``columns_to_drop``.
    y : array
        Values of the module-level ``target_col`` column, in the same row
        order as ``X``.
    """
    ordered = pd.read_parquet(path).sort_values(['date', 'counter_name'])
    features = ordered.drop(columns=columns_to_drop)
    target = ordered[target_col].values
    return features, target

In [69]:
# Boundaries of the COVID-19 restriction periods encoded by `covid_period`.
# They are built once here rather than on every call, because the function
# is meant to be applied to one timestamp at a time.
_CONFINEMENT_START = pd.Timestamp('2020-10-30')
_CONFINEMENT_END = pd.Timestamp('2020-12-15')
_COUVRE_FEU_2_START = pd.Timestamp('2021-01-16')
_COUVRE_FEU_2_END = pd.Timestamp('2021-06-20')


def covid_period(date):
    """Return an integer code for the restriction period a timestamp falls in.

    Parameters
    ----------
    date : pandas.Timestamp
        Timestamp to classify (may carry an hour component).

    Returns
    -------
    int
        1 for the lockdown (2020-10-30 to 2020-12-15 00:00 inclusive),
        2 for the first curfew (after that, up to but excluding 2021-01-16),
        3 for the second curfew (2021-01-16 to 2021-06-20 00:00 inclusive),
        0 otherwise.

    Notes
    -----
    All boundaries are midnight timestamps. The first curfew is treated as
    running until the second one starts; comparing against
    ``2021-01-15 00:00`` with ``<=`` would leave every later hour of
    2021-01-15 unclassified (code 0) between the two curfews.

    NOTE(review): the last boundary is still midnight of 2021-06-20, so later
    hours of that day return 0 -- confirm whether that day should be included.
    """
    if _CONFINEMENT_START <= date <= _CONFINEMENT_END:
        return 1  # lockdown
    if _CONFINEMENT_END < date < _COUVRE_FEU_2_START:
        return 2  # first curfew (no gap before the second one)
    if _COUVRE_FEU_2_START <= date <= _COUVRE_FEU_2_END:
        return 3  # second curfew
    return 0

def _encode_date(date):
    """Expand the ``date`` column of a frame into calendar and context features.

    Parameters
    ----------
    date : pandas.DataFrame
        Frame holding a datetime ``date`` column. The input is copied, not
        modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``month_day``, ``week_day`` (``day_of_week``
        shifted by one), ``year``, ``month``, ``hour``, ``is_holiday`` and
        ``covid_state`` added, and the ``date`` column removed.
    """
    frame = date.copy()
    stamps = frame['date']

    frame['month_day'] = stamps.dt.day
    frame['week_day'] = stamps.dt.day_of_week + 1
    frame['year'] = stamps.dt.year
    frame['month'] = stamps.dt.month
    frame['hour'] = stamps.dt.hour

    # French public holidays for every year that occurs in the data.
    observed_years = frame['year'].drop_duplicates().values.tolist()
    holiday_dates = set(holidays.country_holidays('FR', years=observed_years))
    frame['is_holiday'] = stamps.dt.date.isin(holiday_dates).astype(int)

    # Restriction-period code, computed one timestamp at a time.
    frame['covid_state'] = stamps.apply(covid_period)

    return frame.drop(columns='date')


In [70]:
def _merge_external_data(X, file_path=None):
    """Attach the external ``t`` column to ``X`` by nearest timestamp.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a datetime ``date`` column. It is not modified.
    file_path : str or pathlib.Path, optional
        CSV file providing ``date`` and ``t`` columns. When omitted, the file
        ``external_data//external_data.csv`` is looked up next to this source
        file, or under the current working directory when ``__file__`` is not
        defined (as in a notebook kernel).

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with an extra ``t`` column, rows in the same order as
        the input. The index is the one produced by ``merge_asof``, not the
        input's index.
    """
    if file_path is None:
        try:
            base_dir = Path(__file__).parent
        except NameError:
            # No `__file__` in an interactive / notebook session.
            base_dir = Path.cwd()
        file_path = base_dir / "external_data//external_data.csv"

    df_ext = pd.read_csv(file_path, parse_dates=["date"])
    # Give the external key the same dtype as X's key: merge_asof can reject
    # key columns whose datetime resolutions differ.
    df_ext["date"] = pd.to_datetime(df_ext["date"]).astype(X["date"].dtype)

    X = X.copy()
    # Remember the incoming row order; merge_asof needs both sides sorted by key.
    X["orig_index"] = np.arange(X.shape[0])
    X = pd.merge_asof(
        X.sort_values("date"),
        df_ext[["date", "t"]].sort_values("date"),
        on="date",
        direction="nearest",
    )
    # Restore the incoming row order and remove the helper column.
    X = X.sort_values("orig_index")
    del X["orig_index"]
    return X

In [71]:
# Load the training features and target, then replace the raw `date` column
# with the engineered date features.
X, y = get_model_data()
X = _encode_date(X)

In [79]:
# NOTE(review): among the cells visible here, `cols_to_encode` is first
# assigned in a later cell, so a top-to-bottom run on a fresh kernel fails
# on this line.
cols_to_encode

Index(['counter_name', 'site_name', 'counter_technical_id', 'month_day',
       'week_day', 'year', 'month', 'hour', 'is_holiday', 'covid_state'],
      dtype='object')

In [72]:
# Keep the coordinates untouched and pass every other column through
# `pd.get_dummies`, dropping the first level of each encoded column.
coordinate_cols = ['latitude', 'longitude']
is_coordinate = X.columns.isin(coordinate_cols)
cols_to_encode = X.columns[~is_coordinate]
encoded_cols = pd.get_dummies(X[cols_to_encode], drop_first=True)
non_encoded_cols = X[coordinate_cols]

In [73]:
# Training matrix: coordinate columns first, then the encoded columns, side by side.
X_new = pd.concat([non_encoded_cols, encoded_cols], axis=1)

In [None]:
# Hyper-parameters of the XGBoost regressor, gathered in one place.
xgb_params = {
    'learning_rate': 0.1,
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
    'tree_method': 'hist',
}
model = XGBRegressor(**xgb_params)

In [75]:
# Fit on the whole training matrix (no hold-out split is made in the cells shown here).
model.fit(X_new, y)

In [76]:
# Load the test set and discard the identifier/metadata columns listed below.
test_columns_to_drop = ['coordinates', 'counter_id', 'site_id', 'counter_installation_date']
X_test = pd.read_parquet('data/final_test.parquet').drop(columns=test_columns_to_drop)
X_test

Unnamed: 0,counter_name,site_name,date,counter_technical_id,latitude,longitude
0,28 boulevard Diderot E-O,28 boulevard Diderot,2021-09-10 01:00:00,Y2H15027244,48.846028,2.375429
1,28 boulevard Diderot E-O,28 boulevard Diderot,2021-09-10 13:00:00,Y2H15027244,48.846028,2.375429
2,28 boulevard Diderot E-O,28 boulevard Diderot,2021-09-10 17:00:00,Y2H15027244,48.846028,2.375429
3,28 boulevard Diderot E-O,28 boulevard Diderot,2021-09-10 19:00:00,Y2H15027244,48.846028,2.375429
4,28 boulevard Diderot E-O,28 boulevard Diderot,2021-09-10 22:00:00,Y2H15027244,48.846028,2.375429
...,...,...,...,...,...,...
51435,254 rue de Vaugirard SO-NE,254 rue de Vaugirard,2021-10-18 11:00:00,Y2H20114504,48.839770,2.301980
51436,254 rue de Vaugirard SO-NE,254 rue de Vaugirard,2021-10-18 15:00:00,Y2H20114504,48.839770,2.301980
51437,254 rue de Vaugirard SO-NE,254 rue de Vaugirard,2021-10-18 17:00:00,Y2H20114504,48.839770,2.301980
51438,254 rue de Vaugirard SO-NE,254 rue de Vaugirard,2021-10-18 18:00:00,Y2H20114504,48.839770,2.301980


In [77]:

# Apply the same feature engineering to the test set as to the training set.
X_test = _encode_date(X_test)
cols_to_encode = X_test.columns[~X_test.columns.isin(['latitude', 'longitude'])]
# Encode every level here (no drop_first): which level is "first" depends on
# the categories present in this frame, so dropping it independently of the
# training frame could remove a different baseline than the one the model saw.
encoded_cols = pd.get_dummies(X_test[cols_to_encode])
non_encoded_cols = X_test[['latitude', 'longitude']]
X_test_new = pd.concat([non_encoded_cols, encoded_cols], axis=1)
# Align on the columns the model was fitted with: training-only columns are
# added as zeros, and columns absent from training (its dropped baseline
# levels, or categories seen only in the test set) are discarded.
X_test_new = X_test_new.reindex(columns=X_new.columns, fill_value=0)

In [78]:
# Predict on the test matrix and write the predictions to a CSV file with a
# 0-based `Id` column.
test_pred = model.predict(X_test_new)

submission_columns = {
    'Id': range(len(test_pred)),
    'log_bike_count': test_pred,
}
test_df = pd.DataFrame(submission_columns)

test_df.to_csv('xgb_raw2.csv', index=False)

In [80]:
import bike_count as bc

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


In [88]:
import pandas as pd

# Small hand-made sample in the same layout as the test set: four rows that
# all belong to one counter and differ only in their timestamp.
n_rows = 4
data_example = {
    "counter_name": ["28 boulevard Diderot E-O"] * n_rows,
    "site_name": ["28 boulevard Diderot"] * n_rows,
    "date": [
        "2021-09-10 01:00:00",
        "2021-09-10 13:00:00",
        "2021-09-10 17:00:00",
        "2021-09-10 19:00:00",
    ],
    "counter_technical_id": ["Y2H15027244"] * n_rows,
    "latitude": [48.846028] * n_rows,
    "longitude": [2.375429] * n_rows,
}

df = pd.DataFrame(data_example)

In [95]:
# Parse the string timestamps and store them as `datetime64[us]`
# (microsecond resolution).
df['date'] = pd.to_datetime(df['date']).astype('datetime64[us]')

In [116]:
# Load features and target through the `bike_count` module.
# NOTE(review): this rebinds `X` and `y`, which earlier cells already use
# for the locally loaded training data.
X, y = bc.get_model_data()

In [118]:
# Run the `merge` step exposed by the `bike_count` module on the features
# and print the first rows of the result (label reads "After merge").
X_merged = bc.merge.transform(X)
print("Après merge :")
print(X_merged.head())

Après merge :
             counter_id                       counter_name    site_id  \
0   100049407-353255860  152 boulevard du Montparnasse E-O  100049407   
30  100049407-353255859  152 boulevard du Montparnasse O-E  100049407   
31  100036719-104036719  18 quai de l'Hôtel de Ville NO-SE  100036719   
32  100036719-103036719  18 quai de l'Hôtel de Ville SE-NO  100036719   
33  100063175-353277233          20 Avenue de Clichy NO-SE  100063175   

                        site_name                date  \
0   152 boulevard du Montparnasse 2020-09-01 01:00:00   
30  152 boulevard du Montparnasse 2020-09-01 01:00:00   
31    18 quai de l'Hôtel de Ville 2020-09-01 01:00:00   
32    18 quai de l'Hôtel de Ville 2020-09-01 01:00:00   
33            20 Avenue de Clichy 2020-09-01 01:00:00   

   counter_installation_date         coordinates counter_technical_id  \
0                 2018-12-07  48.840801,2.333233          Y2H19070373   
30                2018-12-07  48.840801,2.333233          

In [119]:
# Run the `date_encoder` step exposed by the `bike_count` module on the
# merged frame and print the first rows (label reads "After date encoding").
X_date_encoded = bc.date_encoder.transform(X_merged)
print("Après date encoding :")
print(X_date_encoded.head())

Après date encoding :
             counter_id                       counter_name    site_id  \
0   100049407-353255860  152 boulevard du Montparnasse E-O  100049407   
30  100049407-353255859  152 boulevard du Montparnasse O-E  100049407   
31  100036719-104036719  18 quai de l'Hôtel de Ville NO-SE  100036719   
32  100036719-103036719  18 quai de l'Hôtel de Ville SE-NO  100036719   
33  100063175-353277233          20 Avenue de Clichy NO-SE  100063175   

                        site_name counter_installation_date  \
0   152 boulevard du Montparnasse                2018-12-07   
30  152 boulevard du Montparnasse                2018-12-07   
31    18 quai de l'Hôtel de Ville                2017-07-12   
32    18 quai de l'Hôtel de Ville                2017-07-12   
33            20 Avenue de Clichy                2020-07-22   

           coordinates counter_technical_id   latitude  longitude       t  \
0   48.840801,2.333233          Y2H19070373  48.840801   2.333233  285.75   
30  48

In [120]:
from sklearn.compose import ColumnTransformer

# Simulate the transformation with `preprocessor`: call its `fit_transform`
# on the date-encoded frame and print the result (label reads
# "After preprocessing").
X_preprocessed = bc.preprocessor.fit_transform(X_date_encoded)
print("Après preprocessing :")
print(X_preprocessed)

Après preprocessing :
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5961924 stored elements and shape (496827, 201)>
  Coords	Values
  (0, 0)	0.01787946016926909
  (0, 2)	1.0
  (0, 8)	1.0
  (0, 18)	1.0
  (0, 23)	1.0
  (0, 46)	1.0
  (0, 48)	1.0
  (0, 52)	1.0
  (0, 83)	1.0
  (0, 85)	1.0
  (0, 141)	1.0
  (0, 184)	1.0
  (1, 0)	0.01787946016926909
  (1, 2)	1.0
  (1, 8)	1.0
  (1, 18)	1.0
  (1, 23)	1.0
  (1, 46)	1.0
  (1, 48)	1.0
  (1, 52)	1.0
  (1, 83)	1.0
  (1, 86)	1.0
  (1, 141)	1.0
  (1, 184)	1.0
  (2, 0)	0.01787946016926909
  :	:
  (496824, 199)	1.0
  (496825, 0)	0.8268866738077093
  (496825, 4)	1.0
  (496825, 9)	1.0
  (496825, 18)	1.0
  (496825, 45)	1.0
  (496825, 46)	1.0
  (496825, 48)	1.0
  (496825, 60)	1.0
  (496825, 83)	1.0
  (496825, 139)	1.0
  (496825, 170)	1.0
  (496825, 196)	1.0
  (496826, 0)	0.8268866738077093
  (496826, 4)	1.0
  (496826, 9)	1.0
  (496826, 18)	1.0
  (496826, 45)	1.0
  (496826, 46)	1.0
  (496826, 48)	1.0
  (496826, 60)	1.0
  (496826, 83)	1.0
  (4