# Étude de cas - LivraisonCo

## Requirements

In [12]:
import pandas as pd
import functools

## Data cleansing

In [123]:
def baseline_clean(path_list, node_dict):
    """Load traffic-count CSV exports and return one cleaned DataFrame.

    The files are read with ``;`` as delimiter and concatenated. Only the
    rows matching one of the ``node_dict`` entries are kept. Gaps in
    "Débit horaire" and "Taux d'occupation" are filled with
    ``Series.interpolate()``, timestamps are converted to naive
    Europe/Paris time, "Etat trafic" and "Etat arc" are one-hot encoded,
    and the identifier, availability-date and geometry columns are dropped.

    Args:
        path_list: Iterable of CSV sources accepted by ``pandas.read_csv``.
        node_dict: Mapping of a "Libelle" value to a sequence whose first
            two items are the expected "Libelle noeud amont" and
            "Libelle noeud aval" values. A row is kept when it matches all
            three values of at least one entry. An empty mapping selects
            no rows and yields an empty frame.

    Returns:
        A new DataFrame indexed 0..n-1.

    Raises:
        KeyError: If an expected column is missing from the data.
        ValueError: If ``path_list`` is empty (raised by ``pandas.concat``).
    """
    # Reading csv files
    frames = [pd.read_csv(path, delimiter=';') for path in path_list]
    df = pd.concat(frames, ignore_index=True)

    # Filtering by node criteria: OR together one row mask per entry.
    # Starting from an all-False mask keeps an empty node_dict valid
    # instead of failing inside a reduce over an empty sequence.
    keep = pd.Series(False, index=df.index)
    for label, nodes in node_dict.items():
        keep = keep | (
            (df["Libelle"] == label)
            & (df["Libelle noeud amont"] == nodes[0])
            & (df["Libelle noeud aval"] == nodes[1])
        )

    # Take an explicit copy so the column assignments below write to an
    # independent frame rather than to a slice of the concatenated one
    # (this is what triggers pandas' SettingWithCopyWarning).
    df = df[keep].copy()

    # Filling missing values
    # NOTE(review): interpolation runs over the rows in their current order
    # and across all selected segments at once; confirm this is intended,
    # or sort/group by "Libelle" and timestamp first.
    df["Débit horaire"] = df["Débit horaire"].interpolate()
    df["Taux d'occupation"] = df["Taux d'occupation"].interpolate()

    # Converting datetime to local timezone (Paris), then dropping the
    # timezone information so the column holds naive local timestamps.
    timestamps = pd.to_datetime(df['Date et heure de comptage'], utc=True)
    df['Date et heure de comptage'] = (
        timestamps.dt.tz_convert('Europe/Paris').dt.tz_localize(None)
    )

    # One-hot encoding categorical values
    trafic = pd.get_dummies(df["Etat trafic"], prefix="Etat trafic")
    arc = pd.get_dummies(df["Etat arc"], prefix="Etat arc")
    df = pd.concat([df, trafic, arc], axis=1)

    # Dropping irrelevant columns (the two encoded source columns included)
    df = df.drop(columns=[
        "Identifiant arc",
        "Identifiant noeud amont",
        "Identifiant noeud aval",
        "Date debut dispo data",
        "Date fin dispo data",
        "geo_point_2d",
        "geo_shape",
        "Libelle noeud amont",
        "Libelle noeud aval",
        "Etat trafic",
        "Etat arc",
    ])

    return df.reset_index(drop=True)

In [124]:
# Road segments to keep: "Libelle" -> (upstream node label, downstream node label).
node_dict = {
    'AV_Champs_Elysees': (
        'Av_Champs_Elysees-Washington',
        'Av_Champs_Elysees-Berri',
    ),
    'Convention': (
        'Lecourbe-Convention',
        'Convention-Blomet',
    ),
    'Sts_Peres': (
        'Sts_Peres-Voltaire',
        'Sts_Peres-Universite',
    ),
}

# One raw CSV export per street.
path_champs = "/content/drive/MyDrive/CentraleSupelec/3A/BCG_datathon/Data/champs-elysees.csv"
path_convention = "/content/drive/MyDrive/CentraleSupelec/3A/BCG_datathon/Data/convention.csv"
path_sts = "/content/drive/MyDrive/CentraleSupelec/3A/BCG_datathon/Data/sts-peres.csv"
path_list = [
    path_champs,
    path_convention,
    path_sts,
]

# Build the cleaned baseline dataset from the three exports.
df_cleaned = baseline_clean(path_list=path_list, node_dict=node_dict)

In [127]:
# Persist the cleaned dataset as a ';'-separated CSV, without the row index.
df_cleaned.to_csv(
    "/content/drive/MyDrive/CentraleSupelec/3A/BCG_datathon/Data/clean_baseline.csv",
    sep=";",
    index=False,
)

In [125]:
# Display the cleaned DataFrame as the notebook cell output.
df_cleaned

Unnamed: 0,Libelle,Date et heure de comptage,Débit horaire,Taux d'occupation,Etat trafic_Bloqué,Etat trafic_Fluide,Etat trafic_Inconnu,Etat trafic_Pré-saturé,Etat trafic_Saturé,Etat arc_Barré,Etat arc_Invalide
0,AV_Champs_Elysees,2022-01-01 11:00:00,524.0,8.85667,0,1,0,0,0,0,1
1,AV_Champs_Elysees,2022-01-01 07:00:00,491.0,7.71611,0,1,0,0,0,0,1
2,AV_Champs_Elysees,2022-01-01 19:00:00,997.0,46.28889,0,0,0,0,1,0,1
3,AV_Champs_Elysees,2022-01-01 18:00:00,1096.0,46.60222,0,0,0,0,1,0,1
4,AV_Champs_Elysees,2022-01-01 09:00:00,258.0,3.75445,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
27543,Sts_Peres,2022-10-31 14:00:00,428.0,6.23000,0,1,0,0,0,0,1
27544,Sts_Peres,2022-08-01 04:00:00,83.0,0.99222,0,1,0,0,0,0,1
27545,Sts_Peres,2022-08-01 03:00:00,143.0,1.72778,0,1,0,0,0,0,1
27546,Sts_Peres,2022-08-01 02:00:00,190.0,2.22778,0,1,0,0,0,0,1
