In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from datetime import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F


In [3]:
df_weather = pd.read_csv('weather_pca_data.csv')

In [4]:
def convert_date(date_str):
    date_obj = datetime.strptime(date_str, '%Y-%m-%d')
    jour_semaine = date_obj.strftime('%A')  # Jour de la semaine (nom complet)
    mois = date_obj.strftime('%B')  # Mois (nom complet)
    annee = date_obj.year  # Année
    return [jour_semaine, mois, annee]


In [11]:
def process_dataframe(dataframe, factor, is_train=True, datatrain=None):
    if(is_train):
        wait_time_in2h = dataframe['WAIT_TIME_IN_2H'].apply(lambda x: int(x/5))
    
    # Fill NaN values
    MAX_VALUES = dataframe.max()
    MIN_VALUES = dataframe.min()
    if(not(is_train)):
        print(f"I had {MAX_VALUES}")
        MAX_VALUES = datatrain.max()
        MIN_VALUES = datatrain.min()
        print(f"I have {MAX_VALUES}")

    dataframe["TIME_TO_PARADE_1"]   = dataframe["TIME_TO_PARADE_1"].fillna(factor*MAX_VALUES["TIME_TO_PARADE_1"])
    dataframe["TIME_TO_PARADE_2"]   = dataframe["TIME_TO_PARADE_2"].fillna(factor*MAX_VALUES["TIME_TO_PARADE_2"])
    dataframe["TIME_TO_NIGHT_SHOW"] = dataframe["TIME_TO_NIGHT_SHOW"].fillna(factor*MAX_VALUES["TIME_TO_NIGHT_SHOW"])

    dates = dataframe["DATETIME"]
    entities = dataframe["ENTITY_DESCRIPTION_SHORT"]
    if(is_train):
        dataframe.drop(["DATETIME", "ENTITY_DESCRIPTION_SHORT", "WAIT_TIME_IN_2H"], axis=1, inplace=True)
    else:
        dataframe.drop(["DATETIME", "ENTITY_DESCRIPTION_SHORT"], axis=1, inplace=True)

    # Normalization
    dataframe=(dataframe-dataframe.mean())/dataframe.std()

    dataframe["DATETIME"] = dates

    # Merge with weather data
    df_weather = pd.read_csv('weather_pca_data.csv')
    dataframe = pd.merge(dataframe, df_weather, on='DATETIME', how='left')

    # One hot encoding for date and time
    dates = dataframe["DATETIME"]
    times = []
    years = []
    months = []
    days_week = []

    for date in dates:
        day, month, year = convert_date(date[:10])
        years.append(year)
        months.append(month)
        days_week.append(day)
        time = date[11:16]
        times.append(int(time[:2]) * 4 + int(time[3:5]) // 15)

    times = np.array(times) - np.min(times)

    encoded_years = pd.get_dummies(years)
    encoded_months = pd.get_dummies(months)
    encoded_days_week = pd.get_dummies(days_week)
    encoded_times = pd.get_dummies(times)

    # One hot encoding for entities
    encoded_entities = pd.get_dummies(entities)

    dataframe = pd.concat([dataframe, encoded_years, encoded_months, encoded_days_week, encoded_times, encoded_entities], axis=1)

    if(is_train):
        dataframe = pd.concat([dataframe, wait_time_in2h], axis=1)
        dataframe.drop(["DATETIME"], axis=1, inplace=True)
    else:
        dataframe = pd.concat([dataframe, entities], axis=1)

     # Identifier les lignes avec des valeurs NaN
    nan_rows = dataframe[dataframe.isna().any(axis=1)]
    print("Lignes avec des valeurs NaN :", nan_rows.index.tolist())
    
    return dataframe


In [15]:
df_train = pd.read_csv('for_students/waiting_times_train.csv')
df_train_processed = process_dataframe(df_train, 1)
df_train_processed.to_csv('processed_waiting_times_train.csv', index=False)
df_train_processed

Lignes avec des valeurs NaN : []


Unnamed: 0,ADJUST_CAPACITY,DOWNTIME,CURRENT_WAIT_TIME,TIME_TO_PARADE_1,TIME_TO_PARADE_2,TIME_TO_NIGHT_SHOW,PC1,PC2,PC3,PC4,...,42,43,44,45,46,47,Flying Coaster,Pirate Ship,Water Ride,WAIT_TIME_IN_2H
0,-0.660639,-0.168584,-0.240147,1.003495,0.394584,1.001307,-1.703404,-0.561189,-0.521895,0.099553,...,False,False,False,False,False,False,False,False,True,6
1,-0.660639,-0.168584,0.468341,0.212115,0.394584,-0.340547,-0.983224,-3.370968,1.688497,0.822912,...,False,False,False,False,False,False,False,False,True,5
2,-0.527907,-0.168584,0.822585,1.003495,0.394584,1.001307,1.971007,0.491093,-0.446060,-1.375623,...,False,False,False,False,False,False,False,True,False,7
3,-0.726608,-0.168584,-0.594390,-1.857648,0.394584,-1.507377,-1.153047,-1.460163,0.761943,0.267117,...,False,True,False,False,False,False,False,True,False,2
4,-1.033079,-0.168584,-0.594390,1.003495,0.394584,1.001307,0.644351,2.261365,0.670276,0.539488,...,False,False,False,False,False,False,False,True,False,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37013,1.356085,-0.168584,0.468341,-0.822766,0.394584,-0.515571,2.474459,-0.242590,-1.250092,0.433454,...,False,False,False,False,False,False,True,False,False,2
37014,-0.527907,-0.168584,-0.594390,-0.274888,-1.504219,-0.690596,1.451294,-0.740837,-0.309993,-0.850829,...,False,False,False,False,False,False,False,True,False,4
37015,1.356085,-0.168584,-0.594390,-1.979399,-4.099250,-1.740743,-3.211167,-0.929818,-1.205296,-0.270431,...,False,True,False,False,False,False,True,False,False,2
37016,1.356085,-0.168584,0.468341,-0.335763,0.394584,-0.632254,0.588498,2.259265,2.034889,-0.714293,...,False,False,False,False,False,False,True,False,False,9


In [16]:
df_val = pd.read_csv('for_students/waiting_times_X_test_val.csv')
df_val_processed = process_dataframe(df_val, 1, is_train=False, datatrain=df_train)
df_val_processed.to_csv('processed_waiting_times_val.csv', index=False)
df_val_processed

I had DATETIME                    2022-08-16 20:45:00
ENTITY_DESCRIPTION_SHORT             Water Ride
ADJUST_CAPACITY                           756.0
DOWNTIME                                     14
CURRENT_WAIT_TIME                            90
TIME_TO_PARADE_1                          510.0
TIME_TO_PARADE_2                          255.0
TIME_TO_NIGHT_SHOW                        840.0
dtype: object
I have ADJUST_CAPACITY       756.0
DOWNTIME               14.0
CURRENT_WAIT_TIME     155.0
TIME_TO_PARADE_1      570.0
TIME_TO_PARADE_2      235.0
TIME_TO_NIGHT_SHOW    840.0
dtype: float64
Lignes avec des valeurs NaN : []


Unnamed: 0,ADJUST_CAPACITY,DOWNTIME,CURRENT_WAIT_TIME,TIME_TO_PARADE_1,TIME_TO_PARADE_2,TIME_TO_NIGHT_SHOW,DATETIME,PC1,PC2,PC3,...,42,43,44,45,46,47,Flying Coaster,Pirate Ship,Water Ride,ENTITY_DESCRIPTION_SHORT
0,-0.640132,-0.149953,-0.366211,0.111886,-0.778230,0.255583,2019-11-23 10:45:00,-0.723665,3.844662,0.243095,...,False,False,False,False,False,False,False,False,True,Water Ride
1,-1.018377,-0.149953,1.415277,0.931726,0.359178,0.926896,2022-01-03 16:45:00,-0.451292,2.017454,0.058099,...,False,False,False,False,False,False,False,True,False,Pirate Ship
2,-0.607941,-0.149953,1.058979,0.931726,0.359178,0.926896,2021-12-04 15:30:00,-1.309027,1.426940,0.922631,...,False,False,False,False,False,False,False,True,False,Pirate Ship
3,-0.640132,-0.149953,-0.722508,-0.518761,0.359178,-1.087041,2020-02-05 13:15:00,-0.871256,-1.400221,-1.044924,...,False,False,False,False,False,False,False,False,True,Water Ride
4,1.408022,-0.149953,0.702682,-0.897149,0.359178,-0.598814,2022-05-13 15:15:00,2.677412,-1.899325,-0.008500,...,False,False,False,False,False,False,True,False,False,Flying Coaster
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2439,-0.640132,-0.149953,0.346384,-0.771019,-2.271078,-0.720871,2019-10-29 14:45:00,-0.326194,0.042633,-0.088452,...,False,False,False,False,False,False,False,False,True,Water Ride
2440,-0.640132,-0.149953,-0.722508,-1.212472,-3.017502,-1.636296,2019-10-08 16:30:00,0.611443,1.630446,-1.367938,...,False,False,False,False,False,False,False,False,True,Water Ride
2441,-0.640132,-0.149953,-0.366211,-0.834084,0.359178,-0.537786,2022-04-09 15:00:00,-0.025587,-0.741237,-0.984157,...,False,False,False,False,False,False,False,False,True,Water Ride
2442,1.408022,-0.149953,0.346384,-0.897149,0.359178,-0.598814,2022-06-16 15:15:00,4.050900,-1.375022,0.451623,...,False,False,False,False,False,False,True,False,False,Flying Coaster
