In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from datetime import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.model_selection import train_test_split

In [2]:
USE_GPU = True
dtype = torch.float32 # we will be using float throughout this tutorial
device = torch.device('cuda') if (USE_GPU and torch.cuda.is_available()) else torch.device('cpu')

# Constant to control how frequently we print train loss
print_every = 100
print('using device:', device)

using device: cpu


In [3]:
try:
  from google.colab import drive
  drive.mount('/content/gdrive', force_remount=True)

  FOLDERNAME = '2A/HACKATHON'
  %cd /content/gdrive/My\ Drive/$FOLDERNAME
except ImportError:
  pass

Mounted at /content/gdrive
/content/gdrive/My Drive/2A/HACKATHON


In [4]:
%pwd

'/content/gdrive/My Drive/2A/HACKATHON'

In [3]:
df_train = pd.read_csv('processed_waiting_times_train.csv')

In [4]:
out_features = max(df_train["WAIT_TIME_IN_2H"]) + 1
print(out_features)

32


In [5]:
y = df_train["WAIT_TIME_IN_2H"]
print(y)

x = df_train
x.drop(["WAIT_TIME_IN_2H"], axis=1, inplace=True)
print(x)

0        6
1        5
2        7
3        2
4        2
        ..
37013    2
37014    4
37015    2
37016    9
37017    4
Name: WAIT_TIME_IN_2H, Length: 37018, dtype: int64
       ADJUST_CAPACITY  DOWNTIME  CURRENT_WAIT_TIME  TIME_TO_PARADE_1  \
0            -0.660639 -0.168584          -0.240147         -1.244963   
1            -0.660639 -0.168584           0.468341          1.069542   
2            -0.527907 -0.168584           0.822585         -1.244963   
3            -0.726608 -0.168584          -0.594390          0.290401   
4            -1.033079 -0.168584          -0.594390         -1.244963   
...                ...       ...                ...               ...   
37013         1.356085 -0.168584           0.468341          0.679971   
37014        -0.527907 -0.168584          -0.594390          0.886214   
37015         1.356085 -0.168584          -0.594390          0.244569   
37016         1.356085 -0.168584           0.468341          0.863299   
37017        -1.033079 -0

In [6]:
in_features = len(x.columns)
print(in_features)

87


In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.01, random_state=0)

In [8]:
x_train = x_train.astype(float)
x_test = x_test.astype(float)
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [9]:
x_train = torch.tensor(x_train.values, dtype=torch.float64, device=device)
x_test = torch.tensor(x_test.values, dtype=torch.float64, device=device)
y_train = torch.tensor(y_train.values, dtype=torch.int64, device=device)
y_test = torch.tensor(y_test.values, dtype=torch.int64, device=device)

In [10]:
y_train_one_hot = F.one_hot(y_train, out_features)
#y_test = F.one_hot(y_test, out_features)
print(y_train.shape)

torch.Size([36647])


In [12]:
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(x_train, y_train_one_hot)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

class MyModel(nn.Module):
    def __init__(self, in_features, out_features):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(in_features, 256)
        self.fc2 = nn.Linear(256, 512)
        self.fc3 = nn.Linear(512, 1024)
        self.fc4 = nn.Linear(1024, 128)
        self.fc5 = nn.Linear(128, out_features)
        self.relu = nn.ReLU()
        self.batchnorm1 = nn.BatchNorm1d(256)
        self.batchnorm2 = nn.BatchNorm1d(512)
        self.batchnorm3 = nn.BatchNorm1d(1024)
        self.batchnorm4 = nn.BatchNorm1d(128)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.dropout(self.relu(self.batchnorm1(self.fc1(x))))
        x = self.dropout(self.relu(self.batchnorm2(self.fc2(x))))
        x = self.dropout(self.relu(self.batchnorm3(self.fc3(x))))
        x = self.dropout(self.relu(self.batchnorm4(self.fc4(x))))
        x = self.fc5(x)
        return x

def train_model(model, train_loader, criterion, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        for inputs, targets in train_loader:
            inputs = inputs.to(device=device, dtype=torch.float)
            targets = targets.to(device=device, dtype=torch.float)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss += 0.02 * torch.linalg.norm((torch.argmax(outputs, dim=1) - torch.argmax(targets, dim=1)).to(dtype=torch.float))
            loss.backward()
            optimizer.step()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Utilisation de la classe MyModel
model = MyModel(in_features, out_features)
model.to(device=device)

weight_decay = 0.01
optimizer = optim.Adam(model.parameters(), lr=5e-4)
criterion = nn.CrossEntropyLoss()

train_model(model, train_loader, criterion, optimizer, num_epochs=40)

Epoch [1/40], Loss: 2.4475
Epoch [2/40], Loss: 2.5781
Epoch [3/40], Loss: 2.2514
Epoch [4/40], Loss: 2.2292
Epoch [5/40], Loss: 2.0337
Epoch [6/40], Loss: 1.8483
Epoch [7/40], Loss: 1.9571
Epoch [8/40], Loss: 1.9098
Epoch [9/40], Loss: 2.1676
Epoch [10/40], Loss: 2.4238
Epoch [11/40], Loss: 2.0765
Epoch [12/40], Loss: 2.0424
Epoch [13/40], Loss: 2.1812
Epoch [14/40], Loss: 2.0702
Epoch [15/40], Loss: 2.2272
Epoch [16/40], Loss: 2.0210
Epoch [17/40], Loss: 2.1954
Epoch [18/40], Loss: 1.7422
Epoch [19/40], Loss: 1.8608
Epoch [20/40], Loss: 2.0615
Epoch [21/40], Loss: 1.9745
Epoch [22/40], Loss: 2.0695
Epoch [23/40], Loss: 1.9387
Epoch [24/40], Loss: 1.9982
Epoch [25/40], Loss: 1.9013
Epoch [26/40], Loss: 1.7959
Epoch [27/40], Loss: 1.9123
Epoch [28/40], Loss: 1.8166
Epoch [29/40], Loss: 1.8556
Epoch [30/40], Loss: 1.9516
Epoch [31/40], Loss: 1.9035
Epoch [32/40], Loss: 2.3007
Epoch [33/40], Loss: 1.8543
Epoch [34/40], Loss: 1.8859
Epoch [35/40], Loss: 1.8466
Epoch [36/40], Loss: 1.8497
E

In [13]:
from sklearn.svm import SVC

class KernelSVM:
    def __init__(self, kernel='rbf', C=2.0, gamma='scale'):
        self.kernel = kernel
        self.C = C
        self.gamma = gamma
        self.model = SVC(kernel=kernel, C=C, gamma=gamma)

    def train(self, X_train, y_train):
        X_train = X_train.detach().numpy()
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        X_test = X_test.detach().numpy()
        return self.model.predict(X_test)

class Identity(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x

model.fc5 = Identity()
model.eval()
x_train = x_train.to(device=device, dtype=torch.float)
x_train_bis = model(x_train)

svm = KernelSVM()
svm.train(x_train_bis, y_train)

In [14]:
from sklearn.ensemble import RandomForestClassifier

class RandomForestClassifierWrapper:
    def __init__(self, n_estimators=1000, max_depth=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)

    def train(self, X_train, y_train):
        X_train = X_train.detach().numpy()
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        X_test = X_test.detach().numpy()
        return self.model.predict(X_test)

class Identity(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x

model.fc5 = Identity()
model.eval()
x_train = x_train.to(device=device, dtype=torch.float)
x_train_bis = model(x_train)

random_forest = RandomForestClassifierWrapper()
random_forest.train(x_train_bis, y_train)

In [15]:
x_test = x_test.to(device=device, dtype=torch.float)
y = torch.tensor(svm.predict(model(x_test)))
print(y[:30])
print(y_test[:30])

rmse = torch.sqrt(torch.sum((y * 5 - y_test * 5)**2) / len(y))
print(rmse)

a = (5 * y - 5 * y_test) ** 2
print(a[:40])

tensor([ 6,  8,  1,  5,  3,  5,  9,  1,  4, 10,  6,  6,  3,  4,  7,  1,  6,  4,
         4,  1,  3,  5,  0,  1,  2,  6, 11,  1,  6,  3])
tensor([ 7,  6,  1,  3,  3,  5,  9,  2,  5, 12,  5,  8,  3,  2,  3,  1,  4,  6,
         4,  3,  4,  5,  3,  3,  3,  7, 11,  6,  7,  3])
tensor(8.5467)
tensor([ 25, 100,   0, 100,   0,   0,   0,  25,  25, 100,  25, 100,   0, 100,
        400,   0, 100, 100,   0, 100,  25,   0, 225, 100,  25,  25,   0, 625,
         25,   0, 100, 225,   0, 625, 400,   0,   0,   0,  25,   0])


In [None]:
x_test = x_test.to(device=device, dtype=torch.float)
y = torch.tensor(random_forest.predict(model(x_test)))
print(y[:30])
print(y_test[:30])

rmse = torch.sqrt(torch.sum((y * 5 - y_test * 5)**2) / len(y))
print(rmse)

a = (5 * y - 5 * y_test) ** 2
print(a[:40])

In [16]:
x_val = pd.read_csv('processed_waiting_times_val.csv')
dates = x_val["DATETIME"]
entities = x_val["ENTITY_DESCRIPTION_SHORT"]
x_val.drop(["DATETIME", "ENTITY_DESCRIPTION_SHORT"], axis=1, inplace=True)
x_val = x_val.astype(float)
x_val = torch.tensor(x_val.values, dtype=torch.float64, device=device)
x_val = x_val.to(device=device, dtype=torch.float)
y_hat = torch.tensor(random_forest.predict(model(x_val)))
clean_y_hat = np.array((y_hat).cpu())*5
print(len(clean_y_hat))

res_df = pd.DataFrame(list(zip(clean_y_hat, entities, dates, ["Validation" for _ in range(len(dates))])), columns=["y_pred", "ENTITY_DESCRIPTION_SHORT", "DATETIME", "KEY"])
res_df.to_csv('res.csv', index=False)

2444


In [None]:
model.eval()
x_test = x_test.to(device=device, dtype=torch.float)
y_hat = torch.argmax(model(x_test), dim=1)
print(y_hat[:40] * 5)
print(y_test[:40] * 5)

def accuracy(y_hat, y_test):
  return torch.sum(y_hat == y_test) / len(y_hat)

print(accuracy(y_hat, y_test))

rmse = torch.sqrt(torch.sum((y_hat * 5 - y_test * 5)**2) / len(y_hat))
print(rmse)
a = (5 * y_hat - 5 * y_test) ** 2
print(a[:40])

tensor([25, 40,  5, 25, 15, 25, 45,  5, 20, 45, 30, 30, 20, 20, 35,  5, 30,  5,
        20,  5, 25, 20,  0, 10, 10, 30, 45, 25, 30, 15, 25, 35, 15, 55, 30, 10,
        30, 30, 30, 30])
tensor([35, 30,  5, 15, 15, 25, 45, 10, 25, 60, 25, 40, 15, 10, 15,  5, 20, 30,
        20, 15, 20, 25, 15, 15, 15, 35, 55, 30, 35, 15, 25, 50, 15, 30, 60, 10,
        30, 30, 35, 30])
tensor(0.3908)
tensor(9.6642)
tensor([100, 100,   0, 100,   0,   0,   0,  25,  25, 225,  25, 100,  25, 100,
        400,   0, 100, 625,   0, 100,  25,  25, 225,  25,  25,  25, 100,  25,
         25,   0,   0, 225,   0, 625, 900,   0,   0,   0,  25,   0])


In [None]:
model.eval()
x_val = pd.read_csv('processed_waiting_times_val.csv')
dates = x_val["DATETIME"]
entities = x_val["ENTITY_DESCRIPTION_SHORT"]
x_val.drop(["DATETIME", "ENTITY_DESCRIPTION_SHORT"], axis=1, inplace=True)
x_val = x_val.astype(float)
x_val = torch.tensor(x_val.values, dtype=torch.float64, device=device)
x_val = x_val.to(device=device, dtype=torch.float)
y_hat = torch.argmax(model(x_val), dim=1)
clean_y_hat = np.array((y_hat).cpu())*5
print(len(clean_y_hat))

res_df = pd.DataFrame(list(zip(clean_y_hat, entities, dates, ["Validation" for _ in range(len(dates))])), columns=["y_pred", "ENTITY_DESCRIPTION_SHORT", "DATETIME", "KEY"])
res_df.to_csv('res.csv', index=False)


2444
