In [65]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import csv

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import torch.optim as optim

In [66]:
# 1.1 Importer les données Train et Test

train_clean = pd.read_csv("/home/gabgab/dev/IA/deepLearning/data/Module3/train_clean.csv")

test_clean = pd.read_csv("/home/gabgab/dev/IA/deepLearning/data/Module3/test_clean.csv")


In [67]:
# 1.2 Depuis le Dataframe train, charger les features d'apprentissage dans un array numpy X_alltrain, et les labels (données à prévoir) dans un array numpy y_alltrain

X_alltrain = train_clean.values[:, 2:]
y_alltrain = train_clean.values[:, 1]

In [68]:
# 1.3 Séparer les features et les labels en deux parties (train et dev), en attribuant 10% des exemples aux données de dev. Afficher les nombres de lignes et de colonnes pour les 4 arrays.

X_train, X_dev, y_train, y_dev = train_test_split(X_alltrain, y_alltrain, test_size=0.1, random_state=42)

print("Dimensions de X_train :", X_train.shape)
print("Dimensions de y_train :", y_train.shape)
print("Dimensions de X_dev :", X_dev.shape)
print("Dimensions de y_dev :", y_dev.shape)

Dimensions de X_train : (801, 7)
Dimensions de y_train : (801,)
Dimensions de X_dev : (90, 7)
Dimensions de y_dev : (90,)


In [69]:
# 1.4 Afficher les 10 premières lignes de features et les 10 premiers labels.

print("10 premières lignes de X_train :\n", X_train[:10, :])

print("10 premiers éléments de y_train :\n", y_train[:10])


10 premières lignes de X_train :
 [[3 1 0 2 0 0 4]
 [3 0 0 3 0 0 2]
 [1 1 3 3 0 1 1]
 [3 1 2 0 2 1 1]
 [3 1 0 0 0 1 1]
 [3 1 1 0 0 1 1]
 [1 1 2 3 0 0 1]
 [3 1 2 1 0 1 1]
 [2 0 2 1 0 1 3]
 [3 1 1 0 0 1 1]]
10 premiers éléments de y_train :
 [1 0 0 0 0 0 1 1 1 0]


In [70]:
# 2.1 Définir et instancier une classe Titanic Model avec les caractéristiques suivantes :
#  - Deux couches cachées de 50 neurones.
#  - Deux classes en sortie : Survivant ou non
#  - Des fonctions d'activation RELU
#  - Un dropout paramétrable pour les 2 couches cachées

class TitanicModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        super(TitanicModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

input_size = X_train.shape[1]  # Assuming X_train is a numpy array
hidden_size = 50
output_size = 2  # Assuming binary classification (survived or not)
dropout_rate = 0.5

titanic = TitanicModel(input_size, hidden_size, output_size, dropout_rate)

In [71]:
# 2.2 Définir des paramètres de nombre d'epochs (50) et de learning_rate (0.01)

num_epochs = 50
learning_rate = 0.01

In [72]:
# 2.3 Définir la taille du minibatch à 50. En déduire le nombre de boucles pour chaque epoch.

batch_size = 50
num_batches = len(X_train) // batch_size
num_steps_per_epoch = num_batches + 1 if len(X_train) % batch_size != 0 else num_batches

In [73]:
# 2.4 Définir une fonction de coût de type CrossEntropy

criterion = nn.CrossEntropyLoss()

In [74]:
# 2.5. Définir un optimizer de type Adam, sans oublier le learning rate

optimizer = optim.Adam(titanic.parameters(), learning_rate)

In [75]:
# 3.1 Exécuter l'apprentissage du modèle.
#  - Créer une boucle sur les epochs, qui contient elle-même une boucle sur les minibatchs.
#  - À chaque nouvelle itération sur les epochs, mélanger les données avec la méthode shuffle.
#  - Tous les 5 epochs afficher la valeur de la fonction de cout

loss_function = nn.CrossEntropyLoss()

num_iterations = len(X_train) // batch_size

for epoch in range(num_epochs):
    X_train, y_train = shuffle(X_train, y_train)
    total_loss = 0
    for iteration in range(num_iterations):
        start = iteration * batch_size
        end = (iteration + 1) * batch_size
        inputs = torch.Tensor(X_train[start:end])
        labels = torch.LongTensor(y_train[start:end])
        optimizer.zero_grad()
        outputs = titanic(inputs)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1}: Loss = {total_loss / num_iterations:.4f}")

Epoch 5: Loss = 0.5194
Epoch 10: Loss = 0.4576
Epoch 15: Loss = 0.4323
Epoch 20: Loss = 0.4459
Epoch 25: Loss = 0.4518
Epoch 30: Loss = 0.4369
Epoch 35: Loss = 0.4476
Epoch 40: Loss = 0.4432
Epoch 45: Loss = 0.4334
Epoch 50: Loss = 0.4252


In [76]:
# 3.2 Calculer la précision de la prévision sur les données dev

titanic.eval()

with torch.no_grad():
    dev_inputs = torch.Tensor(X_dev)
    dev_labels = torch.Tensor(y_dev)
    dev_outputs = titanic(dev_inputs)
    _, predicted = torch.max(dev_outputs.data, 1)
    total = dev_labels.size(0)
    correct = (predicted == dev_labels).sum().item()
    accuracy = correct / total

print('Accuracy on dev set: {:.2f}%'.format(accuracy * 100))

Accuracy on dev set: 81.11%


In [92]:
# 3.3 Calculer prévisions sur les données de tests

titanic.eval()
X_test = test_clean.iloc[:, 1:].values

with torch.no_grad():
    inputs_test = torch.Tensor(X_test)
    outputs_test = titanic(inputs_test)
    _, predicted_test = torch.max(outputs_test, 1)

print(predicted_test)

tensor([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
        1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
        0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
        1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
        1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
        1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,

In [93]:
# 3.4 Générer le fichier résultat et l'envoyer sur kaggle. Quel est votre score et votre classement ?

result_df = pd.DataFrame({'PassengerId': test_clean['PassengerId'], 'Survived': predicted_test})
result_df.to_csv('result.csv', index=False)

# Score: 0.77990 / Classement 3428


In [106]:
# 3.5 Exécuter une cross-validation dans une boucle pour trouver les meilleures valeurs de learning rate, de keep_prob et de nombre d'epochs.

from sklearn.model_selection import KFold

learning_rates = [0.001, 0.01, 0.1]
keep_probs = [0.5, 0.7, 0.9]
num_epochs = [50, 100, 150]
num_folds = 5

best_accuracy = 0
best_params = {}

kf = KFold(n_splits=num_folds)

for lr in learning_rates:
    for kp in keep_probs:
        for epochs in num_epochs:
            accuracies = []

            for train_index, dev_index in kf.split(X_alltrain):
                X_train, X_dev = X_alltrain[train_index], X_alltrain[dev_index]
                y_train, y_dev = y_alltrain[train_index], y_alltrain[dev_index]

                model = TitanicModel(input_size, hidden_size, output_size, dropout_rate)
                criterion = nn.CrossEntropyLoss()
                optimizer = optim.Adam(model.parameters(), lr=lr)

                for epoch in range(epochs):
                    model.train()
                    optimizer.zero_grad()

                    outputs = model(torch.Tensor(X_train))
                    loss = criterion(outputs, torch.LongTensor(y_train).type(torch.long))
                    loss.backward()
                    optimizer.step()

                model.eval()
                with torch.no_grad():
                    outputs = model(torch.Tensor(X_dev))
                    _, predicted = torch.max(outputs, 1)
                    y_dev_tensor = torch.from_numpy(y_dev)
                    accuracy = torch.eq(predicted, torch.Tensor(y_dev)).sum().item() / len(y_dev)
                    accuracies.append(accuracy)

            avg_accuracy = sum(accuracies) / len(accuracies)
            print(f"Learning Rate: {lr}, Keep Prob: {kp}, Epochs: {epochs}, Accuracy: {avg_accuracy:.4f}")

            if avg_accuracy > best_accuracy:
                best_accuracy = avg_accuracy
                best_params['learning_rate'] = lr
                best_params['keep_prob'] = kp
                best_params['epochs'] = epochs

print("Best Parameters:")
print(best_params)
print("Best Accuracy:")
print(best_accuracy)


Learning Rate: 0.001, Keep Prob: 0.5, Epochs: 50, Accuracy: 0.7510
Learning Rate: 0.001, Keep Prob: 0.5, Epochs: 100, Accuracy: 0.7880
Learning Rate: 0.001, Keep Prob: 0.5, Epochs: 150, Accuracy: 0.8002
Learning Rate: 0.001, Keep Prob: 0.7, Epochs: 50, Accuracy: 0.7588
Learning Rate: 0.001, Keep Prob: 0.7, Epochs: 100, Accuracy: 0.7891
Learning Rate: 0.001, Keep Prob: 0.7, Epochs: 150, Accuracy: 0.7924
Learning Rate: 0.001, Keep Prob: 0.9, Epochs: 50, Accuracy: 0.7576
Learning Rate: 0.001, Keep Prob: 0.9, Epochs: 100, Accuracy: 0.7913
Learning Rate: 0.001, Keep Prob: 0.9, Epochs: 150, Accuracy: 0.8048
Learning Rate: 0.01, Keep Prob: 0.5, Epochs: 50, Accuracy: 0.7969
Learning Rate: 0.01, Keep Prob: 0.5, Epochs: 100, Accuracy: 0.8059
Learning Rate: 0.01, Keep Prob: 0.5, Epochs: 150, Accuracy: 0.8170
Learning Rate: 0.01, Keep Prob: 0.7, Epochs: 50, Accuracy: 0.8092
Learning Rate: 0.01, Keep Prob: 0.7, Epochs: 100, Accuracy: 0.8025
Learning Rate: 0.01, Keep Prob: 0.7, Epochs: 150, Accuracy