In [26]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset-train2/train.csv
/kaggle/input/dataset-test2/test.csv


In [27]:
import torch
from torch import Tensor
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.decomposition import PCA
from torch.utils.data import TensorDataset, DataLoader
from scipy.stats import uniform, randint, loguniform
import warnings
warnings.filterwarnings("ignore")

## Preprocessing

In [28]:
data_train = pd.read_csv("/kaggle/input/dataset-train2/train.csv")
data_test = pd.read_csv("/kaggle/input/dataset-test2/test.csv")

X_train = data_train.iloc[:, 6:]
X_test = data_test.iloc[:, 5:]

# Encodage One-Hot sur les colonnes catégorielles
classifiers_train = data_train.iloc[:, 1:4]
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore') 
data_train_oh = enc.fit_transform(classifiers_train)

# Appliquer la même transformation sur data_test
classifiers_test = data_test.iloc[:, 1:4]
data_test_oh = enc.transform(classifiers_test)

# Convertir les résultats en DataFrame
data_train_oh = pd.DataFrame(data_train_oh)
data_test_oh = pd.DataFrame(data_test_oh)

# Concaténer One-Hot avec les autres caractéristiques
X_new_train = pd.concat([data_train_oh, X_train], axis=1)
X_new_test = pd.concat([data_test_oh, X_test], axis=1)

X_new_train.columns = X_new_train.columns.astype(str)
X_new_test.columns = X_new_test.columns.astype(str)

y = data_train["PURITY"] / 100

# Diviser les données en train et validation
seed = 43
X_train, X_valid, y_train, y_valid = train_test_split(X_new_train, y, test_size=0.2, random_state=42)

# Standardisation des données (avant PCA)
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_valid_standardized = scaler.transform(X_valid)
X_test_standardized = scaler.transform(X_new_test)

# PCA sur les données standardisées
pca = PCA(n_components=42)  # 95% variance expliquée
X_train_pca = pca.fit_transform(X_train_standardized)
X_valid_pca = pca.transform(X_valid_standardized)
X_test_pca = pca.transform(X_test_standardized)

# Convertir en tensors PyTorch
X_train_tensor = torch.tensor(X_train_pca, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
X_valid_tensor = torch.tensor(X_valid_pca, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.float32).reshape(-1, 1)
X_test_tensor = torch.tensor(X_test_pca, dtype=torch.float32)

# Vérifier les tailles
print("X_train_tensor size:", X_train_tensor.size())
print("X_valid_tensor size:", X_valid_tensor.size())
print("X_test_tensor size:", X_test_tensor.size())


X_train_tensor size: torch.Size([1040, 16])
X_valid_tensor size: torch.Size([260, 16])
X_test_tensor size: torch.Size([608, 16])


## Neuron class

In [29]:
# Définir le modèle de réseau de neurones simple

class MyLoss(nn.Module):  # Hérite de nn.Module
    def __init__(self, reduction: str = 'mean'):
        super(MyLoss, self).__init__()
        self.reduction = reduction

    def forward(self, output: Tensor, target: Tensor) -> Tensor:
        # Calcul de la perte personnalisée
        loss = torch.maximum(torch.zeros_like(output), (output - target) - 0.05) + \
               torch.maximum(torch.zeros_like(target), (target - output) - 0.05)

        # Applique la réduction spécifiée
        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:  # 'none'
            return loss

class FeedForwardNN(nn.Module):

    def __init__(self, input_size, lin_layer_sizes,outpout_size, lin_layer_dropouts, activation):

        super().__init__()

        if activation == 0:
            self.activation = nn.ReLU()
        elif activation == 1:
            self.activation = nn.SiLU()
        elif activation == 2:
            self.activation = nn.Tanh()
        elif activation == 3:
            self.activation = nn.LeakyReLU()

        current_input_size = input_size

        # Création des couches linéaires en fonction des tailles spécifiées
        self.lin_layers = nn.ModuleList()
        for layer_size in lin_layer_sizes:
            self.lin_layers.append(nn.Linear(current_input_size, layer_size))
            current_input_size = layer_size  # Mise à jour de current_input_size

        # Couche de sortie : doit correspondre à la sortie de la dernière couche linéaire
        self.outpout_layer = nn.Linear(lin_layer_sizes[-1], outpout_size)
        
        # Création des couches de Dropout
        self.dropout_layers = nn.ModuleList([nn.Dropout(rate) for rate in lin_layer_dropouts])
    
    def forward(self, x):

        for lin_layer, dropout_layer in zip(self.lin_layers, self.dropout_layers):
            x = lin_layer(x)
            x = self.activation(x)
            x = dropout_layer(x)
        x = self.outpout_layer(x)
        x = nn.Sigmoid()(x)

        return x

# Définir la classe NeuralNetRegressor

class NeuralNetRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, input_size, random_state, eta=0.001, max_epochs=100, batch=10, lin_layer_sizes = [50, 50],
                 outpout_size = 1, lin_layer_dropouts = [0.4, 0.4], activation = 0):

        self.input_size = input_size
        self.random_state = random_state
        self.eta = eta
        self.max_epochs = max_epochs
        self.batch = batch
        self.lin_layer_sizes = lin_layer_sizes
        self.outpout_size = outpout_size
        self.lin_layer_dropouts = lin_layer_dropouts
        self.activation = activation
        self.model = FeedForwardNN(input_size, lin_layer_sizes,outpout_size, lin_layer_dropouts, activation)
        self.criterion = nn.L1Loss()

    def fit(self, X, y, do_print=False):
        #optimizer = optim.Adam(self.model.parameters(), lr=self.eta, weight_decay=1e-4)
        optimizer = torch.optim.SGD(self.parameters(), lr=self.eta, momentum=0.9, weight_decay=1e-4)
        X_tensor = torch.tensor(X).clone().detach().float()
        y_tensor = torch.tensor(y).clone().detach().float()
        dataset = TensorDataset(X_tensor, y_tensor)
        dataloader = DataLoader(dataset, batch_size=self.batch, shuffle=True)
        self.model.train()

        # Training loop
        for epoch in range(self.max_epochs):
            epoch_loss = 0.0
            for batch_X, batch_y in dataloader:
                optimizer.zero_grad()  # Reset gradients
                outputs = self.model(batch_X)  # Forward pass
                loss = self.criterion(outputs, batch_y)  # Compute loss
                loss.backward()  # Backward pass
                optimizer.step()  # Update parameters
                epoch_loss += loss.item()

                del batch_X, batch_y
            
            if do_print:
                print(f"Epoch {epoch+1}/{self.max_epochs}, Loss: {epoch_loss / len(dataloader)}")

        return self

    def predict(self, X):

        self.model.eval()
        X_tensor = torch.tensor(X, dtype=torch.float32)
        with torch.no_grad():
            outputs = self.model(X_tensor).flatten()

        return outputs.numpy()    

    def parameters(self):
        return self.model.parameters()

## RandomSearch

In [30]:
# Initialiser le modèle

np.random.seed(seed)
torch.manual_seed(seed)
input_size = X_train_tensor.shape[1]
net = NeuralNetRegressor(input_size=input_size, random_state=seed)

# Définir les paramètres pour GridSearch
params_dist = {

    'eta': loguniform(1e-4, 1e-1),
    'max_epochs': randint(200, 500),
    'batch': randint(32, 70),
    'lin_layer_sizes': [[randint.rvs(32, 128) for _ in range(randint.rvs(1, 5))]],  
    'lin_layer_dropouts': [[uniform.rvs(0, 0.5) for _ in range(randint.rvs(1, 5))]],
    'activation': randint(0, 4),
}

# Initialiser RandomizedSearchCV
random_search = RandomizedSearchCV(net, params_dist, refit=True, cv=5, random_state=seed, scoring='neg_mean_absolute_error', verbose=0, n_iter = 10)

# Entraîner le modèle avec GridSearch
random_grid_result = random_search.fit(X_train_tensor, y_train_tensor)
nouveau_model = random_grid_result.best_estimator_

print("Best MSE: %f using %s" % (random_grid_result.best_score_, random_grid_result.best_params_))

Best MSE: -0.043757 using {'activation': 3, 'batch': 22, 'eta': 0.009960259174680096, 'lin_layer_dropouts': [0.05145737951022, 0.07438636288266487, 0.08606628077901546, 0.4055181959984358], 'lin_layer_sizes': [96], 'max_epochs': 491}


## Prédictions

In [31]:
y_pred = nouveau_model.predict(X_valid_tensor)
y_pred_train = nouveau_model.predict(X_train_tensor)
predictions = nouveau_model.predict(X_test_tensor)*100

# Vérifiez les sorties du modèle
print("y_pred contains NaN:", np.isnan(y_pred).any())
# Calculer la MSE
mse = np.mean(((y_pred - y_valid)*100)**2)
print("MSE :", mse)

# Calculer le t_score
train_score = np.mean(np.abs(y_pred_train-y_train)*100<=5)
test_score = np.mean(np.abs(y_pred-y_valid)*100<=5)
print("t_score test :", train_score)
print("t_score train :", test_score)

ids = np.arange(1, len(predictions) + 1)

# Create a DataFrame for the output
output_df = pd.DataFrame({

    'ID': ids,

    'PURITY': predictions

})

# Save the DataFrame to a CSV file
output_df.to_csv('predictions.csv', index=False) 

y_pred contains NaN: False
MSE : 38.15832206233432
t_score test : 0.7528846153846154
t_score train : 0.6923076923076923
