In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.decomposition import PCA
from torch.utils.data import TensorDataset, DataLoader
from scipy.stats import uniform, randint, loguniform
from sklearn import preprocessing

import warnings
warnings.filterwarnings("ignore")

In [7]:
data_train = pd.read_csv("train.csv")
data_test = pd.read_csv("test.csv")

X_train = data_train.iloc[:, 6:]
X_test = data_test.iloc[:, 5:]

In [10]:
y = data_train["PURITY"]/100
seed = 43
"""
# Combine original data with PCA components
standardizer = StandardScaler()
X_st = standardizer.fit_transform(X_train)
"""
# Perform PCA to retain 95% of the variance
pca = PCA(42)
pca.fit(X_train)
X_pca = pca.transform(X_train)

X_normalised = preprocessing.normalize(X_pca)

X_train, X_valid, y_train, y_valid = train_test_split(X_normalised, y, test_size=0.2, random_state = 42)

# Convert to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).clone().detach()
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1).clone().detach()
X_valid_tensor = torch.tensor(X_valid, dtype=torch.float32).clone().detach()
y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.float32).reshape(-1, 1).clone().detach()

In [11]:
X_pca = pca.transform(X_test)
X_test_normalised = preprocessing.normalize(X_pca)
X_test_tensor = torch.tensor(X_test_normalised, dtype=torch.float32).clone().detach() 

In [8]:
y = data_train["PURITY"] / 100

# Diviser les données en train et validation

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y, test_size=0.2, random_state=42)

# Standardisation des données (avant PCA)
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_valid_standardized = scaler.transform(X_valid)
X_test_standardized = scaler.transform(X_test)

# PCA sur les données standardisées
pca = PCA(n_components=42)  # 95% variance expliquée
X_train_pca = pca.fit_transform(X_train_standardized)
X_valid_pca = pca.transform(X_valid_standardized)
X_test_pca = pca.transform(X_test_standardized)

# Convertir en tensors PyTorch
X_train_tensor = torch.tensor(X_train_pca, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
X_valid_tensor = torch.tensor(X_valid_pca, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.float32).reshape(-1, 1)
X_test_tensor = torch.tensor(X_test_pca, dtype=torch.float32)


In [9]:
# Définir le modèle de réseau de neurones simple

class FeedForwardNN(nn.Module):

    def __init__(self, input_size, lin_layer_sizes,

                 outpout_size, lin_layer_dropouts, activation):
        super().__init__()       
        if activation == 0:
            self.activation = nn.ReLU()
        elif activation == 1:
            self.activation = nn.SiLU()
        elif activation == 2:
            self.activation = nn.Tanh()
        elif activation == 3:
            self.activation = nn.LeakyReLU()  

        # Linear Layers
        first_lin_layer = nn.Linear(input_size, lin_layer_sizes[0])
        self.lin_layers = nn.ModuleList([first_lin_layer] + [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1]) for i in range(len(lin_layer_sizes) - 1)])     

        # Output Layer
        self.outpout_layer = nn.Linear(lin_layer_sizes[-1], outpout_size)
        
        # Dropout Layers
        self.dropout_layers = nn.ModuleList([nn.Dropout(rate) for rate,size in zip(lin_layer_dropouts,lin_layer_sizes)])

    def forward(self, x):
        for lin_layer, dropout_layer in zip(self.lin_layers, self.dropout_layers):
            x = lin_layer(x)     
            x = self.activation(x)
            x = dropout_layer(x)
 
        x = self.outpout_layer(x)
        x = nn.Sigmoid()(x)
  
        return x

# Définir la classe NeuralNetRegressor
class NeuralNetRegressor(BaseEstimator, RegressorMixin):

    def __init__(self, input_size, random_state, eta=0.001, max_epochs=100, batch=10, lin_layer_sizes = [50, 50],
            outpout_size = 1, lin_layer_dropouts = [0.4, 0.4], activation = 0):

        self.input_size = input_size
        self.random_state = random_state
        self.eta = eta
        self.max_epochs = max_epochs
        self.batch = batch
        self.lin_layer_sizes = lin_layer_sizes
        self.outpout_size = outpout_size
        self.lin_layer_dropouts = lin_layer_dropouts
        self.activation = activation
        self.model = FeedForwardNN(input_size, lin_layer_sizes,
                 outpout_size, lin_layer_dropouts, activation)

        self.criterion = nn.MSELoss()
    
    def fit(self, X, y, do_print=False):

        optimizer = optim.Adam(self.model.parameters(), lr=self.eta)
        X_tensor = torch.tensor(X).clone().detach().float()
        y_tensor = torch.tensor(y).clone().detach().float()
        dataset = TensorDataset(X_tensor, y_tensor)
        dataloader = DataLoader(dataset, batch_size=self.batch, shuffle=True)
        self.model.train()

        # Training loop
        for epoch in range(self.max_epochs):

            epoch_loss = 0.0
            for batch_X, batch_y in dataloader:
                optimizer.zero_grad()  # Reset gradients
                outputs = self.model(batch_X)  # Forward pass
                loss = self.criterion(outputs, batch_y)  # Compute loss
                loss.backward()  # Backward pass
                optimizer.step()  # Update parameters
                epoch_loss += loss.item()
            if do_print:

                print(f"Epoch {epoch+1}/{self.max_epochs}, Loss: {epoch_loss / len(dataloader)}")

        return self  

    def predict(self, X):
        self.model.eval()
        X_tensor = torch.tensor(X, dtype=torch.float32)
        with torch.no_grad():
            outputs = self.model(X_tensor).flatten()
        return outputs.numpy()
        
    def parameters(self):
        return self.model.parameters()

In [11]:
class RandomLayers:
    def __init__(self,min_layers, max_layers, min_nodes, max_nodes):
        self.min_nodes = min_nodes
        self.max_nodes = max_nodes
        self.min_layers = min_layers
        self.max_layers = max_layers

    def rvs(self, random_state=None):
        if random_state is not None:
            prev = randint.random_state
            randint.random_state = random_state 

        res = [randint.rvs(self.min_nodes, self.max_nodes) for _ in range(randint.rvs(self.min_layers, self.max_layers))]

        if random_state is not None:
            randint.random_state = prev

        return res

In [12]:
class RandomDropout:
    def __init__(self, max_layers,min_dropout,max_dropout):
        self.min_dropout = min_dropout
        self.max_dropout = max_dropout
        self.max_layers = max_layers

    def rvs(self, random_state=None):
        if random_state is not None:
            prev = uniform.random_state
            uniform.random_state = random_state 
        res = [uniform.rvs(self.min_dropout, self.max_dropout) for _ in range(self.max_layers)]

        if random_state is not None:
            uniform.random_state = prev

        return res

In [14]:
# Initialiser le modèle

seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
input_size = X_train_tensor.shape[1]
net = NeuralNetRegressor(input_size=input_size, random_state=seed)

max_layers = 5

# Définir les paramètres pour GridSearch
params_dist = {

    'eta': loguniform(1e-4, 1e-1),
    'max_epochs': randint(200, 500),
    'batch': randint(8, 32),
    'lin_layer_sizes': RandomLayers(1,max_layers,32,128),  # Taille de 1 à 4 couches, entre 32 et 128 neurones par couche
    'lin_layer_dropouts': RandomDropout(max_layers,0.,1.),
    'activation': randint(0, 4),
}

# Initialiser RandomizedSearchCV
random_search = RandomizedSearchCV(net, params_dist, refit=True, cv=5, random_state=seed, scoring='neg_mean_squared_error', verbose=3, n_iter=10)

# Entraîner le modèle avec GridSearch
random_grid_result = random_search.fit(X_train_tensor, y_train_tensor)
nouveau_model = random_grid_result.best_estimator_
print("Best MSE: %f using %s" % (random_grid_result.best_score_, random_grid_result.best_params_))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


KeyboardInterrupt: 

In [24]:
y_pred = nouveau_model.predict(X_valid_tensor)
y_pred_train = nouveau_model.predict(X_train_tensor)
predictions = nouveau_model.predict(X_test_tensor)*100

# Vérifiez les sorties du modèle
print("y_pred contains NaN:", np.isnan(y_pred).any())
# Calculer la MSE
mse = np.mean(((y_pred - y_valid)*100)**2)
print("MSE :", mse)

# Calculer le t_score
train_score = np.mean(np.abs(y_pred_train-y_train)*100<=5)
test_score = np.mean(np.abs(y_pred-y_valid)*100<=5)
print("t_score test :", train_score)
print("t_score train :", test_score)

ids = np.arange(1, len(predictions) + 1)

# Create a DataFrame for the output
output_df = pd.DataFrame({

    'ID': ids,

    'PURITY': predictions

})

# Save the DataFrame to a CSV file
output_df.to_csv('predictions.csv', index=False)

y_pred contains NaN: False
MSE : 17.89531156494539
t_score test : 0.9259615384615385
t_score train : 0.85
