In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/train-csv/train.csv
/kaggle/input/dataset-test/test.csv


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.decomposition import PCA
from torch.utils.data import TensorDataset, DataLoader
from scipy.stats import uniform, randint, loguniform

In [3]:
data = pd.read_csv("/kaggle/input/train-csv/train.csv")
data_test = pd.read_csv("/kaggle/input/dataset-test/test.csv")
X_train = data.iloc[:, 6:]
X_test = data_test.iloc[:, 5:]


In [4]:
#spectrum_filtered = pd.DataFrame(savgol_filter(spectrum, 7, 3, deriv = 2, axis = 0))
#spectrum_filtered_st = zscore(spectrum_filtered, axis = 1)

y = data["PURITY"]/100

pca = PCA(n_components=16)
X_pca = pd.DataFrame(pca.fit_transform(X_train))
X_new = pd.concat([X_train, X_pca], axis=1)
X_new.columns = X_new.columns.astype(str)
standardizer = StandardScaler()
X_st = standardizer.fit_transform(X_new)

X_train, X_valid, y_train, y_valid = train_test_split(X_st, y, test_size=0.2, random_state=42)

# Convert to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
X_valid_tensor = torch.tensor(X_valid, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.float32).reshape(-1, 1)

In [5]:
X_pca = pd.DataFrame(pca.transform(X_test))
X_new = pd.concat([X_test, X_pca], axis=1)
X_new.columns = X_new.columns.astype(str)

X_st = standardizer.transform(X_new)
X_test_tensor = torch.tensor(X_st, dtype=torch.float32)

In [6]:
# Définir le modèle de réseau de neurones simple
class FeedForwardNN(nn.Module):

    def __init__(self, input_size, lin_layer_sizes,
                 outpout_size, lin_layer_dropouts, activation):
        
        super().__init__()
        
        if activation == 0:
            self.activation = nn.ReLU()
        elif activation == 1:
            self.activation = nn.SiLU()
        elif activation == 2:
            self.activation = nn.Tanh()
        elif activation == 3:
            self.activation = nn.LeakyReLU()

    
        # Linear Layers
        first_lin_layer = nn.Linear(input_size, lin_layer_sizes[0])
    
        self.lin_layers = nn.ModuleList([first_lin_layer] + [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1]) for i in range(len(lin_layer_sizes) - 1)])
        
        #for lin_layer in self.lin_layers:
            #nn.init.kaiming_normal_(lin_layer.weight.data)
      
        # Output Layer
        self.outpout_layer = nn.Linear(lin_layer_sizes[-1], outpout_size)
        #nn.init.kaiming_normal_(self.outpout_layer.weight.data)
    
    
        # Dropout Layers
        self.dropout_layers = nn.ModuleList([nn.Dropout(rate) for rate,size in zip(lin_layer_dropouts,lin_layer_sizes)])

    def forward(self, x):
  
        for lin_layer, dropout_layer in zip(self.lin_layers, self.dropout_layers):

            x = lin_layer(x)
        
            x = self.activation(x)

            x = dropout_layer(x)
      
        x = self.outpout_layer(x)
        x = nn.Sigmoid()(x)
    
        return x

# Définir la classe NeuralNetRegressor
class NeuralNetRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, input_size, random_state, eta=0.001, max_epochs=100, batch=10, lin_layer_sizes = [50, 50],
                 outpout_size = 1, lin_layer_dropouts = [0.4, 0.4], activation = 0):
        self.input_size = input_size
        self.random_state = random_state
        self.eta = eta
        self.max_epochs = max_epochs
        self.batch = batch
        self.lin_layer_sizes = lin_layer_sizes
        self.outpout_size = outpout_size
        self.lin_layer_dropouts = lin_layer_dropouts
        self.activation = activation
        self.model = FeedForwardNN(input_size, lin_layer_sizes,
                 outpout_size, lin_layer_dropouts, activation)
        self.criterion = nn.MSELoss()
    
    def fit(self, X, y, do_print=False):
        optimizer = optim.Adam(self.model.parameters(), lr=self.eta)
        X_tensor = torch.tensor(X).clone().detach().float()
        y_tensor = torch.tensor(y).clone().detach().float()
        dataset = TensorDataset(X_tensor, y_tensor)
        dataloader = DataLoader(dataset, batch_size=self.batch, shuffle=True)
        self.model.train()
        # Training loop
        for epoch in range(self.max_epochs):
            epoch_loss = 0.0
            for batch_X, batch_y in dataloader:
                optimizer.zero_grad()  # Reset gradients
                outputs = self.model(batch_X)  # Forward pass
                loss = self.criterion(outputs, batch_y)  # Compute loss
                loss.backward()  # Backward pass
                optimizer.step()  # Update parameters
                epoch_loss += loss.item()
            if do_print:
                print(f"Epoch {epoch+1}/{self.max_epochs}, Loss: {epoch_loss / len(dataloader)}")
        return self
    
    def predict(self, X):
        self.model.eval()
        X_tensor = torch.tensor(X, dtype=torch.float32)
        with torch.no_grad():
            outputs = self.model(X_tensor).flatten()
        return outputs.numpy()
    
    def parameters(self):
        return self.model.parameters()

In [7]:
# Initialiser le modèle
seed = 43
np.random.seed(seed)
torch.manual_seed(seed)

input_size = X_train_tensor.shape[1]
net = NeuralNetRegressor(input_size=input_size, random_state=43)

# Définir les paramètres pour GridSearch
params_dist = {
    'eta': loguniform(1e-4, 1e-1),
    'max_epochs': randint(50, 150),
    'batch': randint(32, 70),
    'lin_layer_sizes': [[randint.rvs(32, 128) for _ in range(randint.rvs(1, 4))]],  # Taille de 1 à 4 couches, entre 32 et 128 neurones par couche
    'lin_layer_dropouts': [[uniform.rvs(0, 0.5) for _ in range(randint.rvs(1, 4))]],  # Dropout entre 0 et 0.5 pour chaque couche
    'activation': randint(0, 4),
}

# Initialiser RandomizedSearchCV
random_search = RandomizedSearchCV(net, params_dist, refit=True, cv=5, random_state=43, scoring='neg_mean_squared_error', verbose=0)

# Entraîner le modèle avec GridSearch
random_grid_result = random_search.fit(X_train_tensor, y_train_tensor)
nouveau_model = random_grid_result.best_estimator_

print("Best MSE: %f using %s" % (random_grid_result.best_score_, random_grid_result.best_params_))

  X_tensor = torch.tensor(X).clone().detach().float()
  y_tensor = torch.tensor(y).clone().detach().float()
  X_tensor = torch.tensor(X, dtype=torch.float32)
  X_tensor = torch.tensor(X).clone().detach().float()
  y_tensor = torch.tensor(y).clone().detach().float()
  X_tensor = torch.tensor(X, dtype=torch.float32)
  X_tensor = torch.tensor(X).clone().detach().float()
  y_tensor = torch.tensor(y).clone().detach().float()
  X_tensor = torch.tensor(X, dtype=torch.float32)
  X_tensor = torch.tensor(X).clone().detach().float()
  y_tensor = torch.tensor(y).clone().detach().float()
  X_tensor = torch.tensor(X, dtype=torch.float32)
  X_tensor = torch.tensor(X).clone().detach().float()
  y_tensor = torch.tensor(y).clone().detach().float()
  X_tensor = torch.tensor(X, dtype=torch.float32)
  X_tensor = torch.tensor(X).clone().detach().float()
  y_tensor = torch.tensor(y).clone().detach().float()
  X_tensor = torch.tensor(X, dtype=torch.float32)
  X_tensor = torch.tensor(X).clone().detach().float(

Best MSE: -0.001374 using {'activation': 0, 'batch': 44, 'eta': 0.0008877474082252488, 'lin_layer_dropouts': [0.06669548209299414, 0.12029480998267439], 'lin_layer_sizes': [96], 'max_epochs': 97}


In [8]:
y_pred = nouveau_model.predict(X_valid_tensor)
y_pred_train = nouveau_model.predict(X_train_tensor)
predictions = nouveau_model.predict(X_test_tensor)*100

# Vérifiez les sorties du modèle
print("y_pred contains NaN:", np.isnan(y_pred).any())

# Calculer la MSE
mse = np.mean(((y_pred - y_valid)*100)**2)
print("MSE :", mse)

# Calculer le t_score
train_score = np.mean(np.abs((y_pred_train-y_train)*100<=5))
test_score = np.mean(np.abs((y_pred-y_valid)*100<=5))
print("t_score test :", train_score)
print("t_score train :", test_score)

ids = np.arange(1, len(predictions) + 1)

# Create a DataFrame for the output
output_df = pd.DataFrame({
    'ID': ids,
    'PURITY': predictions
})

# Save the DataFrame to a CSV file
output_df.to_csv('predictions.csv', index=False)

y_pred contains NaN: False
MSE : 13.581837518917599
t_score test : 0.9673076923076923
t_score train : 0.9461538461538461


  X_tensor = torch.tensor(X, dtype=torch.float32)


In [9]:
y_pred

array([0.24029233, 0.36736083, 0.23973517, 0.28944698, 0.44281128,
       0.20794278, 0.20111245, 0.08866518, 0.19674653, 0.2020213 ,
       0.5335545 , 0.15061939, 0.16805309, 0.41468725, 0.11493612,
       0.4410217 , 0.52767915, 0.24338959, 0.2929284 , 0.18531315,
       0.18137062, 0.44028443, 0.21507297, 0.20732561, 0.11325336,
       0.12829323, 0.22798647, 0.1405351 , 0.18367869, 0.21559575,
       0.13566755, 0.52683574, 0.56059074, 0.5581551 , 0.5147138 ,
       0.31851846, 0.21192876, 0.14519423, 0.22513808, 0.23884729,
       0.17797928, 0.31730333, 0.3885626 , 0.20454884, 0.20779203,
       0.233489  , 0.16629846, 0.1405557 , 0.16310664, 0.56903446,
       0.16252631, 0.47592476, 0.48533267, 0.1154668 , 0.16940038,
       0.38872036, 0.14117686, 0.12515198, 0.46981642, 0.2936842 ,
       0.06529549, 0.04379864, 0.09060483, 0.55798274, 0.23972459,
       0.201648  , 0.19631031, 0.5403446 , 0.4810887 , 0.5815537 ,
       0.2170316 , 0.3939817 , 0.04245107, 0.33779395, 0.13909