In [None]:
#Data with Sr-Nd and Major, minor and trace elements. 
#Training my model with data that was labeled based on 143Nd/144Nd versus 87Sr/86Sr values. My data has 143Nd/144Nd versus 87Sr/86Sr values but also some major, minor and trace elements. 

import pandas as pd
import sklearn

final_data=pd.read_csv('../DataFP/final_data.csv')
labels = final_data["labelSrNd"]
#Normalizing my data with a Gaussian distribution
normalized_final_data = (final_data - final_data.mean()) / (final_data.std())
normalized_final_data = normalized_final_data.fillna(0)

normalized_final_data["labelSrNd"] = labels


In [None]:
#Dividing my data set for training (80%), validation (20%) and test (20%)
from sklearn.model_selection import train_test_split

train_data, no_train_data = train_test_split(normalized_final_data, test_size=0.2, random_state=42)
train_data.to_csv('../DataFP/train_data.csv',index=False)
print(len(train_data))

val_data, test_data = train_test_split(no_train_data, test_size=0.5, random_state=42)
val_data.to_csv('../DataFP/val_data.csv',index=False)
print(len(val_data))

test_data.to_csv('../DataFP/test_data.csv',index=False)
print(len(test_data))

In [121]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader


class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data.values
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx):
        #print(type(self.data))
        sample = self.data[idx]
        x = sample[1:] #all my features- chemical composition: Major and minor elements plus Sr and Nd isotopes
        y = sample[0] #mantle source label
        return x, y


datatrain = MyDataset(train_data)
train_loader = DataLoader(datatrain, batch_size=32, shuffle=True)
dataval=MyDataset(val_data)
val_loader= DataLoader(dataval, batch_size=32)
datatest=MyDataset(test_data)
test_loader=DataLoader(datatest, batch_size=32)




In [139]:
from sklearn.metrics import accuracy_score, f1_score


# Define your MLP model
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, output_size)


    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)
        #x=torch.softmax(x, dim=1)
        return x


#Hyperparameters
input_size = 19   #Major and minor elements plus Sr and Nd isotopes
hidden_size = 10 #I define it, should be between input and output
output_size = 5 #mantle source types: DM, HIMU, EMI. BSE, PREMA
learning_rate = 0.001
num_epochs = 100

# Instantiate your model
model = MLP(input_size, hidden_size, output_size)

# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#Start the loop for training and validation
for epoch in range(num_epochs):
    
    batch_predictions = []
    batch_true_labels = []
    train_loss=0.0
    
    # Training
    for i, (inputs, labels) in enumerate(train_loader):
        # Forward pass
        outputs = model(inputs.float())
        loss = criterion(outputs, labels.long())

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss+=loss.item()

        # Convert output probabilities to predicted class labels
        _, predicted = torch.max(outputs.data, 1)
        
        # Accumulating the batch predictions and true labels to lists per each 
        batch_predictions.extend(predicted.tolist())
        batch_true_labels.extend(labels.tolist())
   
    # Calculate metrics for each epoch
    train_acc= accuracy_score(batch_true_labels, batch_predictions)  #calculating the accuracy of all my batches from the accumulation of the batches predictions and true labels
    train_f1= f1_score(batch_true_labels, batch_predictions, average='weighted')
    train_loss/=i+1

    #_____________________________________________________________________________________________________________________________
    #Validation
    batch_val_predictions = []
    batch_val_true_labels = []
    val_loss=0.0
    
    for b, (inputs, labels) in enumerate(val_loader):
        with torch.no_grad():
            # Forward pass
            outputs = model(inputs.float())
            loss = criterion(outputs, labels.long())
            val_loss+=loss.item()
            
            # Convert output probabilities to predicted class labels
            _, predicted = torch.max(outputs.data, 1)

            # Accumulating the batch predictions and true labels to lists per each 
            batch_val_predictions.extend(predicted.tolist())
            batch_val_true_labels.extend(labels.tolist())
    
    # Calculate metrics for each epoch
    val_acc= accuracy_score(batch_val_true_labels, batch_val_predictions)  #calculating the accuracy of all my batches from the accumulation of the batches predictions and true labels
    val_f1 = f1_score(batch_val_true_labels, batch_val_predictions, average='weighted')
    val_loss/=b+1

    # Compute and print training loss every 10 epochs
    if (epoch+1) % 10 == 0:
        to_print = f'Epoch {epoch+1}/{num_epochs}: '
        to_print += f'Train Loss: {train_loss:.4f}, '
        to_print+=f'Train Acc: {train_acc:.4f}, '
        to_print+=f'Train f1: {train_f1:.4f}, '
        to_print+= f'Val Loss: {val_loss:.4f}, ' 
        to_print+=f'Val Acc: {val_acc:.4f}'
        to_print+=f'Val f1: {val_f1:.4f}, '
        print(to_print)

print('Training finished')


Epoch 10/100: Train Loss: 0.3920, Train Acc: 0.8758, Val Loss: 0.5844, Val Acc: 0.8901
Epoch 20/100: Train Loss: 0.2720, Train Acc: 0.9059, Val Loss: 0.7062, Val Acc: 0.8920
Epoch 30/100: Train Loss: 0.2387, Train Acc: 0.9153, Val Loss: 0.5745, Val Acc: 0.9108
Epoch 40/100: Train Loss: 0.1694, Train Acc: 0.9316, Val Loss: 0.4739, Val Acc: 0.9234
Epoch 50/100: Train Loss: 0.1178, Train Acc: 0.9601, Val Loss: 0.4582, Val Acc: 0.9579
Epoch 60/100: Train Loss: 0.0945, Train Acc: 0.9680, Val Loss: 0.4234, Val Acc: 0.9698
Epoch 70/100: Train Loss: 0.0828, Train Acc: 0.9706, Val Loss: 0.5017, Val Acc: 0.9391
Epoch 80/100: Train Loss: 0.0715, Train Acc: 0.9747, Val Loss: 0.4120, Val Acc: 0.9592
Epoch 90/100: Train Loss: 0.0739, Train Acc: 0.9730, Val Loss: 0.4213, Val Acc: 0.9611
Epoch 100/100: Train Loss: 0.0638, Train Acc: 0.9778, Val Loss: 0.4555, Val Acc: 0.9629
Training finished


In [148]:
#Test
from sklearn.metrics import classification_report, confusion_matrix

test_predictions = []
test_true_labels = []
test_loss=0.0

for c, (inputs, labels) in enumerate(test_loader):
    with torch.no_grad():
        # Forward pass
        outputs = model(inputs.float())
        loss = criterion(outputs, labels.long())
        test_loss+=loss.item()
        
        # Convert output probabilities to predicted class labels
        _, predicted = torch.max(outputs.data, 1)

        # Accumulating the batch predictions and true labels to lists per each 
        test_predictions.extend(predicted.tolist())
        test_true_labels.extend(labels.tolist())

test_acc= accuracy_score(test_true_labels,test_predictions)  
print(test_acc)
cm = confusion_matrix(test_true_labels, test_predictions)
report = classification_report(test_true_labels, test_predictions)
print(cm)
print(report)
cm_df = pd.DataFrame(cm, index=['DM', 'HIMU','EMI','BSE','PREMA'], columns=['DM_pred', 'HIMU_pred','EMI_pred','BSE_pred','PREMA_pred'])
cm_df.to_csv('../Results/confusion_matrix.csv')
with open('../Results/classification_report.txt', 'w') as f:
    f.write(report)



0.9585687382297552
[[145  15   9   0   5]
 [  0 355   0   0   7]
 [  5   0 790   2   0]
 [  0   0  12  54   0]
 [  8   3   0   0 183]]
              precision    recall  f1-score   support

         0.0       0.92      0.83      0.87       174
         1.0       0.95      0.98      0.97       362
         2.0       0.97      0.99      0.98       797
         3.0       0.96      0.82      0.89        66
         4.0       0.94      0.94      0.94       194

    accuracy                           0.96      1593
   macro avg       0.95      0.91      0.93      1593
weighted avg       0.96      0.96      0.96      1593

