In [119]:
#Data with Sr-Nd and Major, minor and trace elements. 
#Training my model with data that was labeled based on 143Nd/144Nd versus 87Sr/86Sr values. My data has 143Nd/144Nd versus 87Sr/86Sr values but also some major, minor and trace elements. 

import pandas as pd
import sklearn

final_data=pd.read_csv('../DataFP/final_data.csv')
print(final_data)

       labelSrNd   latitude   longitude  rb87_sr86  sr87_sr86  nd143_nd144   
0              2  33.686551  130.304669     0.6262   0.705200     0.512554  \
1              2  33.687129  130.304404     0.5366   0.705170     0.512661   
2              2  33.688168  130.301608     0.5165   0.705010     0.512694   
3              2  36.988056  138.505000        NaN   0.703871     0.512919   
4              2  43.076423  141.635239        NaN   0.705267     0.512626   
...          ...        ...         ...        ...        ...          ...   
15916          1        NaN         NaN        NaN   0.703183     0.513075   
15917          1        NaN         NaN        NaN   0.703204     0.513044   
15918          1        NaN         NaN        NaN   0.703140     0.513064   
15919          1        NaN         NaN        NaN   0.703195     0.513038   
15920          1        NaN         NaN        NaN   0.703199     0.513070   

       sm147_nd144  pb206_pb204  pb207_pb204  pb208_pb204  ... 

In [131]:
labels_SrNd = final_data['labelSrNd']
lat = final_data['latitude']
long = final_data['longitude']

min_latitude = -90
max_latitude = 90
min_longitude = -180
max_longitude = 180

# Normalize latitude and longitude values using min-max scaling
lat = (lat - min_latitude) / (max_latitude - min_latitude)
long = (long - min_longitude) / (max_longitude - min_longitude)

#Normalizing my data (chemical composition) with a Gaussian distribution 
normalized_final_data = (final_data - final_data.mean()) / (final_data.std())

normalized_final_data['labelSrNd'] = labels_SrNd
normalized_final_data['latitude'] = lat
normalized_final_data['longitude'] = long
normalized_final_data = normalized_final_data.fillna(0)

#print(normalized_final_data)


In [132]:
drop_first_col=True #True for only major, minor and trace. #False when including the isotopes Sr and Nd that I used for labeling the data
if drop_first_col:
    normalized_final_data= normalized_final_data.drop(columns=['sr87_sr86','nd143_nd144'])


print(normalized_final_data)

       labelSrNd  latitude  longitude  rb87_sr86  sm147_nd144  pb206_pb204   
0              2  0.687148   0.861957  -0.000867    -0.261708     0.000000  \
1              2  0.687151   0.861957  -0.004441     0.032258     0.000000   
2              2  0.687156   0.861949  -0.005243    -0.220730     0.000000   
3              2  0.705489   0.884736   0.000000     0.000000     0.000000   
4              2  0.739313   0.893431   0.000000     0.000000     0.000000   
...          ...       ...        ...        ...          ...          ...   
15916          1  0.000000   0.000000   0.000000     0.000000    -0.064399   
15917          1  0.000000   0.000000   0.000000     0.000000    -0.063284   
15918          1  0.000000   0.000000   0.000000     0.000000    -0.066631   
15919          1  0.000000   0.000000   0.000000     0.000000    -0.064081   
15920          1  0.000000   0.000000   0.000000     0.000000    -0.066710   

       pb207_pb204  pb208_pb204      sio2      tio2  ...    ta_

In [133]:
#Dividing my data set for training (80%), validation (20%) and test (20%)
from sklearn.model_selection import train_test_split

train_data, no_train_data = train_test_split(normalized_final_data, test_size=0.2, random_state=42)
#train_data.to_csv('../DataFP/train_data.csv',index=False)
print(len(train_data))

val_data, test_data = train_test_split(no_train_data, test_size=0.5, random_state=42)
#val_data.to_csv('../DataFP/val_data.csv',index=False)
print(len(val_data))

#test_data.to_csv('../DataFP/test_data.csv',index=False)
print(len(test_data))

12736
1592
1593


In [134]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader


class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data.values
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx):
        #print(type(self.data))
        sample = self.data[idx]
        x = sample[1:] #all my features- chemical composition: Major and minor elements plus Sr and Nd isotopes
        y = sample[0] #mantle source label
        return x, y


datatrain = MyDataset(train_data)
train_loader = DataLoader(datatrain, batch_size=32, shuffle=True)
dataval=MyDataset(val_data)
val_loader= DataLoader(dataval, batch_size=32)
datatest=MyDataset(test_data)
test_loader=DataLoader(datatest, batch_size=32)




In [137]:
from sklearn.metrics import accuracy_score, f1_score
import torch.nn.init as init
import torch.optim as optim



# Define your MLP model
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.dropout = nn.Dropout(p=0.1)  # Add dropout layer with dropout probability of 0.1
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, output_size)

         # Initialize the weights
        init.xavier_uniform_(self.fc1.weight)
        init.xavier_uniform_(self.fc2.weight)
        init.xavier_uniform_(self.fc3.weight)
        init.xavier_uniform_(self.fc4.weight)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)  # Apply dropout layer
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)  # Apply dropout layer
        #x = torch.relu(self.fc3(x))
        x = self.fc4(x)
        x=torch.softmax(x, dim=1)
        return x


#Hyperparameters
input_size = 53  #Number of features: Major and minor elements plus Sr and Nd isotopes
hidden_size = 25 #I define it, should be between input and output
output_size = 5 #mantle source types: DM, HIMU, EMI. BSE, PREMA
learning_rate = 0.001
num_epochs = 500

# Instantiate your model
model = MLP(input_size, hidden_size, output_size)

# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)#,weight_decay=1e-5)


#Start the loop for training and validation
for epoch in range(num_epochs):
    
    batch_predictions = []
    batch_true_labels = []
    train_loss=0.0
    
    # Training
    for i, (inputs, labels) in enumerate(train_loader):
        # Forward pass
        outputs = model(inputs.float())
        loss = criterion(outputs, labels.long())

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss+=loss.item()

        # Convert output probabilities to predicted class labels
        _, predicted = torch.max(outputs.data, 1)
        
        # Accumulating the batch predictions and true labels to lists per each 
        batch_predictions.extend(predicted.tolist())
        batch_true_labels.extend(labels.tolist())
   
    # Calculate metrics for each epoch
    train_acc= accuracy_score(batch_true_labels, batch_predictions)  #calculating the accuracy of all my batches from the accumulation of the batches predictions and true labels
    train_f1= f1_score(batch_true_labels, batch_predictions, average='weighted')
    train_loss/=i+1

    #_____________________________________________________________________________________________________________________________
    #Validation
    batch_val_predictions = []
    batch_val_true_labels = []
    val_loss=0.0
    
    for b, (inputs, labels) in enumerate(val_loader):
        with torch.no_grad():
            # Forward pass
            outputs = model(inputs.float())
            loss = criterion(outputs, labels.long())
            val_loss+=loss.item()
            
            # Convert output probabilities to predicted class labels
            _, predicted = torch.max(outputs.data, 1)

            # Accumulating the batch predictions and true labels to lists per each 
            batch_val_predictions.extend(predicted.tolist())
            batch_val_true_labels.extend(labels.tolist())
    
    # Calculate metrics for each epoch
    val_acc= accuracy_score(batch_val_true_labels, batch_val_predictions)  #calculating the accuracy of all my batches from the accumulation of the batches predictions and true labels
    val_f1 = f1_score(batch_val_true_labels, batch_val_predictions, average='weighted')
    val_loss/=b+1

    # Compute and print training loss every 10 epochs
    if (epoch+1) % 10 == 0:
        to_print = f'Epoch {epoch+1}/{num_epochs}: '
        to_print += f'Train Loss: {train_loss:.4f}, '
        to_print+=f'Train Acc: {train_acc:.4f}, '
        to_print+=f'Train f1: {train_f1:.4f}, '
        to_print+= f'Val Loss: {val_loss:.4f}, ' 
        to_print+=f'Val Acc: {val_acc:.4f}, '
        to_print+=f'Val f1: {val_f1:.4f}'
        print(to_print)

print('Training finished')


Epoch 10/500: Train Loss: 1.2829, Train Acc: 0.6194, Train f1: 0.5454, Val Loss: 1.2676, Val Acc: 0.6382, Val f1: 0.5704
Epoch 20/500: Train Loss: 1.2565, Train Acc: 0.6483, Train f1: 0.5806, Val Loss: 1.2549, Val Acc: 0.6501, Val f1: 0.5831
Epoch 30/500: Train Loss: 1.2435, Train Acc: 0.6594, Train f1: 0.5931, Val Loss: 1.2500, Val Acc: 0.6501, Val f1: 0.5858
Epoch 40/500: Train Loss: 1.2362, Train Acc: 0.6672, Train f1: 0.6021, Val Loss: 1.2490, Val Acc: 0.6482, Val f1: 0.5864
Epoch 50/500: Train Loss: 1.2297, Train Acc: 0.6736, Train f1: 0.6100, Val Loss: 1.2449, Val Acc: 0.6533, Val f1: 0.5912
Epoch 60/500: Train Loss: 1.2224, Train Acc: 0.6810, Train f1: 0.6171, Val Loss: 1.2354, Val Acc: 0.6633, Val f1: 0.6031
Epoch 70/500: Train Loss: 1.2201, Train Acc: 0.6838, Train f1: 0.6221, Val Loss: 1.2374, Val Acc: 0.6633, Val f1: 0.6019
Epoch 80/500: Train Loss: 1.2122, Train Acc: 0.6918, Train f1: 0.6292, Val Loss: 1.2404, Val Acc: 0.6570, Val f1: 0.5934
Epoch 90/500: Train Loss: 1.2124

In [138]:
#Test
from sklearn.metrics import classification_report, confusion_matrix

test_predictions = []
test_true_labels = []
test_loss=0.0

for c, (inputs, labels) in enumerate(test_loader):
    with torch.no_grad():
        # Forward pass
        outputs = model(inputs.float())
        loss = criterion(outputs, labels.long())
        test_loss+=loss.item()
        
        # Convert output probabilities to predicted class labels
        _, predicted = torch.max(outputs.data, 1)

        # Accumulating the batch predictions and true labels to lists per each 
        test_predictions.extend(predicted.tolist())
        test_true_labels.extend(labels.tolist())

test_acc= accuracy_score(test_true_labels,test_predictions)  
print(test_acc)
cm = confusion_matrix(test_true_labels, test_predictions)
report = classification_report(test_true_labels, test_predictions)
print(cm)
print(report)
cm_df = pd.DataFrame(cm, index=['DM', 'HIMU','EMI','BSE','PREMA'], columns=['DM_pred', 'HIMU_pred','EMI_pred','BSE_pred','PREMA_pred'])
cm_df.to_csv('../Results/confusion_matrix_500.csv')
with open('../Results/classification_report_500.txt', 'w') as f:
    f.write(report)
print(cm_df)



0.6911487758945386
[[  0  37  98   0  27]
 [  0 217  80   0  20]
 [  0  34 787   0  33]
 [  0   0  55   0   0]
 [  0  36  72   0  97]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       162
         1.0       0.67      0.68      0.68       317
         2.0       0.72      0.92      0.81       854
         3.0       0.00      0.00      0.00        55
         4.0       0.55      0.47      0.51       205

    accuracy                           0.69      1593
   macro avg       0.39      0.42      0.40      1593
weighted avg       0.59      0.69      0.63      1593

       DM_pred  HIMU_pred  EMI_pred  BSE_pred  PREMA_pred
DM           0         37        98         0          27
HIMU         0        217        80         0          20
EMI          0         34       787         0          33
BSE          0          0        55         0           0
PREMA        0         36        72         0          97


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#Data with Sr-Nd and Major, minor and trace elements. 
#Training my model with data that was labeled based on 143Nd/144Nd versus 87Sr/86Sr values. My data has 143Nd/144Nd versus 87Sr/86Sr values but also some major, minor and trace elements. 
