# XGBoost with carbon, silicon, nitrogen, and aluminum isotopes (TEST NOTEBOOK)

This notebook label-encodes the class labels and later inverts the encoding: the string labels are converted to integers (for the model's calculations), and the integer results are then converted back to the original letters.

In [21]:
# Imports
import joblib
import pandas as pd
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import LabelEncoder

from numpy import loadtxt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

## Read the CSV

In [13]:
# Read 'presolargrains_C_Si_N_Al.csv' into a DataFrame
C_Si_N_Al = pd.read_csv('presolargrains_C_Si_N_Al.csv')
# Show the first five rows as a quick check of the loaded columns
C_Si_N_Al.head()

Unnamed: 0,Type,carbon_12_13,nitrogen_14_15,aluminum_26_27,silicon_29_28,silicon_30_28
0,X,1581.0,116.0,0.0095,-684.0,-490.0
1,X,234.75,187.2,0.3327,-683.0,-501.0
2,X,140.0,97.0,0.017,-653.0,-446.0
3,X,223.0,102.0,0.114,-600.0,-459.0
4,X,1693.89,63.2,0.2364,-588.0,-605.0


## Train Test Split

In [15]:
# Target: the 'Type' column as a column vector of shape (n_rows, 1)
y = C_Si_N_Al['Type'].values.reshape(-1, 1)
# Features: every column from 'carbon_12_13' through 'silicon_30_28'
# (label-based slice, both endpoints included), all rows
X = C_Si_N_Al.loc[:, 'carbon_12_13':'silicon_30_28']
print(X)

     carbon_12_13  nitrogen_14_15  aluminum_26_27  silicon_29_28  \
0         1581.00          116.00        0.009500        -684.00   
1          234.75          187.20        0.332700        -683.00   
2          140.00           97.00        0.017000        -653.00   
3          223.00          102.00        0.114000        -600.00   
4         1693.89           63.20        0.236400        -588.00   
5          216.59          111.72        0.000030        -556.53   
6         2377.19           18.50        0.435700        -555.00   
7          578.92          148.16        0.089000        -551.00   
8         3101.02           28.20        0.437000        -521.00   
9            6.48           15.77        0.390000        -489.00   
10         552.00           58.00        0.204900        -473.30   
11         304.00          106.00        0.230000        -459.00   
12         391.00           39.00        0.167000        -452.00   
13        1528.00           46.00        0.06600

In [25]:
# Encode the string class labels as consecutive integers (0 .. n_classes - 1).
label_encoder = LabelEncoder()

# y was built as an (n, 1) column vector, but LabelEncoder expects a 1-D array;
# flatten it first so scikit-learn does not emit a DataConversionWarning.
data = y.ravel()

# Learn the label -> integer mapping and apply it in one step.
# After fitting, label_encoder.classes_ holds the original strings, indexed by code.
encoded_y = label_encoder.fit_transform(data)

# Display the integer-encoded labels
encoded_y


array([4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4, 4, 6, 6, 2,
       2, 0, 2, 0, 2, 0, 0, 2, 2, 4, 2, 0, 2, 0, 0, 5, 2, 2, 6, 2, 2, 2,
       0, 5, 5, 0, 2, 0, 2, 0, 2, 0, 0, 6, 5, 0, 2, 2, 0, 5, 2, 0, 0, 0,
       0, 0, 0, 0, 5, 2, 2, 2, 0, 2, 0, 0, 0, 2, 2, 2, 0, 0, 5, 0, 5, 5,
       0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 5, 2, 2, 5, 2, 0, 0, 0, 2, 2, 0, 2,
       5, 0, 0, 5, 5, 2, 2, 5, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0,
       0, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0,
       0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2,
       2, 2, 3, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 0, 5, 2, 2, 5, 2,
       2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2,
       2, 0, 0, 5, 0, 0, 2, 2, 0, 2, 2, 2, 6, 2, 2,

In [26]:
# Split features and encoded labels into train and test sets.
# No test_size is passed, so train_test_split's default split proportion applies.
# stratify=y keeps the class proportions of the original labels in both splits;
# random_state=1 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    encoded_y,
    stratify=y,
    random_state=1,
)

## Fit model

In [27]:
# Train an XGBoost classifier with its default hyperparameters on the training split
model = XGBClassifier()
model.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=None, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=None)

## Print model

In [28]:
# Print the fitted classifier's text representation, which lists its hyperparameters
print(model)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=None, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=None)


## Compare predictions to actual

In [29]:
# Predict the encoded class label for every row of the test split
y_pred = model.predict(X_test)
# Put predicted and actual encoded labels side by side for inspection
comparison_columns = {"Prediction": y_pred, "Actual": y_test}
pd.DataFrame(comparison_columns).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,2,2
2,2,2
3,4,4
4,2,2
5,5,5
6,2,2
7,2,2
8,4,4
9,0,0


## Report Accuracy

In [30]:
# Fraction of test rows whose predicted label matches the actual label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
# Report it as a percentage with two decimal places
accuracy_percent = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_percent)

Accuracy: 95.12%


In [9]:
# Not saving this model: only 328 records remain when all of these elements are used and rows with empty cells are removed.
#filename = 'XGBoost_Si.sav'
#joblib.dump(model, filename)


## Summary: 

### XGBoost with carbon, silicon, nitrogen, and aluminum isotopes: 95.1% test-set accuracy

In [35]:
# Map the integer codes back to the original string labels,
# for both the model's predictions and the actual test labels
inverse_pred = label_encoder.inverse_transform(y_pred)
inverse = label_encoder.inverse_transform(y_test)

In [36]:
# Put the decoded (string) predictions next to the decoded actual labels
decoded_columns = {"Prediction": inverse_pred, "Actual": inverse}
pd.DataFrame(decoded_columns).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,AB,AB
1,M,M
2,M,M
3,X,X
4,M,M
5,Y,Y
6,M,M
7,M,M
8,X,X
9,AB,AB
