# XGBoost with carbon isotopes

In [77]:
# Imports
import joblib
import pandas as pd

from numpy import loadtxt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

## Read the CSV

In [78]:
# load data
#dataset = loadtxt('presolargrains_C_only.csv', delimiter=",")
# Read csv file
C = pd.read_csv('presolargrains_carbon_letters.csv')
C.head()

Unnamed: 0,Type,carbon_isotopes
0,X,9455.1
1,X,6793.0
2,X,6227.0
3,X,4250.0
4,X,3993.46


## Train Test Split

In [80]:
# split data into X and y
X = C.loc[::1,'carbon_12_13':]
#X = C.loc[::1,'carbon_12_13':]
y = C['Type']
print(X)

       carbon_isotopes
0             9455.100
1             6793.000
2             6227.000
3             4250.000
4             3993.460
5             3290.100
6             3101.020
7             2882.400
8             2807.000
9             2730.000
10            2525.000
11            2509.690
12            2377.190
13            2234.300
14            2224.000
15            1980.800
16            1877.000
17            1848.000
18            1721.000
19            1693.890
20            1621.000
21            1581.000
22            1549.520
23            1528.000
24            1510.100
25            1453.100
26            1413.000
27            1357.410
28            1299.530
29            1296.000
...                ...
15592            2.500
15593            2.460
15594            2.450
15595            2.410
15596            2.370
15597            2.370
15598            2.360
15599            2.350
15600            2.350
15601            2.340
15602            2.328
15603      

In [81]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

## Fit model

In [96]:
# fit model
model = XGBClassifier()
model.fit(X_train, y_train, eval_metric='mae', verbose=1)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=None, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=None)

In [None]:
loss_MAE_xgd(X_test,y_test)

## Print model

In [83]:
# Print model
print(model)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=None, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=None)


## Compare predictions to actual

In [84]:
# Make predictions for test data
y_pred = model.predict(X_test)
# Create and print dataframe with predicted and actual types
pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,M,M
1,M,M
2,M,M
3,M,M
4,M,M
5,M,M
6,M,M
7,M,M
8,M,M
9,M,M


## Report Accuracy

In [85]:
# Evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 93.68%


In [92]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
kfold = StratifiedKFold(n_splits=10, random_state=1)
results = cross_val_score(model, X, y, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 9.54% (8.92%)


## Save the model

In [86]:
# Save this model. 
filename = 'XGBoost_C.sav'
joblib.dump(model, filename)

['XGBoost_C.sav']

## Summary: 

### XGBoost with carbon isotopes: 93.7%