In [17]:
import pandas as pd
import numpy as np

# Read in the data
OilSource = pd.read_excel('../Data/OilSourceGeochemicalData.xlsx', sheet_name='oil source correlation')

# Data preprocessing
OilSource = OilSource[OilSource['BD3']+OilSource['BS12']+OilSource['BS3']+OilSource['QD3']+OilSource['QS12']+OilSource['QS3'] == 1]
OilSource = OilSource.dropna(axis=1, thresh=0.9*OilSource.shape[0]).reset_index(drop=True)
print(OilSource.shape)

# Get the chemical compounds and labels
Chemical_Compounds_Oil_None = OilSource.iloc[:, 6:-7]
Labels_Oil = OilSource.iloc[:, -1]

Chemical_Compounds_Oil_None

(161, 107)


Unnamed: 0,C17 n-heptadecane,Pr pristane,C18 n-octadecane,Ph phytane,"5α(H),14β(H)-C21 pregnane",C22 homopregnane,20S-ααα-cholestane-C27,20R-αββ-cholestane-C27,20S-αββ-cholestane-C27,20R-ααα-cholestane-C27,...,C29 triaromatic sterane-6,C29 triaromatic sterane-7,C29 triaromatic sterane-8,C29 triaromatic sterane-9,C28 triaromatic sterane-10,C28 triaromatic sterane-11,C29 triaromatic sterane-12,C29 triaromatic sterane-13,C29 triaromatic sterane-14,C29 triaromatic sterane-15
0,5.797785e+05,2.239026e+05,6.790332e+05,2.356506e+05,1638.479,996.499,5529.045,7336.887,6769.838,11286.543,...,1.775585e+04,1.770056e+04,13060.062000,15114.427000,14988.837000,2.196522e+04,9153.449000,2.054157e+04,1.236306e+04,3.147971e+04
1,1.529140e+06,8.560587e+05,1.830612e+06,9.051154e+05,8725.509,6062.808,26757.087,33365.675,29497.472,47058.916,...,1.912528e+04,2.157131e+04,16234.676000,17664.934000,16462.430000,2.265946e+04,10817.539000,2.111641e+04,1.537440e+04,3.217174e+04
2,3.036959e+08,1.076632e+08,3.216615e+08,1.085969e+08,1028499.004,536726.943,1824787.085,3188949.106,2454566.379,2741782.050,...,8.730336e+03,8.241961e+03,4210.113000,7240.853000,7500.385000,9.936366e+03,3328.928000,9.276243e+03,6.403402e+03,1.436476e+04
3,1.136146e+07,4.589355e+06,1.154116e+07,3.703026e+06,33570.857,17146.426,105491.072,143957.609,124806.427,203237.820,...,1.097447e+06,1.363472e+06,308912.992552,805105.462948,901389.517784,1.142248e+06,372262.756389,1.174602e+06,1.149289e+06,1.836620e+06
4,9.793163e+06,4.076726e+06,9.777026e+06,3.447297e+06,19059.458,9577.565,51409.919,76257.433,67705.718,102764.701,...,1.097447e+06,1.363472e+06,308912.992552,805105.462948,901389.517784,1.142248e+06,372262.756389,1.174602e+06,1.149289e+06,1.836620e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,4.322804e+07,2.540430e+07,4.476840e+04,1.248973e+05,72325.157,43760.112,84666.308,112679.667,77282.875,90035.822,...,1.097447e+06,1.363472e+06,308912.992552,805105.462948,901389.517784,1.142248e+06,372262.756389,1.174602e+06,1.149289e+06,1.836620e+06
157,4.322804e+07,2.540430e+07,5.508364e+07,2.773086e+07,61060.256,31539.432,97932.743,110382.848,74391.810,105680.007,...,1.097447e+06,1.363472e+06,308912.992552,805105.462948,901389.517784,1.142248e+06,372262.756389,1.174602e+06,1.149289e+06,1.836620e+06
158,5.309100e+04,5.372592e+05,9.768602e+04,1.242950e+06,66494.537,38774.564,79134.288,122823.780,77811.507,88952.833,...,1.097447e+06,1.363472e+06,308912.992552,805105.462948,901389.517784,1.142248e+06,372262.756389,1.174602e+06,1.149289e+06,1.836620e+06
159,6.389803e+03,7.774911e+03,7.534790e+03,1.301606e+04,4488.170,2011.267,5157.562,4136.402,4963.639,8910.483,...,8.351000e+03,8.358515e+03,308912.992552,2772.000000,6772.000000,8.499000e+03,2891.000000,6.977000e+03,6.716000e+03,1.335100e+04


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler

# Split the data into training, validation and test sets
X_Oil_Train, X_Oil_Test, Y_Oil_Train, Y_Oil_Test = train_test_split(Chemical_Compounds_Oil_None, Labels_Oil, test_size=0.2, random_state=0)
X_Oil_Train, X_Oil_Val, Y_Oil_Train, Y_Oil_Val = train_test_split(Chemical_Compounds_Oil_None, Labels_Oil, test_size=0.25, random_state=0)

# Scale the data
Scaler_Oil = MinMaxScaler()
X_Oil_Train_Scaled = Scaler_Oil.fit_transform(X_Oil_Train)
X_Oil_Val_Scaled = Scaler_Oil.transform(X_Oil_Val)
X_Oil_Test_Scaled = Scaler_Oil.transform(X_Oil_Test)

In [27]:
from SVM_SelectKBest import SVM_SelectKBest

# Initialize the SVM-SelectKBest model with a specific random state for reproducibility.
svm_selector = SVM_SelectKBest(random_state=1)

# Fit the model to the training data and transform the training, validation, and test sets to select the best features.
X_train_selected = svm_selector.fit_transform(X_Oil_Train_Scaled, Y_Oil_Train, X_Oil_Val_Scaled, Y_Oil_Val)
X_val_selected = svm_selector.transform(X_Oil_Val_Scaled)
X_test_selected = svm_selector.transform(X_Oil_Test_Scaled)

# Predict the labels using the best model obtained from SVM-SelectKBest.
Y_train_pred = svm_selector.best_model_.predict(X_train_selected)
Y_val_pred = svm_selector.best_model_.predict(X_val_selected)
Y_test_pred = svm_selector.best_model_.predict(X_test_selected)

# Output the best 'k' value (number of features) and the best adjusted score (e.g., accuracy or F1 score).
print("Best k:", svm_selector.best_k)
print("Best adjusted score:", svm_selector.best_score)

# Display the selected features and their corresponding scores.
print("Selected features:", X_Oil_Train.columns[svm_selector.indices_[:svm_selector.best_k]])
print("Selected features scores:", svm_selector.scores_[svm_selector.indices_[:svm_selector.best_k]])

Best k: 32
Best adjusted score: 1.0
Selected features: Index(['1,3,7-三甲基萘', 'Pr 姥鲛烷', '22S-17α(H),21β(H)-C35五升藿烷',
       '22R-17α (H),21β(H)-C35五升藿烷', '22S-17α(H),21β(H)-C34四升藿烷', '2-甲基菲',
       '22R-17α(H),21β(H)-C34四升藿烷', '22S-17α(H),21β(H)-C33三升藿烷', 'Ph 植烷',
       '22S-17α(H),21β(H)-C32二升藿烷', '22R-17α(H),21β(H)-C33三升藿烷', 'C29降新霍烷Ts',
       '3-甲基菲', '22R-17α(H),21β(H)-C32二升藿烷', '20R-αββ-胆甾烷-C27',
       '22S-17α(H),21β(H)-C31升藿烷', '22R-17α(H),21β(H)-C31升藿烷', '伽玛蜡烷',
       '17α(H),21β(H)-C30藿烷', '22R-17β(H),21α(H)-升莫烷', '20S-αββ-胆甾烷-C27',
       '20S-24-乙基-αββ-胆甾烷-C29', 'C18 正构十八烷', 'C17 正构十七烷', '20S-ααα-胆甾烷-C27',
       '20R-24-乙基-αββ-胆甾烷-C29', '17α,21β-30-降藿烷', '20R-24-甲基-αββ-胆甾烷-C28',
       '20R-24-丙基-ααα-4甲基甾烷-C30', '18α,-22,29,30-三降藿烷(Ts)', '18α(H)-重排藿烷',
       '20S-24-丙基-αββ-4甲基甾烷-C30'],
      dtype='object')
Selected features scores: [11.0165779   1.70448496  1.46257425  1.43047814  1.33159286  1.31893059
  1.25344662  1.23708175  1.23355643  1.22713022  1.19942301  1.19

In [24]:
# Output the evaluation metrics
print('Train F1 Score: %.3f' % f1_score(Y_Oil_Train, Y_train_pred, average='macro'))
print('Validation F1 Score: %.3f' % f1_score(Y_Oil_Val, Y_val_pred, average='macro'))
print('Test F1 Score: %.3f' % f1_score(Y_Oil_Test, Y_test_pred, average='macro'), end='\n\n')

print('Train Precision: %.3f' % precision_score(Y_Oil_Train, Y_train_pred))
print('Validation Precision: %.3f' % precision_score(Y_Oil_Val, Y_val_pred))
print('Test Precision: %.3f' % precision_score(Y_Oil_Test, Y_test_pred), end='\n\n')

print('Train Recall: %.3f' % recall_score(Y_Oil_Train, Y_train_pred))
print('Validation Recall: %.3f' % recall_score(Y_Oil_Val, Y_val_pred))
print('Test Recall: %.3f' % recall_score(Y_Oil_Test, Y_test_pred))

Train F1 Score: 1.000
Validation F1 Score: 1.000
Test F1 Score: 1.000

Train Precision: 1.000
Validation Precision: 1.000
Test Precision: 1.000

Train Recall: 1.000
Validation Recall: 1.000
Test Recall: 1.000


In [21]:
# Output the best parameters
print('Best Parameters:', svm_selector.best_model_.best_params_)

Best Parameters: {'C': 1000, 'class_weight': None, 'kernel': 'rbf'}


In [22]:
# Output the confusion matrix
print(Y_Oil_Train.values)
print(svm_selector.best_model_.predict(X_train_selected))
print(np.logical_xor(Y_Oil_Train.values, svm_selector.best_model_.predict(X_train_selected)).sum(), Y_Oil_Train.shape[0], end='\n\n')

print(Y_Oil_Val.values)
print(svm_selector.best_model_.predict(X_val_selected))
print(np.logical_xor(Y_Oil_Val.values, svm_selector.best_model_.predict(X_val_selected)).sum(), Y_Oil_Val.shape[0], end='\n\n')

print(Y_Oil_Test.values)
print(svm_selector.best_model_.predict(X_test_selected))
print(np.logical_xor(Y_Oil_Test.values, svm_selector.best_model_.predict(X_test_selected)).sum(), Y_Oil_Test.shape[0])

[0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 1 0 0]
[0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 1 0 0]
0 120

[0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 1 0 0]
[0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 1 0 0]
0 41

[0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0]
[0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0]
0 33

