Predict CaCO3 and TOC using the latest models (Aug. 2021) on the whole set of spectra (June 2022).

In [1]:
import numpy as np 
import pandas as pd

import datetime
# Run-date stamp in YYYYMMDD form; it is reused when naming the results file.
date = '{:%Y%m%d}'.format(datetime.datetime.now())
print(date)

20220629


# Read spe dataset and models

In [3]:
# Load the spectra table; the first CSV column becomes the row index.
spe_df = pd.read_csv(
    filepath_or_buffer='data/spe_dataset_20220629.csv',
    index_col=0,
)
# Bare last expression so the notebook renders a preview of the frame.
spe_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2044,2045,2046,2047,cps,core,composite_depth_mm,section_depth_mm,filename,section
composite_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SO264-09-2_00010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,174740,SO264-09-2,10,10,SO264-09-2_0000 10.0mm 10s 10kV 150uA No-F...,0
SO264-09-2_00020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,176896,SO264-09-2,20,20,SO264-09-2_0000 20.0mm 10s 10kV 150uA No-F...,0
SO264-09-2_00030,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,183148,SO264-09-2,30,30,SO264-09-2_0000 30.0mm 10s 10kV 150uA No-F...,0
SO264-09-2_00040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,185151,SO264-09-2,40,40,SO264-09-2_0000 40.0mm 10s 10kV 150uA No-F...,0
SO264-09-2_00050,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,186255,SO264-09-2,50,50,SO264-09-2_0000 50.0mm 10s 10kV 150uA No-F...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SO264-69-2_18460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,109462,SO264-69-2,18460,910,SO264-69-1_017550 910.0mm 10s 10kV 150uA No...,18
SO264-69-2_18470,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,108611,SO264-69-2,18470,920,SO264-69-1_017550 920.0mm 10s 10kV 150uA No...,18
SO264-69-2_18480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,111142,SO264-69-2,18480,930,SO264-69-1_017550 930.0mm 10s 10kV 150uA No...,18
SO264-69-2_18490,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,109224,SO264-69-2,18490,940,SO264-69-1_017550 940.0mm 10s 10kV 150uA No...,18


In [4]:
# Take the first 2048 columns (labelled 0..2047 in the table; presumably the
# spectral channels -- confirm) as a plain array.
X = spe_df.iloc[:, :2048].values
# Scale every row by its own total so that each row sums to 1.
X = np.true_divide(X, X.sum(axis=1, keepdims=True))

In [5]:
from joblib import load

# NOTE(review): joblib.load is pickle-based, so only model files from a
# trusted source should be loaded here.
# Models are loaded in order: CaCO3 first, then TOC.
m_caco3, m_toc_svr = map(load, (
    'models/caco3_nmf+svr_model_20210823.joblib',
    'models/toc_nmf+svr_model_20210823.joblib',
))

# Predict

In [6]:
# Run each model on the normalized spectra and exponentiate its output.
# NOTE(review): the exp() suggests the models predict log-transformed values --
# confirm against the training notebook.
y_caco3, y_toc_svr = (
    np.exp(model.predict(X)) for model in (m_caco3, m_toc_svr)
)

# Build dataset

In [7]:
# Keep the last five columns of the spectra table and append one column per
# predicted quantity; .assign returns a new frame, so spe_df is not modified.
predict_df = spe_df.iloc[:, -5:].assign(**{
    'CaCO3 prediction (wt%)': y_caco3,
    'TOC prediction (wt%)': y_toc_svr,
})

# Check

In [8]:
# Flag rows where at least one of the two predictions is above 100.
mask = (predict_df[['CaCO3 prediction (wt%)', 'TOC prediction (wt%)']] > 100).any(axis=1)

# Report the flagged rows as a count and as a share of all rows.
print('There are {} ({:.2f} %) predictions having values over 100.'.format(
    mask.sum(),
    mask.sum() / len(mask) * 100,
))

There are 410 (0.69 %) predictions having values over 100.


In [9]:
# Save the predictions under a file name stamped with the run date,
# then echo the stamp so the output file is easy to identify.
predict_df.to_csv(path_or_buf='results/predict_{}.csv'.format(date))
print(date)

20220629
