Calculate the confidence interval of the error by using t-distribution.

In [1]:
import numpy as np
import pandas as pd

import datetime
date = datetime.datetime.now().strftime('%Y%m%d')

# Case study

In [10]:
from joblib import load
from  nwp_cali import PrepareData

measurement = 'CaCO3%'
model = load('models/{}_nmf+svr_model_20210823.joblib'.format(measurement[:-1].lower()))
prepare = PrepareData(measurement=measurement)
data_df = prepare.select_casestudy(case_cores = ['PS75-056-1'])
X, y = prepare.produce_Xy(data_df)

y_df = data_df[['core', 'mid_depth_mm']].copy()
y_df[measurement] = y
y_df['{}_pred'.format(measurement)] = np.exp(model.predict(X))

measurement = 'TOC%'
model = load('models/{}_nmf+svr_model_20210823.joblib'.format(measurement[:-1].lower()))
prepare = PrepareData(measurement=measurement)
data_df = prepare.select_casestudy(case_cores = ['PS75-056-1'])
X, y = prepare.produce_Xy(data_df)

y_df[measurement] = y
y_df['{}_pred'.format(measurement)] = np.exp(model.predict(X))

y_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 0 to 102
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   core          103 non-null    object 
 1   mid_depth_mm  103 non-null    float64
 2   CaCO3%        103 non-null    float64
 3   CaCO3%_pred   103 non-null    float64
 4   TOC%          103 non-null    float64
 5   TOC%_pred     103 non-null    float64
dtypes: float64(5), object(1)
memory usage: 5.6+ KB


In [11]:
from sklearn.metrics import mean_squared_error

for measurement in ['CaCO3%', 'TOC%']:
    print(
        'RMSE', 
        measurement, 
        mean_squared_error(
            y_df[measurement], 
            y_df['{}_pred'.format(measurement)],
            squared=False)
    )

RMSE CaCO3% 6.658810134456883
RMSE TOC% 0.0562574184997256


In [5]:
from scipy import stats

confidence = 0.95

for measurement in ['CaCO3%', 'TOC%']:
    err = (y_df['{}_pred'.format(measurement)] - y_df[measurement]).values
    mean, sigma = np.mean(err), np.std(err)
    conf_int = stats.t.interval(confidence, df=len(err)-1, loc=mean, scale=sigma)
    print(measurement, conf_int)


CaCO3% (-12.932316583586628, 13.462202501609339)
TOC% (-0.10662418268144226, 0.1158043820480197)


In [13]:
np.var(y_df['TOC%_pred'])

0.003858032613547529

# Test set

In [14]:
y_df = pd.read_csv('results/y_dev_preds_20210823.csv', index_col=0)
y_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 394 entries, 0 to 393
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   CaCO3%       372 non-null    float64
 1   CaCO3%_pred  372 non-null    float64
 2   TOC%         394 non-null    float64
 3   TOC%_pred    394 non-null    float64
dtypes: float64(4)
memory usage: 15.4 KB


In [9]:
from sklearn.metrics import mean_squared_error

for measurement in ['CaCO3%', 'TOC%']:
    print(
        'RMSE',
        measurement, 
        mean_squared_error(
            y_df[measurement].dropna(), 
            y_df['{}_pred'.format(measurement)].dropna(),
            squared=False)
    )

RMSE CaCO3% 3.5701037782558336
RMSE TOC% 0.06579334866534074


In [13]:
from scipy import stats

confidence = 0.95

for measurement in ['CaCO3%', 'TOC%']:

    err = (y_df['{}_pred'.format(measurement)].dropna() - y_df[measurement].dropna()).values
    mean, sigma = np.mean(err), np.std(err)
    conf_int = stats.t.interval(confidence, df=len(err)-1, loc=mean, scale=sigma)
    print(measurement, conf_int)


CaCO3% (-7.0417460546035295, 6.998347385268411)
TOC% (-0.12590889355885268, 0.132471106175832)


In [15]:
np.var(y_df['TOC%_pred'])

0.018987020498692025