In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.insert(0, module_path)
from pathlib import Path

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
from spectroscopy.utils import load_training_data, plot_residuals, get_wavelength_columns, parse_trm_files

In [4]:
from spectroscopy.model import load_model, get_features

In [5]:
df_trms = parse_trm_files()
# fill negative values of trms
# wavelength_columns = get_wavelength_columns(df_trms)
# df_trms[df_trms[wavelength_columns] < 0][wavelength_columns] = 0
num = df_trms._get_numeric_data()

num[num < 0] = 0

In [6]:
num[num < 0]

Unnamed: 0,862.13,863.88,865.63,867.38,869.13,870.88,872.63,874.38,876.13,877.88,...,1737.13,1738.88,1740.63,1742.38,1744.13,1745.88,1747.63,1749.38,1751.13,1752.88
0,,,,,,,,,,,...,,,,,,,,,,
0,,,,,,,,,,,...,,,,,,,,,,
0,,,,,,,,,,,...,,,,,,,,,,
0,,,,,,,,,,,...,,,,,,,,,,
0,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,,,,,,,,,,...,,,,,,,,,,
0,,,,,,,,,,,...,,,,,,,,,,
0,,,,,,,,,,,...,,,,,,,,,,
0,,,,,,,,,,,...,,,,,,,,,,


In [7]:
model = load_model(model_dir=Path().cwd().parent/'bin'/'model')

In [8]:
data = load_training_data()

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Columns: 519 entries, 862.13 to filename_lr
dtypes: float64(511), int64(2), object(6)
memory usage: 811.1+ KB


In [10]:
data.describe()

Unnamed: 0,862.13,863.88,865.63,867.38,869.13,870.88,872.63,874.38,876.13,877.88,...,1742.38,1744.13,1745.88,1747.63,1749.38,1751.13,1752.88,integration_time,run_number,Ammonia-N
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,...,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,0.001,0.001,0.001,9.901022,38.542137,32.564984,58.357895,60.640615,60.167605,58.18619,...,53.611455,53.735745,53.82563,53.89462,53.97089,54.062445,54.17917,40.3,1.885,0.3603
std,2.173846e-19,2.173846e-19,2.173846e-19,20.351293,86.473192,65.745699,11.230318,11.456488,11.405822,11.201562,...,10.802835,10.800146,10.807613,10.819466,10.825458,10.828973,10.825276,2.728962,0.833817,0.305992
min,0.001,0.001,0.001,0.0,0.0,0.0,15.144,15.92,15.825,15.246,...,15.858,15.925,15.931,15.938,15.998,16.056,16.092,35.0,1.0,0.07
25%,0.001,0.001,0.001,0.0,0.0,0.0,53.98175,56.36375,55.7225,53.73525,...,48.947,49.0355,49.0865,49.127,49.1535,49.188,49.31575,40.0,1.0,0.2
50%,0.001,0.001,0.001,0.0,0.0,0.0,59.711,62.551,61.969,59.574,...,53.961,54.1095,54.2105,54.2585,54.314,54.39,54.477,40.0,2.0,0.32
75%,0.001,0.001,0.001,12.8385,39.724,41.2805,66.054,68.343,67.79675,65.64575,...,61.2195,61.31275,61.422,61.478,61.49425,61.62225,61.70975,40.0,3.0,0.44
max,0.001,0.001,0.001,101.09,464.49,369.44,76.971,78.993,78.589,76.93,...,73.553,73.608,73.651,73.713,73.801,73.876,73.898,45.0,3.0,2.24


In [11]:
data['sample_name'].value_counts()

hampton west 2     10
hampton east 2      9
south 16            6
south 17            6
south 15            6
south 2             6
south 1             6
hampton west        6
south 9             6
hampton west 1      4
south 8             4
north 2             4
north 3             4
south 3             4
north 4             4
north 5             4
north 1             4
north 7             4
north 6             4
south 4             4
south 7             4
north 9             4
south 11            4
south 18            4
south 12            4
south 10            4
south 5             4
south 6             4
north 8             4
south 13            4
south 14            4
north 10            3
north 12            3
dows dry            3
north 11            3
dows wet            3
hampton coulter     3
iowa falls          3
west 13             3
north 18            3
north 14            3
hampton east 1      3
west 12             3
north 16            3
north 13            3
west 14   

In [12]:
feature_columns = get_features(data)
X, y = data[feature_columns], data['Ammonia-N']

KeyError: "[nan, 'ground', 'moist'] not in index"

In [None]:
feature_columns

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [None]:
y_train.hist()

In [None]:
y_test.hist()

In [None]:
plot_residuals(y_test, model.predict(X_test))

In [None]:
len(X), len(X_test), len(X_train)

In [None]:
assert (X_test.index == y_test.index).all()

# view highest error samples

In [13]:
def rank_error(model, X, y_true, original_data, n_instances=50):
    y_pred = model.predict(X)
    # calculate error metrics
    residuals = y_true - y_pred
    abs_residuals = np.abs(residuals)
    # get indices of highest error
    abs_res_indices = np.argsort(abs_residuals)[::-1][:n_instances]
    # pull training samples with highest error
    X_ranked = X.iloc[abs_res_indices]
    # pull original data corresponding to those training samples
    high_error_data = data.iloc[X_ranked.index].copy()
    high_error_data['predicted_Ammonia-N'] = y_pred[abs_res_indices]
    high_error_data['absolute_residual'] = abs_residuals.iloc[abs_res_indices]
    high_error_data['residual'] = residuals.iloc[abs_res_indices]
    return high_error_data

In [14]:
data_ranked = rank_error(model, X_test, y_test, data)

NameError: name 'X_test' is not defined

In [15]:
data_ranked

NameError: name 'data_ranked' is not defined

In [16]:
data_ranked.to_csv('data_ranked_by_residuals.csv')

NameError: name 'data_ranked' is not defined

In [17]:
X['process_method']

NameError: name 'X' is not defined

In [70]:
df = load_training_data()
# df = df[df['process_method'] == 'ground']
df['process_method'].fillna('none', inplace=True)
df['process_method'] = df['process_method'].astype(str)

In [71]:
len(df)

200

In [72]:
df['process_method'].value_counts(dropna=False)

none      132
ground     53
moist      15
Name: process_method, dtype: int64

In [73]:
df['process_method'].unique()

array(['none', 'moist', 'ground'], dtype=object)

In [74]:
pd.get_dummies(df['process_method'])

Unnamed: 0,ground,moist,none
0,0,0,1
1,0,0,1
2,0,1,0
3,0,0,1
4,1,0,0
...,...,...,...
195,1,0,0
196,1,0,0
197,0,0,1
198,0,0,1


In [75]:
df = pd.concat([df, pd.get_dummies(df['process_method'])], axis=1)

In [76]:
df['process_method'].unique()

array(['none', 'moist', 'ground'], dtype=object)

In [77]:
feature_columns = get_features(df)
feature_columns

['862.13',
 '863.88',
 '865.63',
 '867.38',
 '869.13',
 '870.88',
 '872.63',
 '874.38',
 '876.13',
 '877.88',
 '879.63',
 '881.38',
 '883.13',
 '884.88',
 '886.63',
 '888.38',
 '890.13',
 '891.88',
 '893.63',
 '895.38',
 '897.13',
 '898.88',
 '900.63',
 '902.38',
 '904.13',
 '905.88',
 '907.63',
 '909.38',
 '911.13',
 '912.88',
 '914.63',
 '916.38',
 '918.13',
 '919.88',
 '921.63',
 '923.38',
 '925.13',
 '926.88',
 '928.63',
 '930.38',
 '932.13',
 '933.88',
 '935.63',
 '937.38',
 '939.13',
 '940.88',
 '942.63',
 '944.38',
 '946.13',
 '947.88',
 '949.63',
 '951.38',
 '953.13',
 '954.88',
 '956.63',
 '958.38',
 '960.13',
 '961.88',
 '963.63',
 '965.38',
 '967.13',
 '968.88',
 '970.63',
 '972.38',
 '974.13',
 '975.88',
 '977.63',
 '979.38',
 '981.13',
 '982.88',
 '984.63',
 '986.38',
 '988.13',
 '989.88',
 '991.63',
 '993.38',
 '995.13',
 '996.88',
 '998.63',
 '1000.38',
 '1002.13',
 '1003.88',
 '1005.63',
 '1007.38',
 '1009.13',
 '1010.88',
 '1012.63',
 '1014.38',
 '1016.13',
 '1017.88',

In [26]:
df['process_method'].isna()

0      False
1      False
2      False
3      False
4      False
       ...  
195     True
196     True
197     True
198     True
199     True
Name: process_method, Length: 400, dtype: bool

In [None]:
df['Moisture']