In [42]:
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import make_scorer

In [39]:
from spectroscopy.model import load_model, mean_absolute_percentage_error
from spectroscopy.utils import load_training_data, get_wavelength_columns

In [3]:
model = load_model()

In [5]:
type(model)

sklearn.pipeline.Pipeline

In [12]:
data = load_training_data()

In [13]:
data.head()

Unnamed: 0,862.13,863.88,865.63,867.38,869.13,870.88,872.63,874.38,876.13,877.88,...,1749.38,1751.13,1752.88,extra_info,filename,sample_name,sample_date,run_number,process_method,Ammonia-N
0,0.001,0.001,0.001,-205.52,-785.09,-407.91,44.607,46.916,46.875,45.709,...,40.831,40.827,40.896,File: Users\stell\Desktop\StellarDat\South 1 ...,South 1 - 7-8-20-20 #3.TRM,south 1,2020-07-08 20:00:00,3,,0.21
1,0.001,0.001,0.001,87.196,378.95,286.15,62.212,63.387,62.82,61.35,...,56.899,57.027,57.145,File: Users\stell\Desktop\StellarDat\South 15...,South 15 - 7-8-20-20 #1.TRM,south 15,2020-07-08 20:00:00,1,,0.42
2,0.001,0.001,0.001,-195.6,-751.71,-395.01,44.734,47.0,46.914,45.672,...,40.818,40.884,41.02,File: Users\stell\Desktop\StellarDat\South 1 ...,South 1 - 7-8-20-20 #2.TRM,south 1,2020-07-08 20:00:00,2,,0.21
3,0.001,0.001,0.001,90.071,391.93,295.6,62.392,63.609,63.036,61.592,...,57.171,57.359,57.487,File: Users\stell\Desktop\StellarDat\South 15...,South 15 - 7-8-20-20 #3.TRM,south 15,2020-07-08 20:00:00,3,,0.42
4,0.001,0.001,0.001,99.653,429.7,317.57,62.54,63.678,63.054,61.546,...,57.096,57.216,57.309,File: Users\stell\Desktop\StellarDat\South 15...,South 15 - 7-8-20-20 #2.TRM,south 15,2020-07-08 20:00:00,2,,0.42


In [25]:
feature_columns = get_wavelength_columns(data)
X, y = data[feature_columns], data['Ammonia-N']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [29]:
param_grid = {
    'model__max_depth':np.arange(10,100,10),
    'model__n_estimators':np.arange(10,100,10)
}

In [43]:
search = GridSearchCV(model, param_grid, scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False))

In [44]:
search.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('feature_selector',
                                        SelectFromModel(estimator=RandomForestRegressor(max_depth=20,
                                                                                        random_state=10))),
                                       ('model',
                                        RandomForestRegressor(max_depth=20,
                                                              random_state=10))]),
             param_grid={'model__max_depth': array([10, 20, 30, 40, 50, 60, 70, 80, 90]),
                         'model__n_estimators': array([10, 20, 30, 40, 50, 60, 70, 80, 90])},
             scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False))

In [33]:
search.best_score_

0.66268589558576

In [34]:
search.cv_results_

{'mean_fit_time': array([0.42223973, 0.41715631, 0.44965248, 0.4750061 , 0.5274116 ,
        0.52487011, 0.57913857, 0.61309299, 0.63265715, 0.45747724,
        0.45194049, 0.44615521, 0.48202562, 0.49969726, 0.53195615,
        0.52794032, 0.51884136, 0.5628695 , 0.42331753, 0.44543996,
        0.50579162, 0.53421335, 0.53599143, 0.53046031, 0.53222747,
        0.56266799, 0.55838175, 0.41126299, 0.43450103, 0.4371397 ,
        0.48516178, 0.48762059, 0.57541094, 0.62688437, 0.63275881,
        0.63600254, 0.44536886, 0.45798855, 0.45799332, 0.50636239,
        0.55761952, 0.52620525, 0.53571868, 0.58342052, 0.6398705 ,
        0.44238596, 0.46843257, 0.49684148, 0.54856758, 0.60222292,
        0.77039356, 0.619063  , 0.61187086, 0.56806703, 0.41390381,
        0.44711156, 0.48061113, 0.50588956, 0.57456284, 0.55283141,
        0.53540716, 0.58450289, 0.5907886 , 0.47974834, 0.50111351,
        0.50755863, 0.51157241, 0.54362106, 0.72465796, 0.9284059 ,
        0.58313503, 0.60430202,

In [None]:
# TODO: TRY PCA on wavelengths, try adding absorbance