## Imports 
Note: this notebook requires Python 3. Install the dependencies listed in `requirements.txt` in the main directory before running it.

In [1]:
import sys
import glob
import fnmatch
import os.path
import pandas as pd
# import API one directory above
sys.path.append(os.path.abspath(os.path.join(os.path.dirname("__file__"),os.path.pardir)))
from NatVsTech import NatVsTech
from sklearn.model_selection import GridSearchCV

## Directory structure
Note: all paths are built with `os.path` rather than hard-coded separators, so they work on any OS (macOS, Windows, Linux).
Note: drop the "" around "__file__" when in a regular python file. 

In [2]:
# Directory the notebook treats as "here". Because "__file__" is a quoted
# string, dirname() returns '' and every path below is resolved against the
# current working directory at the moment this cell runs.
_NOTEBOOK_DIR = os.path.dirname("__file__")


def _abs_from(base, *parts):
    """Join `parts` onto `base` and return the result as an absolute path."""
    return os.path.abspath(os.path.join(base, *parts))


# one level above the notebook directory
PARENT_PATH = _abs_from(_NOTEBOOK_DIR, os.path.pardir)

# databases folder and its training / test sub-folders
DATABASES_BASEPATH = _abs_from(_NOTEBOOK_DIR, 'databases')
IMPORT_TRAINING_DATABASE_PATH = _abs_from(DATABASES_BASEPATH, 'training_data')
IMPORT_TESTING_DATABASE_PATH = _abs_from(DATABASES_BASEPATH, 'test_data')

# folder that receives the summary output
OUTPUT_DATA_SUMMARY_PATH = _abs_from(_NOTEBOOK_DIR, 'output')

# print the paths, just to make sure things make sense
for _checked_path in (PARENT_PATH,
                      DATABASES_BASEPATH,
                      IMPORT_TRAINING_DATABASE_PATH,
                      OUTPUT_DATA_SUMMARY_PATH):
    print(_checked_path)



/Users/tron/Praetorius_Goldberg_2016
/Users/tron/Praetorius_Goldberg_2016/examples/databases
/Users/tron/Praetorius_Goldberg_2016/examples/databases/training_data
/Users/tron/Praetorius_Goldberg_2016/examples/output


## Training files
Import training files, combine, and concatenate into dataframes. 
Note: if you re-run the notebook without restarting the kernel, you'll get an error. Restart the notebook kernel and it will work.

In [3]:
# set the natural and technical database training file names
NATURAL_TRAINING_DATABASE_NAME_ = 'natural_training_data.csv'
TECHNICAL_TRAINING_DATABASE_NAME_ = 'technical_training_data.csv'

# List the csv files in the training directory by absolute path instead of
# os.chdir()-ing into it. Changing the working directory is a process-wide
# side effect: it makes every cwd-relative path in the notebook resolve
# differently if an earlier cell is run again in the same kernel.
training_files = sorted(
    os.path.basename(csv_path)
    for csv_path in glob.glob(
        os.path.join(glob.escape(IMPORT_TRAINING_DATABASE_PATH), '*.csv')))
print(training_files)


def _load_training_csv(file_name, classification_id):
    """Read one training csv and tag every row with its classification id.

    Parameters
    ----------
    file_name : str
        Name of the csv file inside IMPORT_TRAINING_DATABASE_PATH.
    classification_id : int
        Value written to the new 'classification' column.

    Returns
    -------
    pandas.DataFrame
        The file contents plus the 'classification' column.

    Raises
    ------
    FileNotFoundError
        If the file does not exist. Failing here is deliberate: it avoids a
        later NameError, or silently reusing a stale frame from a previous run.
    """
    # pd.read_csv replaces pd.DataFrame.from_csv, which was deprecated and then
    # removed from pandas; header/index_col are the same arguments as before.
    frame = pd.read_csv(
        os.path.join(IMPORT_TRAINING_DATABASE_PATH, file_name),
        header=0, index_col=None)
    frame['classification'] = classification_id
    return frame


# assign classification ids: technical = 0, natural = 1
technical_training_database = _load_training_csv(
    TECHNICAL_TRAINING_DATABASE_NAME_, classification_id=0)
natural_training_database = _load_training_csv(
    NATURAL_TRAINING_DATABASE_NAME_, classification_id=1)

# concatenate all the data into a single frame and remove all the na values
# (other filtering done later); built in one expression so re-running the
# cell always starts from the freshly loaded frames
training_data = pd.concat([natural_training_database,
                           technical_training_database]).dropna()

['natural_training_data.csv', 'technical_training_data.csv']


## Using the API
Before you can use the API, you have to initialize the class. We'll then work through how the data is easily filtered, stored, and used for training and prediction. 

In [4]:
# Instantiate the NatVsTech API object that the remaining cells call into.
nat_v_tech = NatVsTech()

# Show the object's repr as a quick check that construction succeeded.
print(nat_v_tech)
        

<NatVsTech.NatVsTech object at 0x10bab5400>


In [5]:
# Step 1: filter the data of negative values.
neg_filt_training_data = nat_v_tech.filter_negative(data=training_data)

# Step 2: threshold the filtered data with a single isotope trigger
# (threshold value 5).
threshold_kwargs = {
    'data': neg_filt_training_data,
    'threshold_value': 5,
}
thresh_neg_filt_training_data = nat_v_tech.apply_detection_threshold(
    **threshold_kwargs)

# Preview the first rows to make sure we're on target.
print(thresh_neg_filt_training_data.head())

     25Mg    55Mn    59Co    60Ni   65Cu    66Zn    88Sr    90Zr    93Nb  \
0  2.6333  0.0000  0.0000  0.7809  0.000  1.6047  6.2237  0.0000  0.0000   
1  1.5857  1.3047  0.0000  0.7762  0.000  0.0000  1.2381  0.0000  1.1667   
2  0.0000  0.0000  0.0000  0.6381  1.719  0.0000  0.0000  1.4714  0.0000   
3  5.3095  1.2286  0.8476  0.0000  0.000  0.0000  2.6285  0.0000  0.0000   
4  0.0000  1.2428  0.0000  0.0000  0.000  0.6095  2.2666  0.0000  0.0000   

     95Mo       ...         149Sm   153Eu   157Gd  159Tb    182W   206Pb  \
0  0.0000       ...        0.0000  0.0000  0.0000    0.0  0.7095  1.0524   
1  1.3143       ...        0.0000  0.1667  0.0000    0.0  0.9571  0.5333   
2  1.7619       ...        0.0000  0.0000  0.3476    0.0  0.1095  0.4333   
3  0.0000       ...        1.3952  0.0000  0.0000    0.0  0.4524  1.3809   
4  0.0000       ...        0.0000  1.0000  0.0000    0.0  0.0000  1.0714   

    208Pb   232Th    238U  classification  
0  2.3904  1.0809  0.0000               1 

In [6]:
# The thresholded frame still carries the classification data; have the API
# separate it into a training frame and a target frame.
split_result = nat_v_tech.split_target_from_training_data(
    df=thresh_neg_filt_training_data)
training_df, target_df = split_result

# Print the head of the training data, then of the target data, to check
# their structure.
for frame in (training_df, target_df):
    print(frame.head())

     25Mg    55Mn    59Co    60Ni   65Cu    66Zn    88Sr    90Zr    93Nb  \
0  2.6333  0.0000  0.0000  0.7809  0.000  1.6047  6.2237  0.0000  0.0000   
1  1.5857  1.3047  0.0000  0.7762  0.000  0.0000  1.2381  0.0000  1.1667   
2  0.0000  0.0000  0.0000  0.6381  1.719  0.0000  0.0000  1.4714  0.0000   
3  5.3095  1.2286  0.8476  0.0000  0.000  0.0000  2.6285  0.0000  0.0000   
4  0.0000  1.2428  0.0000  0.0000  0.000  0.6095  2.2666  0.0000  0.0000   

     95Mo   ...     147Sm   149Sm   153Eu   157Gd  159Tb    182W   206Pb  \
0  0.0000   ...    0.0000  0.0000  0.0000  0.0000    0.0  0.7095  1.0524   
1  1.3143   ...    1.0667  0.0000  0.1667  0.0000    0.0  0.9571  0.5333   
2  1.7619   ...    0.0000  0.0000  0.0000  0.3476    0.0  0.1095  0.4333   
3  0.0000   ...    0.0000  1.3952  0.0000  0.0000    0.0  0.4524  1.3809   
4  0.0000   ...    0.0000  0.0000  1.0000  0.0000    0.0  0.0000  1.0714   

    208Pb   232Th    238U  
0  2.3904  1.0809  0.0000  
1  1.9476  0.0000  0.0095  
2 

In [8]:
# conform the test data for ML and store it as X and y.
# NOTE(review): this call is left disabled, exactly as in the original cell,
# even though `X` and `y` are passed to apply_trained_classification later in
# the notebook.
# (X, y) = nat_v_tech.conform_data_for_ML(training_df=training_df, target_df=target_df)

# Base gradient-boosting parameters, used to determine max estimators with
# the least overfitting.
GBC_INIT_PARAMS = {
    'loss': 'deviance',
    'learning_rate': 0.1,
    'min_samples_leaf': 100,
    'n_estimators': 1000,
    'max_depth': 5,
    'random_state': None,
    'max_features': 'sqrt',
}

# print to verify parameter init structure
print(GBC_INIT_PARAMS)

# Grid of candidate values handed to the feature-identification step below.
GBC_GRID_SEARCH_PARAMS = {
    'loss': ['exponential', 'deviance'],
    'learning_rate': [0.01, 0.1],
    'min_samples_leaf': [50, 100],
    'random_state': [None],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [5],
    'n_estimators': [100],
}

print(GBC_GRID_SEARCH_PARAMS)

# Determine the optimum feature selection with rfecv.
# NOTE(review): the recorded output of this cell ends in
# "ValueError: not enough values to unpack (expected 6, got 0)" raised during
# this call -- confirm the arguments against the NatVsTech API.
rfecv_kwargs = {
    'training_df': training_df,
    'target_df': target_df,
    'cv_grid_search': GBC_GRID_SEARCH_PARAMS,
    'gbc_init_params': GBC_INIT_PARAMS,
    'n_splits': 3,
}
nat_v_tech.rfecv_feature_identify(**rfecv_kwargs)



{'max_features': 'sqrt', 'learning_rate': 0.1, 'min_samples_leaf': 100, 'n_estimators': 1000, 'loss': 'deviance', 'random_state': None, 'max_depth': 5}
{'learning_rate': [0.01, 0.1], 'min_samples_leaf': [50, 100], 'max_features': ['sqrt', 'log2'], 'loss': ['exponential', 'deviance'], 'n_estimators': [100], 'random_state': [None], 'max_depth': [5]}
GradientBoostingClassifierrWithCoef(criterion='friedman_mse', init=None,
                  learning_rate=0.1, loss='deviance', max_depth=5,
                  max_features='sqrt', max_leaf_nodes=None,
                  min_impurity_split=1e-07, min_samples_leaf=100,
                  min_samples_split=2, min_weight_fraction_leaf=0.0,
                  n_estimators=1000, presort='auto', random_state=None,
                  subsample=1.0, verbose=0, warm_start=False)


ValueError: not enough values to unpack (expected 6, got 0)

{'random_state': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_features': 'sqrt', 'n_estimators': 1000, 'min_samples_leaf': 100, 'max_depth': 5}


In [8]:
# Run the API's boosting-stage search on the base parameters; the optimum
# number of stages is taken from index 1 of what it returns.
boosting_stage_search = nat_v_tech.find_min_boosting_stages(
    gbc_base_params=GBC_INIT_PARAMS,
    training_df=training_df,
    target_df=target_df)
optimum_boosting_stages = boosting_stage_search[1]

# Show the selected number of boosting stages.
print(optimum_boosting_stages)

410


In [9]:
# Grid of candidate values in which to find the optimum parameter set.
# n_estimators is pinned to the optimum boosting stages found previously.
GBC_GRID_SEARCH_PARAMS = dict(
    loss=['exponential', 'deviance'],
    learning_rate=[0.01, 0.1],
    min_samples_leaf=[50, 100],
    random_state=[None],
    max_features=['sqrt', 'log2'],
    max_depth=[5],
    n_estimators=[optimum_boosting_stages],
)

# Print the search grid to verify its structure.
print(GBC_GRID_SEARCH_PARAMS)

{'random_state': [None], 'learning_rate': [0.01, 0.1], 'loss': ['exponential', 'deviance'], 'max_features': ['sqrt', 'log2'], 'n_estimators': [410], 'min_samples_leaf': [50, 100], 'max_depth': [5]}


In [10]:
# Ask the API for the optimum gbc parameters over the search grid, using
# 5 crossfolds.
optimum_search_kwargs = {
    'crossfolds': 5,
    'training_df': training_df,
    'target_df': target_df,
    'gbc_search_params': GBC_GRID_SEARCH_PARAMS,
}
gbc_fitted = nat_v_tech.find_optimum_gbc_parameters(**optimum_search_kwargs)

# Print the optimum gbc structure.
print(gbc_fitted)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=5,
              max_features='log2', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=50,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=410, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)


In [12]:


# Conform the training data for ML and store it as X and y. These names were
# only ever defined by a commented-out line in an earlier cell, so on a fresh
# kernel the call below failed with NameError; define them here so this cell
# is self-contained.
# NOTE(review): the call form is taken from that commented-out line -- confirm
# it against the current NatVsTech API.
(X, y) = nat_v_tech.conform_data_for_ML(training_df=training_df,
                                        target_df=target_df)

# use the X and y data to train the model. Then test the trained model against
# the test data and output results.
nat_v_tech.apply_trained_classification(
    test_data_path=IMPORT_TESTING_DATABASE_PATH,
    output_summary_data_path=OUTPUT_DATA_SUMMARY_PATH,
    output_summary_base_name='summary.csv',
    track_class_probabilities=[0.1, 0.1],
    isotope_trigger='140Ce',
    gbc_fitted=gbc_fitted,
    X=X, y=y)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 1