## Imports 
Note: python3. Please install requirements using requirments.txt in main directory. 

In [7]:
import sys
import glob
import fnmatch
import os.path
import pandas as pd
# import API one directory above
sys.path.append(os.path.abspath(os.path.join(os.path.dirname("__file__"),os.path.pardir)))
from distinguish_nat_vs_tech import distinguish_nat_vs_tech
from sklearn.model_selection import GridSearchCV

## Directory structure
Note: we generically define directory so it will work on any OS: mac/pc/linux.

In [8]:
DATABASES_BASEPATH = os.path.join(os.path.dirname("__file__"))
NATURAL_TRAINING_DATABASE_NAME_ = 'natural_training_data.csv'
TECHNICAL_TRAINING_DATABASE_NAME_ = 'technical_training_data.csv'
IMPORT_TRAINING_DATABASE_PATH = os.path.join(DATABASES_BASEPATH,'training_data')
IMPORT_TESTING_DATABASE_PATH = os.path.join(DATABASES_BASEPATH,'test_data')
OUTPUT_DATA_SUMMARY_PATH = os.path.join(os.path.dirname("__file__"), 'output')

## Import training files, combine, and concatenate into dataframes.

In [9]:
os.chdir(IMPORT_TRAINING_DATABASE_PATH)

# find all csv's in directory
training_files = glob.glob('*.csv')

# iterate through files and assign classification id
for file in training_files:
    if fnmatch.fnmatchcase(file, TECHNICAL_TRAINING_DATABASE_NAME_):
        technical_training_database = pd.DataFrame.from_csv(
            os.path.join(file),header=0, index_col=None)
        
        # assign classification id
        technical_training_database['Classification'] = 0
        
    elif fnmatch.fnmatchcase(file, NATURAL_TRAINING_DATABASE_NAME_):
        natural_training_database = pd.DataFrame.from_csv(
            os.path.join(file), header=0, index_col=None)
        
        # assign classification id
        natural_training_database['Classification'] = 1

# concatenate all the data into a single file
training_data = pd.concat([natural_training_database, technical_training_database])

# remoove all the na values (other filtering done later)
training_data = training_data.dropna()

FileNotFoundError: [Errno 2] No such file or directory: 'training_data'

In [18]:
ISOTOPE_LIST_ = ['107Ag', '109Ag', '139La', '140Ce', '141Pr', '143Nd',
                 '146Nd', '147Sm', '149Sm', '153Eu', '157Gd', '159Tb',
                 '182W', '206Pb', '208Pb', '232Th', '238U', '25Mg',
                 '55Mn', '59Co', '60Ni', '65Cu', '66Zn', '88Sr',
                 '90Zr', '93Nb', '95Mo']

# initialize class
nat_v_tech = distinguish_nat_vs_tech()

# initialize all isotopes
nat_v_tech.isotope_list = ISOTOPE_LIST_

# filter negative data
nat_v_tech.filter_negative(training_data = training_data)

# apply detection threshold
nat_v_tech.apply_detection_threshold(training_data = nat_v_tech.training_data,
                                     threshold_value= 5)

# split target data from training data 
nat_v_tech.split_target_from_training_data(training_data=nat_v_tech.training_data)

# prepare data for ML
nat_v_tech.prepare_data_for_ML(training_data = nat_v_tech.training_data,
                               target_data=nat_v_tech.target_data)

# initialize gbc to determine max estimators with least overfitting
GBC_INIT_PARAMS = {'loss': 'deviance', 'learning_rate': 0.1,
                   'min_samples_leaf': 100, 'n_estimators': 1000,
                   'max_depth': 5, 'random_state': None, 'max_features': 'sqrt'}

nat_v_tech.find_min_boosting_stages(gbc_init_params=GBC_INIT_PARAMS)

print (nat_v_tech.optimum_boosting_stages)
        

417


In [None]:
ISOTOPE_LIST_ = ['107Ag', '109Ag', '139La', '140Ce', '141Pr', '143Nd',
                 '146Nd', '147Sm', '149Sm', '153Eu', '157Gd', '159Tb',
                 '182W', '206Pb', '208Pb', '232Th', '238U', '25Mg',
                 '55Mn', '59Co', '60Ni', '65Cu', '66Zn', '88Sr',
                 '90Zr', '93Nb', '95Mo']

CRITICAL_ISOTOPE_LIST_ = ['140Ce', '139La',
                          '88Sr']

GBC_GRID_SEARCH_PARAMS = {'loss': ['exponential', 'deviance'],
                          'learning_rate': [0.01, 0.1],
                          'min_samples_leaf': [50, 100],
                          'random_state': [None],
                          'max_features': ['sqrt', 'log2'],
                          'max_depth': [5]}  # note n_estimators automatically set

