# Ablation study of Tsec integration and Tsec feature

**We start by importing datasets generated when integrating to Tsec and when integrating to 1e4 orbits**

Data generated using rebound 4.3.2

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import hyperopt
import sys
sys.path.append('../spock/')
# Apply the custom 'paper' matplotlib style when it is available; otherwise
# fall back silently to the default style.
try:
    plt.style.use('paper')
except OSError:
    # matplotlib raises OSError when the requested style cannot be found.
    # Catching only that keeps unrelated errors (and KeyboardInterrupt) visible.
    pass
%matplotlib inline

In [2]:
# Directory holding the clean, pre-generated training CSVs
datapath = '../../cleanData/csvs/resonant/'

# Read each dataset and strip its junk columns in one chained expression.
# NOTE(review): the 'Unnamed: *' columns look like leftover saved index
# columns -- confirm against the script that wrote the CSVs.
oldData = (
    pd.read_csv(datapath + '1e4data.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
)
TsecData = (
    pd.read_csv(datapath + 'Tsecdata.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
)

Define helper functions

In [3]:
def train_test(dataset, features, trainFrac=0.8):
    '''Split ``dataset`` row-wise into training and testing sets.

    The first ``trainFrac`` of the rows (in their existing order, without
    shuffling) form the training set and the remaining rows form the testing
    set. Testing rows whose ``InitialStable`` value is False are removed;
    training rows are not filtered.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain every column named in ``features`` plus the ``Stable``
        and ``InitialStable`` columns.
    features : list of str
        Column names to use as model inputs.
    trainFrac : float, optional
        Fraction of rows assigned to the training set (default 0.8, i.e. an
        80% / 20% split).

    Returns
    -------
    tuple
        ``(trainX, trainY, testX, testY)`` where the X entries are the
        ``features`` columns and the Y entries are the ``Stable`` column.
    '''
    Nrows = int(trainFrac*dataset.shape[0])
    train = dataset.iloc[:Nrows, :]
    test = dataset.iloc[Nrows:, :]
    #remove testing systems that go unstable in 1e4 orbits
    # Filter with a positional boolean mask instead of dropping by index label:
    # dropping by label would also remove stable rows that happen to share a
    # label with an unstable one when the index is not unique.
    # `== False` is an element-wise comparison and is intentional here.
    unstable = (test['InitialStable'] == False).values
    test = test[~unstable]
    return train[features], train['Stable'], test[features], test['Stable']

We can create sub datasets to consider each combination of Tsec feature addition and integration

In [4]:
# Feature names for the 'near' and 'far' groups share the same stems and differ
# only in their suffix.
near, far = (
    [stem + side for stem in ('EMcross', 'EMfracstd', 'EPstd', 'MMRstrength')]
    for side in ('near', 'far')
)
megno = ['MEGNO', 'MEGNOstd']

# Baseline feature set, and the same set with 'Tsec' appended as a feature
oldFeat = [*near, *far, *megno]
TsecFeat = [*oldFeat, 'Tsec']

# 1e4-orbit dataset: baseline features, then baseline + Tsec
train1e4X, train1e4Y, test1e4X, test1e4Y = train_test(oldData, oldFeat)
train1e4TsecX, train1e4TsecY, test1e4TsecX, test1e4TsecY = train_test(oldData, TsecFeat)

# Tsec-integration dataset: baseline features, then baseline + Tsec
trainTintX, trainTintY, testTintX, testTintY = train_test(TsecData, oldFeat)
trainTsecX, trainTsecY, testTsecX, testTsecY = train_test(TsecData, TsecFeat)

We can calculate the training balance to account for frequency bias with each dataset

We can define some helper functions

In [5]:
from sklearn import metrics
from sklearn.metrics import roc_curve, confusion_matrix, auc
def ROC_curve( model, x,y):
    '''Compute ROC statistics for ``model`` evaluated on ``(x, y)``.

    The scores are taken from column 1 of ``model.predict_proba(x)``.

    Returns
    -------
    tuple
        ``(roc_auc, fpr, tpr, thresholds)``: the ROC AUC score followed by the
        three arrays returned by ``roc_curve``.
    '''
    scores = model.predict_proba(x)[:, 1]
    area = metrics.roc_auc_score(y, scores)
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(y, scores)
    return area, false_pos_rate, true_pos_rate, cutoffs

In [6]:
def getPreformance(trainX, trainY, testX, testY, label, tprThreshold=0.9):
    '''Train an XGBoost classifier and score it on the test set.

    Parameters
    ----------
    trainX, trainY : training features and labels passed to ``model.fit``.
    testX, testY : testing features and labels used to build the ROC curve.
    label : str
        Description of the model, returned unchanged as the first result.
    tprThreshold : float, optional
        True positive rate at which the false positive rate is reported
        (default 0.9). Must lie in (0, 1].

    Returns
    -------
    tuple
        ``(label, roc_auc, falsePos)`` where ``falsePos`` is the FPR at the
        first ROC point whose TPR is at least ``tprThreshold``.

    Raises
    ------
    ValueError
        If ``tprThreshold`` is outside (0, 1].
    '''
    if not 0 < tprThreshold <= 1:
        raise ValueError('tprThreshold must be in (0, 1], got {}'.format(tprThreshold))

    model = XGBClassifier(learning_rate = 0.05, 
                         max_depth = 13, 
                         subsample = 0.95,
                         min_child_weight = 5,
                         n_estimators = 100,
                         )
    model.fit(trainX,trainY)

    roc_auc, fpr, tpr, _ = ROC_curve(model, testX, testY)

    # Index of the first ROC point whose TPR reaches the requested threshold
    tprthreshindex = np.flatnonzero(tpr >= tprThreshold)[0]
    falsePos = fpr[tprthreshindex]

    return label, roc_auc, falsePos

**We can now collect the AUC of a model using each permutation of Tsec feature and integration.**
Note: FPR (false positive rate) is evaluated at the first point on the ROC curve where TPR (true positive rate) is at least 0.9

In [7]:
# One (label, AUC, FPR) row per combination of dataset and feature set.
# dtype=object keeps the AUC/FPR entries numeric: without it NumPy coerces the
# mixed str/float rows into a single string dtype, so later sorting by AUC
# would compare text rather than numbers.
results = np.array([
    getPreformance(train1e4X, train1e4Y, test1e4X, test1e4Y, 'Int to 1e4 with old features'),
    getPreformance(train1e4TsecX, train1e4TsecY, test1e4TsecX, test1e4TsecY, 'Int to 1e4 with Tsec as feature'),
    getPreformance(trainTintX, trainTintY, testTintX, testTintY, 'Int to Tsec with old features'),
    getPreformance(trainTsecX, trainTsecY, testTsecX, testTsecY, 'Int to Tsec with Tsec as feature'),
], dtype=object)

In [8]:
# `results` is a NumPy array mixing a text label with two metrics, so the
# metric columns may arrive as strings or generic objects. Cast them to float
# so that sorting and any arithmetic on AUC/FPR are numeric, not lexicographic.
displayResults = (
    pd.DataFrame(results, columns = ['Model comparisons', 'AUC', 'FPR'])
    .astype({'AUC': float, 'FPR': float})
)

# Display sorted model comparisons

In [9]:
# Show the comparison table ordered from lowest to highest AUC; sort_values
# returns a sorted copy, so displayResults itself is left unchanged.
displayResults.sort_values('AUC')

Unnamed: 0,Model comparisons,AUC,FPR
2,Int to Tsec with old features,0.941732249925601,0.1660059809444328
0,Int to 1e4 with old features,0.9429655885741256,0.1604654403567447
1,Int to 1e4 with Tsec as feature,0.9442496015568056,0.1594899665551839
3,Int to Tsec with Tsec as feature,0.9502022027554688,0.1431253911954934
