In [269]:
import os
# Resolve the home directory portably: the HOME environment variable may be unset
# (e.g. on Windows), in which case os.getenv("HOME") returns None and the later
# path concatenation fails with a TypeError.
homedir = os.path.expanduser("~")

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.tree import DecisionTreeClassifier

from pandas import read_csv
from astropy.table import Table
import numpy as np

In [270]:
# Read the parameter CSV from <home>/Desktop/galfit_files. The path is assembled with
# os.path.join so the separator is not hard-coded.
w3_params = read_csv(
    os.path.join(homedir, 'Desktop', 'galfit_files', 'vf_v2_galfit_W3-fixBA-ML.csv')
)

In [282]:
# Work on the raw 2-D array behind the table.
dat = w3_params.values
# Columns 1..17 are the features; column 18 holds the "classes"
# (i.e., whether or not there is a numerical error).
x, y = dat[:, 1:18], dat[:, 18]

In [292]:
# Cut the data into two folds at the midpoint, keeping the rows in file order
# (when the row count is odd, the second fold gets the extra row).
# The labels are cast to bool before splitting.
# NOTE(review): a shuffled sklearn train_test_split(test_size=0.50, random_state=42)
# was tried here previously; the ordered split is presumably kept so that fold
# predictions can be concatenated back into row order -- confirm.
xfold1, xfold2 = np.split(x, [len(x) // 2])
yfold1, yfold2 = np.split(y.astype('bool'), [len(x) // 2])

In [320]:
def train_data_dectree(xfold1,xfold2,yfold1,yfold2):
    """Two-fold cross-validate a decision tree and print the pooled results.

    A tree is trained on fold 1 and used to predict fold 2, then a second tree
    is trained on fold 2 and used to predict fold 1. The out-of-fold
    predictions are concatenated in fold-1-then-fold-2 order, so they line up
    element-for-element with ``np.concatenate([yfold1, yfold2])``.

    Caveats of decision trees: they tend to overfit; they are unstable (small
    variations in the data might result in a completely different tree); and
    the training algorithms do not guarantee a globally optimal tree.

    Parameters
    ----------
    xfold1, xfold2 : array-like
        Feature rows of the first and second fold.
    yfold1, yfold2 : array-like
        Class labels matching ``xfold1`` and ``xfold2``.

    Returns
    -------
    actual_dec : numpy.ndarray
        True labels, fold 1 followed by fold 2.
    pred_dec : numpy.ndarray
        Out-of-fold predictions in the same order as ``actual_dec``.
    """
    # One estimator per direction; random_state is fixed so the fits are reproducible.
    pred_on_fold2 = DecisionTreeClassifier(random_state=42).fit(xfold1, yfold1).predict(xfold2)
    pred_on_fold1 = DecisionTreeClassifier(random_state=42).fit(xfold2, yfold2).predict(xfold1)

    # Pool in fold-1-then-fold-2 order so labels and predictions stay aligned.
    actual_dec = np.concatenate([yfold1, yfold2])
    pred_dec = np.concatenate([pred_on_fold1, pred_on_fold2])

    accuracy = accuracy_score(actual_dec, pred_dec)
    print('Decision Tree')
    # Format the percentage directly: rounding first and then multiplying by 100
    # can print float noise such as 56.99999999999999.
    print(f'Overall Accuracy: {accuracy * 100:.1f}%')
    print('Confusion Matrix: ')
    print(confusion_matrix(actual_dec, pred_dec))
    return actual_dec, pred_dec

In [321]:
# Run the two-fold evaluation and keep the pooled labels and predictions.
actual_dec, pred_dec = train_data_dectree(
    xfold1=xfold1,
    xfold2=xfold2,
    yfold1=yfold1,
    yfold2=yfold2,
)

Decision Tree
Overall Accuracy: 99.6%
Confusion Matrix: 
[[871   2]
 [  2 120]]


In [323]:
# False negatives: labelled True but predicted False
w3_params.loc[actual_dec & ~pred_dec, 'VFID']

7      VFID0020
323    VFID1520
Name: VFID, dtype: object

In [331]:
# Show the full row for this VFID
w3_params.loc[w3_params['VFID'].eq('VFID0020')]

Unnamed: 0,VFID,CXC,CXC_ERR,CYC,CYC_ERR,CMAG,CMAG_ERR,CRE,CRE_ERR,CN,CN_ERR,CAR,CAR_ERR,CPA,CPA_ERR,CSKY,CSKY_ERR,CCHI2NU,CNumerical_Error
7,VFID0020,-47.3049,3030021000000.0,-33.8367,7884976000000.0,27.8689,0.0,608.9214,0.0,2.0422,0.0,0.49,0.0,-9.6,0.0,1512.657,360.1975,0.0,True


In [332]:
# Show the full row for this VFID
w3_params.loc[w3_params['VFID'].eq('VFID1520')]

Unnamed: 0,VFID,CXC,CXC_ERR,CYC,CYC_ERR,CMAG,CMAG_ERR,CRE,CRE_ERR,CN,CN_ERR,CAR,CAR_ERR,CPA,CPA_ERR,CSKY,CSKY_ERR,CCHI2NU,CNumerical_Error
323,VFID1520,35.4102,0.2408,35.7984,0.306,9.931,0.0681,6.7641,6.3815,0.0495,0.1563,0.31,0.0,-38.61,0.0,12.141,4.6223,0.0,True


In [324]:
# False positives: predicted True but labelled False
w3_params.loc[pred_dec & ~actual_dec, 'VFID']

294    VFID1386
750    VFID4407
Name: VFID, dtype: object

In [330]:
# Show the full row for this VFID
w3_params.loc[w3_params['VFID'].eq('VFID1386')]

Unnamed: 0,VFID,CXC,CXC_ERR,CYC,CYC_ERR,CMAG,CMAG_ERR,CRE,CRE_ERR,CN,CN_ERR,CAR,CAR_ERR,CPA,CPA_ERR,CSKY,CSKY_ERR,CCHI2NU,CNumerical_Error
294,VFID1386,19.6609,0.1262,20.0331,0.1354,9.1157,1.0897,6.7486,16.7843,8.4831,10.2725,0.79,0.0,-65.72,0.0,5.9656,27.5554,0.0,False


In [329]:
# Show the full row for this VFID
w3_params.loc[w3_params['VFID'].eq('VFID4407')]

Unnamed: 0,VFID,CXC,CXC_ERR,CYC,CYC_ERR,CMAG,CMAG_ERR,CRE,CRE_ERR,CN,CN_ERR,CAR,CAR_ERR,CPA,CPA_ERR,CSKY,CSKY_ERR,CCHI2NU,CNumerical_Error
750,VFID4407,28.6593,18.0815,34.6691,64.1675,9.6956,5.61,6.5033,55.0734,0.8547,8.8682,0.01,0.0,0.0,0.0,1.2048,14.8712,0.0,False
