In [1]:
# Relative directories: where result files are written, and where input pickles are read from
RESULTS_PATH = '../../results/'
DATA_PATH = '../../data/CRTS2/'

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Seed NumPy's global random generator so later np.random draws are repeatable
np.random.seed(seed=42)

Load Transient Catalog

In [4]:
# Locate the transient catalogue pickle inside the data directory
indir = DATA_PATH
filename = 'transient_catalog.pickle'
filepath = indir + filename
# Read it and rename the columns so they match the light-curve naming ('ID', 'class')
df_cat = pd.read_pickle(filepath).rename(
    columns={'TransientID': 'ID', 'Classification': 'class'}
)
# Number of distinct transient IDs in the catalogue
print(df_cat['ID'].unique().shape)

(5539,)


Load Transient Features

In [5]:
# Selectors substituted into the feature-file names loaded below:
# number of features per object, and the minimum-observations setting
num_features = 27
min_obs = 5

In [6]:
# Load transient features
filename = 'transient_features_{}obs_{}feats.pickle'.format(min_obs, num_features)
indir = DATA_PATH
inpath = indir + filename
df_feat_tran = pd.read_pickle(inpath)
# Shape of the loaded table, shown as the cell output
df_feat_tran.shape

(4384, 28)

Load Non-Transient Features

In [7]:
# Load non-transient features (same file-name scheme as the transient features)
filename = 'permanent_features_{}obs_{}feats.pickle'.format(min_obs, num_features)
indir = DATA_PATH
inpath = indir + filename
df_feat_perm = pd.read_pickle(inpath)
# Shape of the loaded table, shown as the cell output
df_feat_perm.shape

(4384, 28)

Build the labelled dataset (Transients + sampled Non-Transients)

In [8]:
# Add class label to transient objects.
# NOTE(review): no `on=` is given, so the join uses every column the two frames
# share (presumably only 'ID') -- confirm.
df_feat_tran = df_feat_tran.merge(df_cat, how='inner')
# Keep the selected classes and relabel every other class as 'Other'
top_classes = ['SN', 'CV', 'AGN', 'HPM', 'Blazar', 'Flare']
df_feat_tran['class'] = df_feat_tran['class'].where(
    df_feat_tran['class'].isin(top_classes), 'Other'
)
# Add class to non-transient features
df_feat_perm['class'] = 'Non-Transient'
# Sample as many non-transient IDs as there are objects in the largest transient class.
# The draw uses NumPy's global generator, so it depends on the seed set earlier.
big_class_size = df_feat_tran.groupby('class')['ID'].count().max()
IDs = np.random.choice(df_feat_perm.ID.unique(), size=big_class_size, replace=False)
df_feat_perm = df_feat_perm[df_feat_perm.ID.isin(IDs)]
# Stack transient and non-transient rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([df_feat_tran, df_feat_perm], ignore_index=True)
# Remove IDs so that only the features and the 'class' label remain
df = df.drop(['ID'], axis=1)

Create Transient inputs and outputs

In [9]:
# Obtain X (feature matrix) and y (class labels) as NumPy arrays.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
X = df.drop(['class'], axis=1).values
y = df['class'].values

In [10]:
# Count number of objects per class: map each distinct label to its frequency
class_labels, class_counts = np.unique(y, return_counts=True)
dict(zip(class_labels, class_counts))

{'AGN': 427,
 'Blazar': 237,
 'CV': 862,
 'Flare': 207,
 'HPM': 412,
 'Non-Transient': 1295,
 'Other': 944,
 'SN': 1295}

In [11]:
# Count total number of objects by summing the per-class counts
np.unique(y, return_counts=True)[1].sum()

5679

Split in Test & Train Sets

In [12]:
# Hold out 33% of the objects for testing; stratify on y so both splits keep the
# class proportions, and fix random_state so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42,
                                                    stratify=y)

In [13]:
# Training split: (feature-matrix shape, label-vector shape)
(X_train.shape, y_train.shape)

((3804, 27), (3804,))

In [14]:
# Test split: (feature-matrix shape, label-vector shape)
(X_test.shape, y_test.shape)

((1875, 27), (1875,))

Scale features

In [15]:
# Fit the scaler on the training split only, then apply that same scaling to the
# test split so no test-set statistics enter the fit
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Define classification routine

In [16]:
def train_validate_test_model(model, params, X_train, y_train, X_test, y_test):
    """Tune `model` by grid search, evaluate it on the test split and save a report.

    The grid `params` is searched with 2-fold stratified cross-validation on the
    training split. The best estimator then predicts the test split, and the
    text report built by `results_string` is printed and written to
    ``RESULTS_PATH + 'binary_<min_obs>obs_<num_features>feat_<ModelClass>.txt'``.

    Relies on the notebook-level globals `RESULTS_PATH`, `min_obs` and
    `num_features` to build the output file name.

    Parameters
    ----------
    model : estimator instance handed to GridSearchCV.
    params : dict, hyper-parameter grid to search.
    X_train, y_train : training inputs and labels used for the grid search.
    X_test, y_test : held-out inputs and labels used for the final report.

    Returns
    -------
    The fitted best estimator found by the grid search.
    """
    # Precision decimal places
    digits = 4
    # Train & cross-validate over the grid that was passed in
    # (not the notebook-level `tuned_parameters`).
    grid_search = GridSearchCV(model, params, cv=StratifiedKFold(2))
    grid_search.fit(X_train, y_train)
    # With the default refit=True, GridSearchCV has already retrained the best
    # configuration on all of the training data, so no additional fit is needed.
    clf = grid_search.best_estimator_
    # Predict test inputs with the refit model
    y_pred = clf.predict(X_test)
    # Create results using real and predicted labels of test data
    results_str = results_string(y_test, y_pred, grid_search, digits=digits)
    print(results_str)
    # Save results
    # NOTE(review): the file-name prefix is hard-coded to 'binary' although this
    # notebook trains on eight classes -- confirm the intended output name.
    task = 'binary'
    model_name = type(model).__name__
    filename = '{}_{}obs_{}feat_{}'.format(task, min_obs, num_features, model_name)
    with open(RESULTS_PATH + filename + '.txt', 'w') as f:
        f.write(results_str)

    return clf

def results_string(y_true, y_pred, grid_search, digits):
    """Build the plain-text report for one fitted grid search.

    Parameters
    ----------
    y_true : true labels of the evaluated samples.
    y_pred : predicted labels for the same samples.
    grid_search : fitted search object; its `best_params_` and `best_score_`
        attributes are read.
    digits : number of decimal places used for the accuracies and passed on to
        `classification_report`.

    Returns
    -------
    str with the best parameters, the validation accuracy, the test accuracy
    and the per-class classification report.
    """
    float_param = '{0:.' + str(digits) + 'f}'
    # Score against the labels passed in as `y_true`, not the notebook-level `y_test`
    test_accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred, digits=digits)
    sections = [
        'Best Params: {}\n'.format(grid_search.best_params_),
        ('Validation Accuracy: ' + float_param + '\n').format(grid_search.best_score_),
        ('Test Accuracy: ' + float_param + '\n').format(test_accuracy),
        'Report:\n {}'.format(report),
    ]
    return ''.join(sections)

Classify using SVC

In [17]:
# SVC search grid: one kernel, five gamma values and four C values (20 combinations)
tuned_parameters = dict(
    kernel=['rbf'],
    gamma=[0.1, 0.01, 0.001, 0.0001, 0.00001],
    C=[1, 10, 100, 1000]
)

In [18]:
# SVC with balanced class weights; grid-search it, report on the test split and save the report
model = SVC(class_weight='balanced', random_state=0)
clf1 = train_validate_test_model(
    model, tuned_parameters,
    X_train, y_train,
    X_test, y_test
)

Best Params: {'kernel': 'rbf', 'gamma': 0.01, 'C': 1000}
Validation Accuracy: 0.5187
Test Accuracy: 0.5477
Report:
                precision    recall  f1-score   support

          AGN     0.4464    0.7376    0.5561       141
       Blazar     0.3206    0.5385    0.4019        78
           CV     0.6284    0.5754    0.6007       285
        Flare     0.2667    0.5294    0.3547        68
          HPM     0.7600    0.8382    0.7972       136
Non-Transient     0.6878    0.6332    0.6594       428
        Other     0.4377    0.3718    0.4021       312
           SN     0.5882    0.4215    0.4911       427

  avg / total     0.5710    0.5477    0.5498      1875



Classify using RF

In [19]:
# Random-forest search grid.
# 'auto' is omitted: for classifiers it meant the same as 'sqrt' (so the grid
# evaluated that setting twice) and it was removed in scikit-learn 1.3.
tuned_parameters = {
    'n_estimators': [200, 700],
    'max_features': ['sqrt', 'log2']
}

In [20]:
# Random forest with balanced class weights; grid-search it, report on the test split and save the report
model = RandomForestClassifier(class_weight='balanced', random_state=0)
clf2 = train_validate_test_model(
    model, tuned_parameters,
    X_train, y_train,
    X_test, y_test
)

Best Params: {'max_features': 'log2', 'n_estimators': 200}
Validation Accuracy: 0.6288
Test Accuracy: 0.6416
Report:
                precision    recall  f1-score   support

          AGN     0.6438    0.6667    0.6551       141
       Blazar     0.5098    0.3333    0.4031        78
           CV     0.6953    0.6807    0.6879       285
        Flare     0.7308    0.2794    0.4043        68
          HPM     0.8855    0.8529    0.8689       136
Non-Transient     0.6848    0.8528    0.7596       428
        Other     0.5489    0.4135    0.4717       312
           SN     0.5485    0.6089    0.5771       427

  avg / total     0.6386    0.6416    0.6316      1875



Classify using NN

In [21]:
# MLP search grid.
# hidden_layer_sizes entries are tuples with one width per hidden layer:
# (100,) is a single layer of 100 units (a bare `(100)` is just the int 100).
tuned_parameters = {
    'learning_rate': ['constant', 'adaptive'],
    'hidden_layer_sizes': [(100,), (100, 100)],
    'alpha': [1e-2, 1e-3],
    'activation': ['logistic', 'relu', 'tanh']
}

In [22]:
# Multi-layer perceptron (unlike the SVC and RF runs, no class_weight is passed);
# grid-search it, report on the test split and save the report
model = MLPClassifier(random_state=0)
clf3 = train_validate_test_model(
    model, tuned_parameters,
    X_train, y_train,
    X_test, y_test
)



Best Params: {'activation': 'tanh', 'alpha': 0.001, 'learning_rate': 'constant', 'hidden_layer_sizes': 100}
Validation Accuracy: 0.5762
Test Accuracy: 0.5760
Report:
                precision    recall  f1-score   support

          AGN     0.5235    0.6312    0.5723       141
       Blazar     0.4359    0.2179    0.2906        78
           CV     0.6437    0.5895    0.6154       285
        Flare     0.5152    0.2500    0.3366        68
          HPM     0.8357    0.8603    0.8478       136
Non-Transient     0.6173    0.8178    0.7035       428
        Other     0.4891    0.2885    0.3629       312
           SN     0.4823    0.5433    0.5110       427

  avg / total     0.5668    0.5760    0.5597      1875

