In [1]:
# Directory holding the input pickles (catalogue and feature tables).
# Filenames are appended by plain string concatenation, so keep the trailing '/'.
DATA_PATH = '../../data/CRTS2/'
# Directory where each model's text report is written (also needs the trailing '/').
RESULTS_PATH = '../../results/'

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Seed NumPy's global RNG so the np.random.choice down-sampling of
# non-transients later in the notebook is repeatable.
np.random.seed(42)

Load Transient Catalog

In [4]:
# Locate the transient catalogue inside the data directory
indir = DATA_PATH
filename = 'transient_catalog.pickle'
filepath = indir + filename
# Read it and rename the key/label columns to the names used downstream
# ('ID' and 'class').
# NOTE: unpickling can execute arbitrary code; only load trusted files.
df_cat = pd.read_pickle(filepath).rename(
    columns={'TransientID': 'ID', 'Classification': 'class'}
)
# Number of distinct transient IDs in the catalogue
print(df_cat['ID'].unique().shape)

(5539,)


Load Transient Features

In [5]:
# Select which precomputed feature files to use: both values are interpolated
# into the feature-file names and into the results-file name.
# min_obs is presumably the minimum number of observations per object - TODO confirm.
min_obs = 5
num_features = 27

In [6]:
# Load the transient feature table that matches the configured
# min_obs / num_features
indir = DATA_PATH
filename = 'transient_features_{}obs_{}feats.pickle'.format(
    min_obs, num_features
)
inpath = indir + filename
df_feat_tran = pd.read_pickle(inpath)
# Show (rows, columns) as a quick sanity check
df_feat_tran.shape

(4384, 28)

Load Non-Transient Features

In [7]:
# Load the non-transient ("permanent") feature table that matches the
# configured min_obs / num_features
indir = DATA_PATH
filename = 'permanent_features_{}obs_{}feats.pickle'.format(
    min_obs, num_features
)
inpath = indir + filename
df_feat_perm = pd.read_pickle(inpath)
# Show (rows, columns) as a quick sanity check
df_feat_perm.shape

(4384, 28)

Combine Transient and Non-Transient Features

In [8]:
# Add class label to transient objects.
# No `on=` is given, so pandas joins on every column name the two frames
# share (presumably just 'ID' - TODO confirm).
df_feat_tran = df_feat_tran.merge(df_cat, how='inner')
# Remove ambiguous classes: keep only the listed transient classes
top_classes = ['SN', 'CV', 'AGN', 'HPM', 'Blazar', 'Flare']
df_feat_tran = df_feat_tran[df_feat_tran['class'].isin(top_classes)]
# Label every non-transient object with one common class
df_feat_perm['class'] = 'Non-Transient'
# Down-sample non-transients to the size of the largest transient class.
# The draw uses NumPy's global RNG, so it depends on the seed set earlier.
big_class_size = df_feat_tran.groupby('class')['ID'].count().max()
IDs = np.random.choice(df_feat_perm.ID.unique(), size=big_class_size, replace=False)
df_feat_perm = df_feat_perm[df_feat_perm.ID.isin(IDs)]
# Stack transient and non-transient rows into one frame.
# pd.concat is used because DataFrame.append was deprecated and later removed
# from pandas; the result is the same.
df = pd.concat([df_feat_tran, df_feat_perm], ignore_index=True)
# Remove IDs: they identify objects and are not features
df = df.drop(['ID'], axis=1)

Create Transient inputs and outputs

In [9]:
# Obtain X (feature matrix) and y (class labels) as NumPy arrays.
# `.values` replaces `as_matrix()`, which was removed from pandas; it returns
# the same array and is available in old and new pandas versions.
X = df.drop(['class'], axis=1).values
y = df['class'].values

In [10]:
# Map each class label to the number of objects carrying it
class_labels, class_counts = np.unique(y, return_counts=True)
dict(zip(class_labels, class_counts))

{'AGN': 427,
 'Blazar': 237,
 'CV': 862,
 'Flare': 207,
 'HPM': 412,
 'Non-Transient': 1295,
 'SN': 1295}

In [11]:
# Total number of objects, obtained by summing the per-class counts
np.unique(y, return_counts=True)[1].sum()

4735

Split in Test & Train Sets

In [12]:
# Hold out 33% of the objects as a test set. Stratifying on y keeps the class
# proportions the same in both splits; the fixed random_state makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

In [13]:
# Sanity check: training inputs and labels must have the same number of rows
X_train.shape, y_train.shape

((3172, 27), (3172,))

In [14]:
# Sanity check: test inputs and labels must have the same number of rows
X_test.shape, y_test.shape

((1563, 27), (1563,))

Scale features

In [15]:
# Fit the scaler on the training split only, then apply that same
# transformation to both splits so no test-set statistics leak into training
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

Define classification routine

In [16]:
def train_validate_test_model(model, params, X_train, y_train, X_test, y_test):
    """Grid-search a classifier, evaluate it on the test split and save a report.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style classifier to tune.
    params : dict
        Hyper-parameter grid handed to GridSearchCV.
    X_train, y_train : array-like
        Training inputs / labels used for the cross-validated search.
    X_test, y_test : array-like
        Held-out inputs / labels used only for the final report.

    Returns
    -------
    The best estimator found by the search, fitted on all the training data.

    Side effects
    ------------
    Prints the report and writes it to
    RESULTS_PATH + '<task>_<min_obs>obs_<num_features>feat_<ModelName>.txt'
    (min_obs, num_features and RESULTS_PATH are notebook-level globals).
    """
    # Precision decimal places
    digits = 4
    # Cross-validate over the grid passed in by the caller. The original code
    # read the notebook-level `tuned_parameters` global here, silently ignoring
    # the `params` argument.
    grid_search = GridSearchCV(model, params, cv=StratifiedKFold(2))
    grid_search.fit(X_train, y_train)
    # With GridSearchCV's default refit=True the best estimator has already
    # been retrained on the whole training set, so no extra fit is needed.
    clf = grid_search.best_estimator_
    # Predict test inputs with the tuned model
    y_pred = clf.predict(X_test)
    # Create results using real and predicted labels of test data
    results_str = results_string(y_test, y_pred, grid_search, digits=digits)
    print(results_str)
    # Save results
    # NOTE(review): this notebook classifies several classes, yet the file
    # prefix is 'binary' - confirm it does not overwrite another notebook's output.
    task = 'binary'
    model_name = model.__class__.__name__
    filename = '{}_{}obs_{}feat_{}'.format(task, min_obs, num_features, model_name)
    with open(RESULTS_PATH + filename + '.txt', 'w') as f:
        f.write(results_str)

    return clf

def results_string(y_true, y_pred, grid_search, digits):
    """Build the text report for one tuned classifier.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels of the evaluated samples.
    y_pred : array-like
        Labels predicted for the same samples.
    grid_search : fitted GridSearchCV-like object
        Supplies `best_params_` and `best_score_` (the validation accuracy).
    digits : int
        Number of decimal places used for accuracies and the class report.

    Returns
    -------
    str
        Best parameters, validation accuracy, test accuracy and the
        per-class classification report, in that order.
    """
    # Fixed-point format spec with the requested number of decimals
    float_param = '{0:.' + str(digits) + 'f}'
    # Score against the labels passed in. The original code read the
    # notebook-level `y_test` global here and ignored the `y_true` argument.
    parts = [
        'Best Params: {}\n'.format(grid_search.best_params_),
        ('Validation Accuracy: ' + float_param + '\n').format(grid_search.best_score_),
        ('Test Accuracy: ' + float_param + '\n').format(accuracy_score(y_true, y_pred)),
        'Report:\n {}'.format(classification_report(y_true, y_pred, digits=digits)),
    ]
    return ''.join(parts)

Classify using SVC

In [17]:
# Hyper-parameter grid for the SVC: RBF kernel only, searched over
# 5 gamma values x 4 C values (20 combinations)
tuned_parameters = {
    'kernel': ['rbf'],
    'gamma':[1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
    'C': [1, 10, 100, 1000]
}

In [18]:
# Tune, evaluate and save the report for the SVC; class_weight='balanced'
# is set because the class sizes differ considerably
model = SVC(random_state=0, class_weight='balanced')
clf1 = train_validate_test_model(model, tuned_parameters, X_train, y_train, X_test, y_test)

Best Params: {'C': 100, 'kernel': 'rbf', 'gamma': 0.01}
Validation Accuracy: 0.6176
Test Accuracy: 0.6468
Report:
                precision    recall  f1-score   support

          AGN     0.5700    0.8085    0.6686       141
       Blazar     0.3651    0.5897    0.4510        78
           CV     0.7194    0.6386    0.6766       285
        Flare     0.2867    0.6029    0.3886        68
          HPM     0.7891    0.8529    0.8198       136
Non-Transient     0.7386    0.6799    0.7080       428
           SN     0.7367    0.5176    0.6080       427

  avg / total     0.6854    0.6468    0.6544      1563



Classify using RF

In [19]:
# Hyper-parameter grid for the random forest.
# 'auto' was dropped from max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated work) and newer scikit-learn releases reject it.
tuned_parameters = {
    'n_estimators': [200, 700],
    'max_features': ['sqrt', 'log2']
}

In [20]:
# Tune, evaluate and save the report for the random forest; class_weight='balanced'
# is set because the class sizes differ considerably
model = RandomForestClassifier(random_state=0, class_weight='balanced')
clf2 = train_validate_test_model(model, tuned_parameters, X_train, y_train, X_test, y_test)

Best Params: {'max_features': 'log2', 'n_estimators': 200}
Validation Accuracy: 0.7226
Test Accuracy: 0.7524
Report:
                precision    recall  f1-score   support

          AGN     0.8308    0.7660    0.7970       141
       Blazar     0.5833    0.3590    0.4444        78
           CV     0.7874    0.7018    0.7421       285
        Flare     0.7333    0.3235    0.4490        68
          HPM     0.9512    0.8603    0.9035       136
Non-Transient     0.7320    0.8808    0.7996       428
           SN     0.6998    0.7588    0.7281       427

  avg / total     0.7539    0.7524    0.7454      1563



Classify using NN

In [21]:
# Hyper-parameter grid for the multi-layer perceptron.
tuned_parameters = {
    # NOTE(review): scikit-learn documents `learning_rate` as only used with
    # solver='sgd'. The model below keeps the default solver, so these two
    # values presumably train identical networks and double the search time -
    # confirm and drop if so.
    'learning_rate': ['constant', "adaptive"],
    # One hidden layer of 100 units, or two of 100 units each. The original
    # wrote (100), which is the int 100 rather than a one-element tuple.
    'hidden_layer_sizes': [(100,), (100, 100)],
    'alpha': [1e-2, 1e-3],
    'activation': ["logistic", "relu", "tanh"]
}

In [22]:
# Tune, evaluate and save the report for the neural network; random_state is
# fixed so weight initialisation is repeatable
model = MLPClassifier(random_state=0)
clf3 = train_validate_test_model(model, tuned_parameters, X_train, y_train, X_test, y_test)



Best Params: {'alpha': 0.01, 'activation': 'relu', 'learning_rate': 'constant', 'hidden_layer_sizes': 100}
Validation Accuracy: 0.6690
Test Accuracy: 0.6884
Report:
                precision    recall  f1-score   support

          AGN     0.6646    0.7447    0.7023       141
       Blazar     0.5263    0.3846    0.4444        78
           CV     0.7019    0.6526    0.6764       285
        Flare     0.5385    0.3088    0.3925        68
          HPM     0.8750    0.8235    0.8485       136
Non-Transient     0.7012    0.7897    0.7429       428
           SN     0.6544    0.6651    0.6597       427

  avg / total     0.6846    0.6884    0.6834      1563

