In [1]:
# Root directories (relative to this notebook) for the input data and the saved results.
# Both keep a trailing slash because later cells build file paths by plain string concatenation.
DATA_PATH = '../../data/CRTS2/'
RESULTS_PATH = '../../results/'

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Load Transient Catalog

In [3]:
# Locate the pickled transient catalogue inside the data directory
indir = DATA_PATH
filename = 'transient_catalog.pickle'
filepath = indir + filename
# Read it and rename the columns so they match the light-curve naming
column_map = {'TransientID': 'ID', 'Classification': 'class'}
df_cat = pd.read_pickle(filepath).rename(columns=column_map)
# Show how many distinct transient IDs the catalogue contains
print(df_cat['ID'].unique().shape)

(5539,)


Load Feature Dataframes

In [4]:
# Use catalogue of transients with min observations.
# Both values are substituted into the name of the feature pickle that is loaded
# next and into the names of the result files written by the training routine.
min_obs = 5
num_features = 27

In [5]:
# Load transient features
feature_file_template = 'transient_features_{}obs_{}feats.pickle'
indir = DATA_PATH
filename = feature_file_template.format(min_obs, num_features)
inpath = indir + filename
df_feat_tran = pd.read_pickle(inpath)
# Display (rows, columns) of the loaded feature table
df_feat_tran.shape

(4384, 28)

Create inputs and outputs

In [6]:
# Add class label to transient objects (inner join on the columns both frames share)
df_feat_tran = df_feat_tran.merge(df_cat, how='inner')
# Remove ambiguous classes: every label outside `top_classes` is collapsed into 'Other'.
# Done as a vectorised column operation instead of a row-wise `apply`, which is
# faster and also behaves correctly when the merged frame is empty.
top_classes = ['SN', 'CV', 'AGN', 'HPM', 'Blazar', 'Flare']
is_top_class = df_feat_tran['class'].isin(top_classes)
df_feat_tran['class'] = df_feat_tran['class'].where(is_top_class, 'Other')
# Remove IDs (identifiers are not features)
df = df_feat_tran.drop(['ID'], axis=1)

In [7]:
# Obtain X (feature matrix) and y (class labels) as NumPy arrays.
# `.values` is used instead of `.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0; `.values` is available on both old and new releases.
X = df.drop(['class'], axis=1).values
y = df['class'].values

In [8]:
# Sanity check: (number of objects, number of feature columns)
X.shape

(4384, 27)

In [9]:
# Tally how many objects carry each class label
class_labels, class_counts = np.unique(y, return_counts=True)
dict(zip(class_labels, class_counts))

{'AGN': 427,
 'Blazar': 237,
 'CV': 862,
 'Flare': 207,
 'HPM': 412,
 'Other': 944,
 'SN': 1295}

In [10]:
# Total number of objects, obtained by adding up the per-class counts
per_class_counts = np.unique(y, return_counts=True)[1]
per_class_counts.sum()

4384

Split in Test & Train Sets

In [11]:
# Hold out 33% of the objects as a test set, stratifying on the class labels;
# the fixed random_state keeps the split repeatable between runs
split_arrays = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_arrays

In [12]:
# Sanity check: dimensions of the training inputs and labels
X_train.shape, y_train.shape

((2937, 27), (2937,))

In [13]:
# Sanity check: dimensions of the held-out test inputs and labels
X_test.shape, y_test.shape

((1447, 27), (1447,))

Scale features

In [14]:
# Fit the scaler on the training split only, then apply that same
# transformation to both the training and the test inputs
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

Define classification routine

In [15]:
def train_validate_test_model(model, params, X_train, y_train, X_test, y_test, task='binary'):
    """Grid-search a classifier, score it on the test split and save a text report.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``GridSearchCV``.
    params : dict
        Parameter grid to search. This argument is what gets searched; the
        function no longer reads the notebook-level ``tuned_parameters``.
    X_train, y_train : array-like
        Training inputs and labels used for the 2-fold stratified search.
    X_test, y_test : array-like
        Held-out inputs and labels used only for the final report.
    task : str, optional
        Prefix of the results filename. Defaults to ``'binary'``, the value
        that was previously hard-coded.
        NOTE(review): this notebook trains on seven classes, so the 'binary'
        prefix looks like a leftover; confirm before changing, because it
        determines which results file gets (over)written.

    Returns
    -------
    estimator
        ``grid_search.best_estimator_``, i.e. the best configuration fitted
        on the whole of ``X_train``.

    Side effects
    ------------
    Prints the report and writes it to
    ``RESULTS_PATH + '<task>_<min_obs>obs_<num_features>feat_<ModelClass>.txt'``
    (``RESULTS_PATH``, ``min_obs`` and ``num_features`` are notebook globals).
    """
    # Precision decimal places
    digits = 4
    # Train & cross-validate over the grid that was passed in
    grid_search = GridSearchCV(model, params, cv=StratifiedKFold(2))
    grid_search.fit(X_train, y_train)
    # GridSearchCV refits the best configuration on all the training data by
    # default (refit=True), so a second explicit fit would only repeat that work.
    clf = grid_search.best_estimator_
    # Predict test inputs with the refitted model
    y_pred = clf.predict(X_test)
    # Create results using real and predicted labels of test data
    results_str = results_string(y_test, y_pred, grid_search, digits=digits)
    print(results_str)
    # Save results
    model_name = model.__class__.__name__
    filename = '{}_{}obs_{}feat_{}'.format(task, min_obs, num_features, model_name)
    with open(RESULTS_PATH + filename + '.txt', 'w') as f:
        f.write(results_str)

    return clf

def results_string(y_true, y_pred, grid_search, digits):
    """Build the plain-text report for one fitted grid search.

    Parameters
    ----------
    y_true : array-like
        True labels of the evaluated samples.
    y_pred : array-like
        Labels predicted for those same samples.
    grid_search : fitted search object
        Must expose ``best_params_`` and ``best_score_``.
    digits : int
        Number of decimal places used for the accuracies and the report.

    Returns
    -------
    str
        Best parameters, validation accuracy, test accuracy and the
        per-class classification report, one after another.
    """
    float_param = '{0:.' + str(digits) + 'f}'
    # Score against the labels passed in as `y_true`, not the notebook-level `y_test`
    test_accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred, digits=digits)
    pieces = [
        'Best Params: {}\n'.format(grid_search.best_params_),
        ('Validation Accuracy: ' + float_param + '\n').format(grid_search.best_score_),
        ('Test Accuracy: ' + float_param + '\n').format(test_accuracy),
        'Report:\n {}'.format(report),
    ]
    return ''.join(pieces)

Classify using SVC

In [16]:
# Search grid for the RBF-kernel SVC: kernel width (gamma) and penalty (C)
tuned_parameters = dict(
    kernel=['rbf'],
    gamma=[1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
    C=[1, 10, 100, 1000],
)

In [17]:
# Tune, evaluate and save the report for the class-weighted SVC
model = SVC(random_state=0, class_weight='balanced')
clf1 = train_validate_test_model(
    model=model,
    params=tuned_parameters,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Best Params: {'gamma': 0.1, 'C': 10, 'kernel': 'rbf'}
Validation Accuracy: 0.5284
Test Accuracy: 0.5722
Report:
              precision    recall  f1-score   support

        AGN     0.4915    0.8227    0.6154       141
     Blazar     0.3786    0.5000    0.4309        78
         CV     0.6667    0.5895    0.6257       285
      Flare     0.4130    0.5588    0.4750        68
        HPM     0.9398    0.9191    0.9294       136
      Other     0.4693    0.4167    0.4414       312
         SN     0.5989    0.4965    0.5429       427

avg / total     0.5853    0.5722    0.5715      1447



Classify using RF

In [18]:
# Search grid for the random forest.
# 'auto' is intentionally absent from max_features: for classifiers it was an
# alias of 'sqrt' (so it only duplicated grid points), and recent scikit-learn
# releases no longer accept it.
tuned_parameters = {
    'n_estimators': [200, 700],
    'max_features': ['sqrt', 'log2']
}

In [19]:
# Tune, evaluate and save the report for the class-weighted random forest
model = RandomForestClassifier(random_state=0, class_weight='balanced')
clf2 = train_validate_test_model(
    model=model,
    params=tuned_parameters,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Best Params: {'n_estimators': 200, 'max_features': 'log2'}
Validation Accuracy: 0.6394
Test Accuracy: 0.6531
Report:
              precision    recall  f1-score   support

        AGN     0.6806    0.6950    0.6877       141
     Blazar     0.5625    0.3462    0.4286        78
         CV     0.7500    0.6632    0.7039       285
      Flare     0.7879    0.3824    0.5149        68
        HPM     0.9481    0.9412    0.9446       136
      Other     0.5779    0.4519    0.5072       312
         SN     0.5685    0.7869    0.6601       427

avg / total     0.6629    0.6531    0.6459      1447



Classify using NN

In [20]:
# Search grid for the multi-layer perceptron.
# Layer sizes are written as tuples: `(100)` is just the integer 100, so the
# single-hidden-layer case is spelled `(100,)`.
# NOTE(review): scikit-learn documents `learning_rate` as only used when
# solver='sgd'; the model cell does not set a solver, so this entry may be
# doubling the search without changing any fit. Confirm before removing it.
tuned_parameters = {
    'learning_rate': ['constant', "adaptive"],
    'hidden_layer_sizes': [(100,), (100, 100)],
    'alpha': [1e-2, 1e-3],
    'activation': ["logistic", "relu", "tanh"]
}

In [21]:
# Tune, evaluate and save the report for the multi-layer perceptron
model = MLPClassifier(random_state=0)
clf3 = train_validate_test_model(
    model=model,
    params=tuned_parameters,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)



Best Params: {'activation': 'tanh', 'hidden_layer_sizes': 100, 'alpha': 0.001, 'learning_rate': 'constant'}
Validation Accuracy: 0.5744
Test Accuracy: 0.6026
Report:
              precision    recall  f1-score   support

        AGN     0.5323    0.7021    0.6055       141
     Blazar     0.5532    0.3333    0.4160        78
         CV     0.6887    0.6211    0.6531       285
      Flare     0.6667    0.3824    0.4860        68
        HPM     0.9275    0.9412    0.9343       136
      Other     0.4903    0.4071    0.4448       312
         SN     0.5547    0.6768    0.6097       427

avg / total     0.6053    0.6026    0.5966      1447

