In [1]:
DATA_PATH = '../../data/CRTS2/'
RESULTS_PATH = '../../results/'

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Load Transient Catalog

In [3]:
filename = 'transient_catalog.pickle'
indir = DATA_PATH; filepath = indir + filename
df_cat = pd.read_pickle(filepath)
# Rename columns to match light curves
df_cat = df_cat.rename(columns={'TransientID': 'ID', 'Classification': 'class'})
print(df_cat.ID.unique().shape)

(5539,)


Load Feature Dataframes

In [4]:
# Use catalogue of transients with min observations
min_obs = 5
num_features = 27

In [5]:
# Loead transient features
indir = DATA_PATH
filename = 'transient_features_{}obs_{}feats.pickle'.format(min_obs, num_features) 
inpath = indir + filename
df_feat_tran = pd.read_pickle(inpath)
df_feat_tran.shape

(4384, 28)

Create inputs and outputs

In [6]:
# Add classs label to transient objects
df_feat_tran = df_feat_tran.merge(df_cat , how='inner')
# Remove ambiguous classes
top_classes = ['SN', 'CV', 'AGN', 'HPM', 'Blazar', 'Flare']
df_feat_tran = df_feat_tran[df_feat_tran['class'].isin(top_classes)]
# Remove IDs
df = df_feat_tran.drop(['ID'], axis=1)

In [7]:
# Obtain X and y
X = df.drop(['class'], axis=1).as_matrix()
y = df['class'].as_matrix()

In [8]:
X.shape

(3440, 27)

In [9]:
# Count number of objects per class
dict(zip(*np.unique(y, return_counts=True)))

{'AGN': 427, 'Blazar': 237, 'CV': 862, 'Flare': 207, 'HPM': 412, 'SN': 1295}

In [10]:
# Count total number of objects
np.sum(np.unique(y, return_counts=True)[1])

3440

Split in Test & Train Sets

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

In [12]:
X_train.shape, y_train.shape

((2304, 27), (2304,))

In [13]:
X_test.shape, y_test.shape

((1136, 27), (1136,))

Scale features

In [14]:
# Create scaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Define classfication routine

In [15]:
def train_validate_test_model(model, params, X_train, y_train, X_test, y_test):
    # Precision decimal places
    digits = 4
    # Train & cross-validate
    grid_search = GridSearchCV(model, tuned_parameters, cv=StratifiedKFold(2))
    grid_search.fit(X_train, y_train)
    # Train new model with all train data
    clf = grid_search.best_estimator_
    clf.fit(X_train, y_train)
    # Predict test inputs with new model
    y_pred = clf.predict(X_test)
    # Create results using real and predicted labels of test data 
    results_str = results_string(y_test, y_pred, grid_search, digits=digits)
    print(results_str)
    # Save results
    task = 'binary'
    model_name = model.__class__.__name__
    filename = '{}_{}obs_{}feat_{}'.format(task, min_obs, num_features, model_name)
    with open(RESULTS_PATH + filename + '.txt', 'w+') as f: f.write(results_str)
        
    return clf

def results_string(y_true, y_pred, grid_search, digits):
    float_param = '{0:.' + str(digits) + 'f}'
    results = str()
    results += 'Best Params: {}\n'.format(grid_search.best_params_)
    results += ('Validation Accuracy: ' + float_param + '\n').format(grid_search.best_score_)
    results += ('Test Accuracy: ' + float_param + '\n').format(accuracy_score(y_test, y_pred))
    results += 'Report:\n {}'.format(classification_report(y_test, y_pred, digits=digits))
    return results

Classify using SVC

In [16]:
tuned_parameters = {
    'kernel': ['rbf'],
    'gamma':[1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
    'C': [1, 10, 100, 1000]
}

In [17]:
model = SVC(random_state=0, class_weight='balanced')
clf1 = train_validate_test_model(model, tuned_parameters, X_train, y_train, X_test, y_test)

Best Params: {'gamma': 0.1, 'kernel': 'rbf', 'C': 10}
Validation Accuracy: 0.6736
Test Accuracy: 0.6831
Report:
              precision    recall  f1-score   support

        AGN     0.6122    0.8511    0.7122       141
     Blazar     0.4059    0.5256    0.4581        78
         CV     0.7510    0.6351    0.6882       285
      Flare     0.4457    0.6029    0.5125        68
        HPM     0.9542    0.9191    0.9363       136
         SN     0.7147    0.6262    0.6675       428

avg / total     0.7025    0.6831    0.6868      1136



Classify using RF

In [18]:
tuned_parameters = { 
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [19]:
model = RandomForestClassifier(random_state=0, class_weight='balanced')
clf2 = train_validate_test_model(model, tuned_parameters, X_train, y_train, X_test, y_test)

Best Params: {'max_features': 'auto', 'n_estimators': 200}
Validation Accuracy: 0.7648
Test Accuracy: 0.7738
Report:
              precision    recall  f1-score   support

        AGN     0.8309    0.8014    0.8159       141
     Blazar     0.6383    0.3846    0.4800        78
         CV     0.8257    0.6982    0.7567       285
      Flare     0.6744    0.4265    0.5225        68
        HPM     0.9489    0.9559    0.9524       136
         SN     0.7105    0.8832    0.7875       428

avg / total     0.7758    0.7738    0.7660      1136



Classify using NN

In [20]:
tuned_parameters = {
    'learning_rate': ['constant', "adaptive"],
    'hidden_layer_sizes': [(100), (100,100)],
    'alpha': [1e-2, 1e-3],
    'activation': ["logistic", "relu", "tanh"]
}

In [21]:
model = MLPClassifier(random_state=0)
clf3 = train_validate_test_model(model, tuned_parameters, X_train, y_train, X_test, y_test)



Best Params: {'activation': 'relu', 'alpha': 0.01, 'learning_rate': 'constant', 'hidden_layer_sizes': (100, 100)}
Validation Accuracy: 0.7053
Test Accuracy: 0.7060
Report:
              precision    recall  f1-score   support

        AGN     0.6707    0.7943    0.7273       141
     Blazar     0.5410    0.4231    0.4748        78
         CV     0.6863    0.6526    0.6691       285
      Flare     0.5652    0.3824    0.4561        68
        HPM     0.9353    0.9559    0.9455       136
         SN     0.6969    0.7360    0.7159       428

avg / total     0.7009    0.7060    0.7009      1136

