In [None]:
# Root directory of the input data (relative path).
DATA_PATH = '../data/'
# Directory where the classification reports are written.
RESULTS_PATH = '../results/'
# Sub-directory of DATA_PATH from which the feature pickle is read.
FEATURES_PATH = DATA_PATH + 'features/'

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Load Transient Catalog

In [None]:
# Read the pickled transient catalogue from the data directory.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
indir = DATA_PATH
filename = 'transient_catalog.pickle'
filepath = indir + filename
df_cat = pd.read_pickle(filepath)

# Use the short column names ('ID', 'class') that the later cells work with.
column_aliases = {'TransientID': 'ID', 'Classification': 'class'}
df_cat = df_cat.rename(columns=column_aliases)

# Show how many distinct IDs the catalogue holds.
print(df_cat['ID'].unique().shape)

Load Feature Dataframes

In [None]:
# Use catalogue of transients with min observations
# Both values select which feature pickle is loaded and are embedded in the
# names of the results files.
min_obs = 5
num_features = 27

In [None]:
# Load transient features
# The file name encodes the minimum-observation and feature-count settings.
feature_file_template = 'transient_features_{}obs_{}feats.pickle'
indir = FEATURES_PATH
filename = feature_file_template.format(min_obs, num_features)
inpath = indir + filename
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df_feat_tran = pd.read_pickle(inpath)
df_feat_tran.shape

Create inputs and outputs

In [None]:
# Add class label to transient objects.
# NOTE(review): no `on=` is given, so pandas joins on every column name the two
# frames share -- presumably only 'ID'; confirm against the catalogue schema.
df_feat_tran = df_feat_tran.merge(df_cat, how='inner')

# Remove ambiguous classes: keep the labels listed below and relabel everything
# else as 'Other'. Vectorized membership test instead of a row-wise apply.
top_classes = ['SN', 'CV', 'AGN', 'HPM', 'Blazar', 'Flare']
is_top_class = df_feat_tran['class'].isin(top_classes)
df_feat_tran['class'] = np.where(is_top_class, df_feat_tran['class'], 'Other')

# Remove IDs
df = df_feat_tran.drop(['ID'], axis=1)

In [None]:
# Obtain X and y
# `.values` replaces `DataFrame.as_matrix()` / `Series.as_matrix()`, which were
# removed in pandas 1.0; it returns the same NumPy array.
X = df.drop(['class'], axis=1).values
y = df['class'].values

In [None]:
# Display the shape of the input matrix X.
X.shape

In [None]:
# Count number of objects per class
class_labels, class_counts = np.unique(y, return_counts=True)
dict(zip(class_labels, class_counts))

In [None]:
# Count total number of objects
per_class_counts = np.unique(y, return_counts=True)[1]
per_class_counts.sum()

Split in Test & Train Sets

In [None]:
# Hold out 33% of the objects for testing; stratify on y with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Display the shapes of the training inputs and labels.
X_train.shape, y_train.shape

In [None]:
# Display the shapes of the test inputs and labels.
X_test.shape, y_test.shape

# Pre-processing

Scale features

In [None]:
# Standardize the features: the scaler is fitted on the training split only,
# then the same fitted transform is applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Classification

Define classification routine

In [None]:
def train_validate_test_model(model, params, X_train, y_train, X_test, y_test,
                              task='binary'):
    """Tune ``model`` by grid search, score it on the test split, save a report.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    params : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.
    X_train, y_train : array-like
        Training inputs and labels, used for the cross-validated search.
    X_test, y_test : array-like
        Held-out inputs and labels, used only to build the report.
    task : str, default 'binary'
        Prefix of the results file name. The default keeps the file names
        this function has always produced.

    Returns
    -------
    estimator
        The best estimator found by the search, fitted on all training data.

    Side effects
    ------------
    Prints the report and writes it to
    ``RESULTS_PATH + '<task>_<min_obs>obs_<num_features>feat_<Model>.txt'``
    (``RESULTS_PATH``, ``min_obs`` and ``num_features`` are notebook globals).
    """
    # Precision decimal places
    digits = 4
    # Train & cross-validate over the grid passed in by the caller (previously
    # the global `tuned_parameters` was read and `params` was ignored).
    grid_search = GridSearchCV(model, params, cv=StratifiedKFold(2))
    grid_search.fit(X_train, y_train)
    # With the default refit=True, GridSearchCV has already refitted the best
    # configuration on all of the training data, so no extra fit is needed.
    clf = grid_search.best_estimator_
    # Predict test inputs with the tuned model
    y_pred = clf.predict(X_test)
    # Create results using real and predicted labels of test data
    results_str = results_string(y_test, y_pred, grid_search, digits=digits)
    print(results_str)
    # Save results
    model_name = type(model).__name__
    filename = '{}_{}obs_{}feat_{}'.format(task, min_obs, num_features, model_name)
    with open(RESULTS_PATH + filename + '.txt', 'w') as f:
        f.write(results_str)

    return clf

def results_string(y_true, y_pred, grid_search, digits):
    """Build the text report for one tuned model.

    Parameters
    ----------
    y_true : array-like
        True labels of the evaluated samples.
    y_pred : array-like
        Labels predicted for the same samples.
    grid_search : GridSearchCV
        Fitted search; its ``best_params_`` and ``best_score_`` are reported.
    digits : int
        Number of decimal places used for the accuracies and the report.

    Returns
    -------
    str
        Best parameters, validation accuracy, test accuracy and the
        per-class classification report.
    """
    float_param = '{0:.' + str(digits) + 'f}'
    # Score against the labels passed in as `y_true` (previously the global
    # `y_test` was read, so the argument had no effect).
    test_accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred, digits=digits)
    parts = [
        'Best Params: {}\n'.format(grid_search.best_params_),
        ('Validation Accuracy: ' + float_param + '\n').format(grid_search.best_score_),
        ('Test Accuracy: ' + float_param + '\n').format(test_accuracy),
        'Report:\n {}'.format(report),
    ]
    return ''.join(parts)

Classify using SVC

In [None]:
# Parameter grid for the SVC search: RBF kernel only, five gamma values and
# four C values (20 combinations).
tuned_parameters = {
    'kernel': ['rbf'],
    'gamma':[1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
    'C': [1, 10, 100, 1000]
}

In [None]:
# Class-weighted SVC with a fixed seed, tuned over the grid defined above.
model = SVC(class_weight='balanced', random_state=0)
clf1 = train_validate_test_model(
    model=model,
    params=tuned_parameters,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Classify using RF

In [None]:
# Parameter grid for the random-forest search.
tuned_parameters = {
    'n_estimators': [200, 700],
    # 'auto' was dropped from the candidates: for classifiers it was an alias
    # of 'sqrt' (a duplicate), and newer scikit-learn releases reject it.
    'max_features': ['sqrt', 'log2']
}

In [None]:
# Class-weighted random forest with a fixed seed, tuned over the grid above.
model = RandomForestClassifier(class_weight='balanced', random_state=0)
clf2 = train_validate_test_model(
    model=model,
    params=tuned_parameters,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Classify using NN

In [None]:
# Parameter grid for the MLP search.
tuned_parameters = {
    'learning_rate': ['constant', "adaptive"],
    # One hidden layer of 100 units vs. two of 100 units each. `(100,)` needs
    # the trailing comma: `(100)` is just the integer 100, not a tuple.
    'hidden_layer_sizes': [(100,), (100, 100)],
    'alpha': [1e-2, 1e-3],
    'activation': ["logistic", "relu", "tanh"]
}

In [None]:
# Multi-layer perceptron with a fixed seed, tuned over the grid defined above.
model = MLPClassifier(random_state=0)
clf3 = train_validate_test_model(
    model=model,
    params=tuned_parameters,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)