In [1]:
DATA_PATH = '../../data/CRTS2/'
RESULTS_PATH = '../../results/'

In [2]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix

Load Transient Catalogue

In [3]:
# Use catalogue of transients with min observations
min_obs = 5
num_features = 28

In [4]:
filename = 'transient_catalog.pickle'
indir = DATA_PATH; filepath = indir + filename
df_cat = pd.read_pickle(filepath)
print(df_cat.TransientID.unique().shape)

(5539,)


In [5]:
df_cat = df_cat.rename(columns={'TransientID': 'ID'})
df_cat = df_cat.rename(columns={'Classification': 'class'})

Load Transient Feature Dataframe

In [6]:
from sklearn import datasets
import numpy as np
iris = datasets.load_iris()

In [7]:
# Loead transient features
indir = DATA_PATH
filename = 'transient_features_{}obs_{}feats.pickle'.format(min_obs, num_features) 
inpath = indir + filename
df_feat_tran = pd.read_pickle(inpath)
df_feat_tran.shape

(3873, 29)

Create inputs and outputs

In [8]:
# Add classs label to transient objects
df_feat_tran = df_feat_tran.merge(df_cat , how='inner')
# Set least common transients as 'Other'
top_classes = ['SN', 'CV', 'AGN', 'HPM', 'Blazar', 'Flare']
in_top = lambda row: ('Other' if row['class'] not in top_classes else row['class'])
df_feat_tran['class'] = df_feat_tran.apply( in_top , axis=1)

In [9]:
all_labels = df_feat_tran['class'].unique()
all_labels

array(['CV', 'Flare', 'SN', 'AGN', 'Other', 'Blazar', 'HPM'], dtype=object)

In [28]:
# Count number of objects per class
df_feat_tran.groupby('class', as_index=False).count()[['class', 'ID']].transpose()

Unnamed: 0,0,1,2,3,4,5,6
class,AGN,Blazar,CV,Flare,HPM,Other,SN
ID,362,195,726,157,409,843,1182


In [10]:
# Remove IDs
df = df_feat_tran.drop(['ID'], axis=1)

In [11]:
# Obtain X and y
X = df.drop(['class'], axis=1).as_matrix()
y = df['class'].as_matrix()

Split in Test & Train Sets

In [12]:
df['class'].unique().tolist()

['CV', 'Flare', 'SN', 'AGN', 'Other', 'Blazar', 'HPM']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

In [14]:
X_train.shape, y_train.shape

((2595, 28), (2595,))

In [15]:
X_test.shape, y_test.shape

((1279, 28), (1279,))

In [16]:
# Compute class weights
weight_list = compute_class_weight('balanced', df['class'].unique(), df['class'])
class_weights = dict()
for i, cla in enumerate(df['class'].unique()):
    class_weights[cla] = weight_list[i]

In [17]:
class_weights

{'AGN': 1.5288082083662193,
 'Blazar': 2.8380952380952382,
 'CV': 0.76229830775285323,
 'Flare': 3.5250227479526841,
 'HPM': 1.3531260915123995,
 'Other': 0.656498898491781,
 'SN': 0.46821368141165093}

In [18]:
# Create scaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
def train_validate_model(model, params, X_train, y_train, X_test, y_test):
    clf = GridSearchCV(model, tuned_parameters, cv=StratifiedKFold(2))
    clf.fit(X_train, y_train)
    print('Best Score:', clf.best_score_)
    print('Best Estimator:', clf.best_estimator_)
    print('Test Score:', clf.score(X_test, y_test))
    print('Confusion Matrix:', clf.score(X_test, y_test))
    return clf

def clf_confusion_matrix(clf, X_test, y_test, normalized=False):
    y_pred = clf.predict(X_test)
    cnf_matrix = confusion_matrix(y_test, y_pred, all_labels)
    if normalized:
        cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
        cnf_matrix = np.around(cnf_matrix, decimals=4) * 100
    return cnf_matrix

Classify using SVC

In [20]:
tuned_parameters = {
    'kernel': ['rbf'],
    'gamma':[1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
    'C': [1, 10, 100, 1000],
    'class_weight' : [class_weights]
}

In [21]:
model = SVC()
clf1 = train_validate_model(model, tuned_parameters, X_train, y_train, X_test, y_test)

Best Score: 0.528323699422
Best Estimator: SVC(C=10, cache_size=200,
  class_weight={'Blazar': 2.8380952380952382, 'AGN': 1.5288082083662193, 'HPM': 1.3531260915123995, 'Flare': 3.5250227479526841, 'Other': 0.656498898491781, 'SN': 0.46821368141165093, 'CV': 0.76229830775285323},
  coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.1,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
Test Score: 0.571540265833
Confusion Matrix: 0.571540265833


Classify using RF

In [22]:
tuned_parameters = { 
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2'],
    'class_weight' : [class_weights]

}

In [23]:
model = RandomForestClassifier()
clf2 = train_validate_model(model, tuned_parameters, X_train, y_train, X_test, y_test)

Best Score: 0.612331406551
Best Estimator: RandomForestClassifier(bootstrap=True,
            class_weight={'Blazar': 2.8380952380952382, 'AGN': 1.5288082083662193, 'HPM': 1.3531260915123995, 'Flare': 3.5250227479526841, 'Other': 0.656498898491781, 'SN': 0.46821368141165093, 'CV': 0.76229830775285323},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=700, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Test Score: 0.65598123534
Confusion Matrix: 0.65598123534


Classify using NN

In [24]:
tuned_parameters = {
    'learning_rate': ['constant', "adaptive"],
    'hidden_layer_sizes': [(100), (100,100)],
    'alpha': [1e-2, 1e-3, 1e-4],
    'activation': ["logistic", "relu", "tanh"]
}

In [25]:
model = MLPClassifier()
clf3 = train_validate_model(model, tuned_parameters, X_train, y_train, X_test, y_test)



Best Score: 0.580732177264
Best Estimator: MLPClassifier(activation='tanh', alpha=0.01, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=100, learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
Test Score: 0.612978889758
Confusion Matrix: 0.612978889758


In [26]:
def write_results(filepath):
    with open(filepath, 'a') as file:
        file.write('----- EXPERIMENT START ------\n')
        file.write('NUM OBSERVATIONS:{}\n'.format(min_obs))
        file.write('NUM FEATURES:{}\n'.format(X_train.shape[1]))
        file.write('FEATURES:{}\n'.format(df.columns.values))
        file.write('--SVM--\n')
        file.write('Validation Score:{}\n'.format(clf1.best_score_))
        file.write('Best Estimator:{}\n'.format(clf1.best_estimator_))
        file.write('Test Score:{}\n'.format(clf1.score(X_test, y_test)))
        file.write('Confusion Matrix:\n{}\n'.format(clf_confusion_matrix(clf1, X_test, y_test)))
        file.write('Norm. Confusion Matrix:\n{}\n{}\n'.format(all_labels, clf_confusion_matrix(clf1, X_test, y_test, True)))
        file.write('--Random Forest--\n')
        file.write('Best Score:{}\n'.format(clf2.best_score_))
        file.write('Best Estimator:{}\n'.format(clf2.best_estimator_))
        file.write('Test Score:{}\n'.format(clf2.score(X_test, y_test)))
        file.write('Confusion Matrix:\n{}\n'.format(clf_confusion_matrix(clf2, X_test, y_test)))
        file.write('Norm. Confusion Matrix:\n{}\n{}\n'.format(all_labels, clf_confusion_matrix(clf2, X_test, y_test, True)))
        file.write('--Neural Network--\n')
        file.write('Best Score:{}\n'.format(clf3.best_score_))
        file.write('Best Estimator:{}\n'.format(clf3.best_estimator_))
        file.write('Test Score:{}\n'.format(clf3.score(X_test, y_test)))
        file.write('Confusion Matrix:\n{}\n'.format(clf_confusion_matrix(clf3, X_test, y_test)))
        file.write('Norm. Confusion Matrix:\n{}\n{}\n'.format(all_labels, clf_confusion_matrix(clf3, X_test, y_test, True)))

In [27]:
filename = '7-class.pickle'
outdir = RESULTS_PATH; filepath = outdir + filename
write_results(filepath)