First test of rebuilding an SNguess-like classifier based on the Elasticc TabulatedRiseDecline features.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn import metrics
import xgboost as xgb

In [None]:
# Load the feature training set from a local CSV file into a DataFrame.
# NOTE(review): hard-coded absolute path under a user home directory — adjust when running elsewhere.
df = pd.read_csv('/home/jnordin/tmp/elasticc_feature_trainingset.csv')

In [None]:
# Cast the flag columns to bool, one column at a time.
# TODO: this should already be done in the prep notebook.
flag_columns = (
    'bool_rise', 'bool_fall', 'bool_peaked', 'bool_pure',
    'bool_fastrise', 'bool_fastfall', 'bool_hasgaps',
)
for flag in flag_columns:
    df[flag] = df[flag].astype(bool)

In [None]:
# Display the column names available in the loaded table.
df.columns

In [None]:
# Histogram of the 'ndet' column in 30 bins; the return value is bound to `_`
# so the cell does not echo it.
_ = plt.hist( df['ndet'], bins=30 )

In [None]:
# Inclusive [min, max] range of 'ndet' used to select the training subset.
# Suggested bins to use (open question: should the bins instead be chosen so
# that each holds the same number of entries in total?):
#   [1, 1]    [2, 2]    [3, 4]    [5, 6]     [7, 9]      [10, 14]
#   [15, 20]  [21, 30]  [31, 50]  [51, 75]   [76, 110]   [111, 200]
detrange = [3, 4]

In [None]:
# Keep only the rows whose 'ndet' lies inside the inclusive detrange bin.
ndet_lo, ndet_hi = detrange
in_ndet_bin = (df['ndet'] >= ndet_lo) & (df['ndet'] <= ndet_hi)
df_set = df[in_ndet_bin]

In [None]:
# Display the class_aggregate labels of the selected subset.
df_set.class_aggregate

In [None]:
# Which set of labels to use? Exactly one `target` definition should be active.
#target = df_set.class_full
#target = df_set.class_intermediate
# NOTE(review): the -1 presumably shifts 1-based class ids so they start at 0 — confirm
# against the prep notebook. The classifier in this notebook is configured with
# objective='binary:logistic' and a scale_pos_weight, so target is expected to be 0/1.
target = df_set.class_short - 1
#target = df_set.class_parsnip

# Or do the aggregate: a sequence of binary splits on class_aggregate, where each
# later step first restricts df_set to the rows labelled 1 in the previous step.
# First step
#target = np.zeros(len(df_set))
#target[ (df_set.class_aggregate<4) ]=1
# Second step
#df_set = df_set[ (df_set.class_aggregate<4) ]
#target = np.zeros(len(df_set))
#target[ (df_set.class_aggregate<3) ]=1
# Third step
#df_set = df_set[ (df_set.class_aggregate<3) ]
#target = np.zeros(len(df_set))
#target[ (df_set.class_aggregate<2) ]=1


In [None]:
# Which columns to use for training.
# The list is assembled group by group. The resulting order is significant: it fixes
# the column order of the feature matrix and the bar labels of the importance plot.
_bands = ['lsstu', 'lsstg', 'lsstr', 'lssti', 'lsstz', 'lssty']

# Boolean flags followed by the band-independent magnitude / band-count / timing columns.
use_cols = [
    'bool_rise', 'bool_fall', 'bool_peaked', 'bool_pure',
    'bool_fastrise', 'bool_fastfall', 'bool_hasgaps',
    'mag_det', 'mag_last', 'det_bands', 'peak_bands', 'last_bands',
    't_predetect', 't_lc', 't_rise', 't_fall',
]

# Per band: rise_slope, rise_slopesig, fall_slope, fall_slopesig.
use_cols += [
    f'{phase}_{kind}_{band}'
    for band in _bands
    for phase in ('rise', 'fall')
    for kind in ('slope', 'slopesig')
]

# Adjacent-band difference columns at each of the det / peak / last stages.
use_cols += [
    f'{first}-{second}_{stage}'
    for stage in ('det', 'peak', 'last')
    for first, second in zip(_bands, _bands[1:])
]

# Remaining host / redshift / band-id columns.
use_cols += ['host_sep', 'z', 'z_err', 'band_det_id', 'band_last_id']


In [None]:
# Feature matrix: the selected rows restricted to the training columns, in use_cols order.
feats = df_set[use_cols]

In [None]:
# Histogram of the target labels, to see how the classes are populated.
plt.hist(target)

In [None]:
# Class-imbalance weight: negative_examples / positive_examples.
# The sum of target is used as the number of positives, i.e. target is taken to be 0/1.
n_positive = np.sum(target)
n_negative = len(target) - n_positive
scale_pos_weight = n_negative / n_positive

In [None]:
# Display the computed negative/positive ratio.
scale_pos_weight

In [None]:
# Base binary classifier handed to the hyper-parameter search; the remaining
# hyper-parameters are supplied by the search itself.
xgb_settings = {
    'objective': 'binary:logistic',
    'random_state': 42,
    'use_label_encoder': False,
    'scale_pos_weight': scale_pos_weight,
}
model = xgb.XGBClassifier(**xgb_settings)

In [None]:
# Candidate values for each hyper-parameter; RandomizedSearchCV draws n_iter
# random combinations from these lists.
param_grid = {
        'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        'min_child_weight': np.arange(0.0001, 0.5, 0.001),
        'gamma': np.arange(0.0,40.0,0.005),
        'learning_rate': np.arange(0.0005,0.5,0.0005),
        'subsample': np.arange(0.01,1.0,0.01),
        # Round to 2 decimals to clean up floating-point noise from arange.
        # (np.round with its default decimals=0 would collapse every candidate to
        # 0.0 or 1.0, and 0.0 lies outside the documented (0, 1] range.)
        'colsample_bylevel': np.round(np.arange(0.1,1.0,0.01), 2),
        'colsample_bytree': np.arange(0.1,1.0,0.01),
        }

# 5-fold stratified cross-validation with a fixed shuffle for reproducibility.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42)

# scoring=None means the search ranks candidates with the estimator's own score method.
# error_score='raise' makes a failing parameter combination abort the search
# instead of being silently scored.
grid_search = RandomizedSearchCV(
    model,
    param_grid,
    scoring=None,
#     n_iter = 200,                                # Maximum number of iterations
    n_iter = 1,                                    # single draw, for a quick test run
    n_jobs=4,
    cv=kfold,
    random_state=42,
    verbose=1,
    error_score='raise')

In [None]:
# Run the cross-validated randomized search on the selected features and labels;
# eval_metric is forwarded to the estimator's fit().
# NOTE(review): newer xgboost releases deprecate passing eval_metric to fit() in favour
# of the estimator constructor — confirm against the installed version.
grid_result = grid_search.fit(feats, target, eval_metric='aucpr')

In [None]:
# Pull the per-candidate cross-validation summaries out of the search result.
cv_summary = grid_result.cv_results_
means, stds, params = (
    cv_summary[key] for key in ('mean_test_score', 'std_test_score', 'params')
)

# Report the best cross-validated score and the parameters that achieved it.
best_report = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % best_report)

In [None]:
# Estimator refit by the search with the best parameter combination; the bare
# expression on the last line displays it in the cell output.
best_estimator = grid_result.best_estimator_
best_estimator

In [None]:
# Evaluate the best model on the same sample that was used for the search.
# These numbers are therefore optimistic; an unbiased estimate needs held-out data.
print('Evaluating model on the whole training sample:')
pred = best_estimator.predict(feats)
# Average precision (area under the precision-recall curve) is defined on a
# continuous score, so use the positive-class probability rather than the
# thresholded class labels.
pred_proba = best_estimator.predict_proba(feats)[:, 1]
precision = metrics.precision_score(target, pred)
recall = metrics.recall_score(target, pred)
aucpr = metrics.average_precision_score(target, pred_proba)
print("Precision: %.2f%%" % (precision * 100.0))
print("Recall: %.2f%%" % (recall * 100.0))
print("AUCPR: %.2f%%" % (aucpr * 100.0))

In [None]:
# Horizontal bar chart of the fitted model's feature importances, one bar per
# training column, on a tall canvas so all labels stay readable.
importances = best_estimator.feature_importances_
plt.figure(figsize=(10, 20))
plt.barh(y=use_cols, width=importances)