In [242]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
import seaborn as sns


In [382]:
features = pd.read_pickle("../train_features.pkl")
labels = pd.read_csv("../train_labels.csv",index_col="challenge_oid")
features.index.name = "challenge_oid"
features_original = features.join(labels)
features_original = features_original[~pd.isna(features_original["classALeRCE"])]

features_test = pd.read_pickle("../test_features.pkl")

In [383]:
dropped_objects = features_original[features_original["classALeRCE"].isin(["EB", "LPV","QSO", "RRL"])].sample(frac=0.9)
features = features_original.drop(dropped_objects.index)
features = features_original

In [384]:
#https://github.com/jrzaurin/LightGBM-with-Focal-Loss
import scipy
import scipy.misc
def focal_loss_lgb(y_pred, dtrain, alpha, gamma, num_class):
    """
    Focal Loss for lightgbm

    Parameters:
    -----------
    y_pred: numpy.ndarray
        array with the predictions
    dtrain: lightgbm.Dataset
    alpha, gamma: float
        See original paper https://arxiv.org/pdf/1708.02002.pdf
    num_class: int
        number of classes
    """
    a,g = alpha, gamma
    y_true = dtrain.label
    # N observations x num_class arrays
    y_true = np.eye(num_class)[y_true.astype('int')]
    y_pred = y_pred.reshape(-1,num_class, order='F')
    # alpha and gamma multiplicative factors with BCEWithLogitsLoss
    def fl(x,t):
        p = 1/(1+np.exp(-x))
        return -( a*t + (1-a)*(1-t) ) * (( 1 - ( t*p + (1-t)*(1-p)) )**g) * ( t*np.log(p)+(1-t)*np.log(1-p) )
    partial_fl = lambda x: fl(x, y_true)
    grad = scipy.misc.derivative(partial_fl, y_pred, n=1, dx=1e-6)
    hess = scipy.misc.derivative(partial_fl, y_pred, n=2, dx=1e-6)
    # flatten in column-major (Fortran-style) order
    return grad.flatten('F'), hess.flatten('F')

def focal_loss_lgb_eval_error(y_pred, dtrain, alpha, gamma, num_class):
    """
    Focal Loss for lightgbm

    Parameters:
    -----------
    y_pred: numpy.ndarray
        array with the predictions
    dtrain: lightgbm.Dataset
    alpha, gamma: float
        See original paper https://arxiv.org/pdf/1708.02002.pdf
    num_class: int
        number of classes
    """
    a,g = alpha, gamma
    y_true = dtrain.label
    y_true = np.eye(num_class)[y_true.astype('int')]
    y_pred = y_pred.reshape(-1, num_class, order='F')
    p = 1/(1+np.exp(-y_pred))
    loss = -( a*y_true + (1-a)*(1-y_true) ) * (( 1 - ( y_true*p + (1-y_true)*(1-p)) )**g) * ( y_true*np.log(p)+(1-y_true)*np.log(1-p) )
    # a variant can be np.sum(loss)/num_class
    return 'focal_loss', np.mean(loss), False

In [385]:
import sklearn.preprocessing
import sklearn.model_selection


label_encoder = sklearn.preprocessing.LabelEncoder()
encoded_labels = label_encoder.fit_transform(features["classALeRCE"])

#should split here into train and test!

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(features.drop("classALeRCE", axis=1),
                                                                            encoded_labels,
                                                                            test_size=0.2, random_state=0)
train_dataset = lgb.Dataset(X_train, 
                            label=y_train,
                            free_raw_data=True
                           )

test_dataset = lgb.Dataset(X_test, 
                            label=y_test,
                            free_raw_data=True
                           )

In [419]:
params = {
    'num_leaves' : 25,
    'min_data_in_leaf' : 200,
    'max_depth' : 30,
    'max_bin' : 60,
    'objective' : 'multiclass',
    'num_class' :14,
    'num_iterations' : 600,
    'learning_rate' : 0.01,
    'early_stopping_round' : 10,
    'num_threads' : 4,
    'feature_fraction' : 0.75,
    'bagging_fraction' : 0.75,
    'bagging_freq' : 10,
    'boosting' : 'dart',
}

In [None]:
focal_loss = lambda x,y: focal_loss_lgb(x, y, 0.25, 2., 14)
eval_error = lambda x,y: focal_loss_lgb_eval_error(x, y, 0.25, 2., 14)
model = lgb.train(params, train_dataset, valid_sets=[test_dataset], 
                  fobj=focal_loss,
                  feval=eval_error 
                 )

[1]	valid_0's focal_loss: 0.122254
[2]	valid_0's focal_loss: 0.120752
[3]	valid_0's focal_loss: 0.120751
[4]	valid_0's focal_loss: 0.119272
[5]	valid_0's focal_loss: 0.117817
[6]	valid_0's focal_loss: 0.116377
[7]	valid_0's focal_loss: 0.114959
[8]	valid_0's focal_loss: 0.115199
[9]	valid_0's focal_loss: 0.113797
[10]	valid_0's focal_loss: 0.112416
[11]	valid_0's focal_loss: 0.112194
[12]	valid_0's focal_loss: 0.112715
[13]	valid_0's focal_loss: 0.111349
[14]	valid_0's focal_loss: 0.111351
[15]	valid_0's focal_loss: 0.110004
[16]	valid_0's focal_loss: 0.108676
[17]	valid_0's focal_loss: 0.107367
[18]	valid_0's focal_loss: 0.106076
[19]	valid_0's focal_loss: 0.104807
[20]	valid_0's focal_loss: 0.103552
[21]	valid_0's focal_loss: 0.103877
[22]	valid_0's focal_loss: 0.102636
[23]	valid_0's focal_loss: 0.101411
[24]	valid_0's focal_loss: 0.100209
[25]	valid_0's focal_loss: 0.0990179
[26]	valid_0's focal_loss: 0.0978426
[27]	valid_0's focal_loss: 0.0966848
[28]	valid_0's focal_loss: 0.09723

[221]	valid_0's focal_loss: 0.0547767
[222]	valid_0's focal_loss: 0.0541892
[223]	valid_0's focal_loss: 0.0536092
[224]	valid_0's focal_loss: 0.0539443
[225]	valid_0's focal_loss: 0.0543156
[226]	valid_0's focal_loss: 0.0546135
[227]	valid_0's focal_loss: 0.0548554
[228]	valid_0's focal_loss: 0.0542652
[229]	valid_0's focal_loss: 0.0545563
[230]	valid_0's focal_loss: 0.0539707
[231]	valid_0's focal_loss: 0.0533911
[232]	valid_0's focal_loss: 0.0528207
[233]	valid_0's focal_loss: 0.0531037
[234]	valid_0's focal_loss: 0.0525368
[235]	valid_0's focal_loss: 0.0528371
[236]	valid_0's focal_loss: 0.0522726
[237]	valid_0's focal_loss: 0.0517162
[238]	valid_0's focal_loss: 0.0511667
[239]	valid_0's focal_loss: 0.0506237
[240]	valid_0's focal_loss: 0.0500874
[241]	valid_0's focal_loss: 0.0504202
[242]	valid_0's focal_loss: 0.0498862
[243]	valid_0's focal_loss: 0.0502055
[244]	valid_0's focal_loss: 0.0505148
[245]	valid_0's focal_loss: 0.0499788
[246]	valid_0's focal_loss: 0.0494506
[247]	valid_

[438]	valid_0's focal_loss: 0.0265919
[439]	valid_0's focal_loss: 0.0267491
[440]	valid_0's focal_loss: 0.0264999
[441]	valid_0's focal_loss: 0.0266586
[442]	valid_0's focal_loss: 0.0264097
[443]	valid_0's focal_loss: 0.0261638
[444]	valid_0's focal_loss: 0.0263235
[445]	valid_0's focal_loss: 0.0264834
[446]	valid_0's focal_loss: 0.0266329
[447]	valid_0's focal_loss: 0.0267913
[448]	valid_0's focal_loss: 0.0265409
[449]	valid_0's focal_loss: 0.026293
[450]	valid_0's focal_loss: 0.0260495
[451]	valid_0's focal_loss: 0.026198
[452]	valid_0's focal_loss: 0.0259559
[453]	valid_0's focal_loss: 0.0257158
[454]	valid_0's focal_loss: 0.0258695
[455]	valid_0's focal_loss: 0.0260222
[456]	valid_0's focal_loss: 0.0261796
[457]	valid_0's focal_loss: 0.0259366
[458]	valid_0's focal_loss: 0.0260912
[459]	valid_0's focal_loss: 0.0258498
[460]	valid_0's focal_loss: 0.0256107
[461]	valid_0's focal_loss: 0.0257637
[462]	valid_0's focal_loss: 0.0259071
[463]	valid_0's focal_loss: 0.0256687
[464]	valid_0'

In [None]:
def softmax(x):
    exp_x = np.exp(x - np.max(x))
    return exp_x / (np.sum(exp_x, axis=1, keepdims=True) + 1e-6)
prediction = softmax(model.predict(features_test))


In [None]:
test_df = pd.DataFrame(prediction, columns=label_encoder.classes_)
test_df["challenge_oid"] = features_test.index
test_df["Outlier"] = 0.

In [None]:
test_df.to_csv("gb.csv",  index=False)

In [None]:
#CODE FROM https://gist.github.com/shaypal5/94c53d765083101efc0240d776a23823
import seaborn as sns

def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14, normalize=True):
    """Prints a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.
    
    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The numpy.ndarray object returned from a call to sklearn.metrics.confusion_matrix. 
        Similarly constructed ndarrays can also be used.
    class_names: list
        An ordered list of class names, in the order they index the given confusion matrix.
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the ouputted figure,
        the second determining the vertical size. Defaults to (10,7).
    fontsize: int
        Font size for axes labels. Defaults to 14.
        
    Returns
    -------
    matplotlib.figure.Figure
        The resulting confusion matrix figure
    """
    if normalize:
        confusion_matrix = confusion_matrix.astype('float') / confusion_matrix.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
        
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names, 
    )
    fig = plt.figure(figsize=figsize)
    
    fmt = '.2f' if normalize else 'd'
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt=fmt)
    except ValueError:
        raise ValueError("Confusion matrix values must be integers.")
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return fig

In [None]:
import sklearn.metrics
df_predict = softmax(model.predict(X_test))
cm = sklearn.metrics.confusion_matrix(y_test, 
                                      np.argmax(df_predict, axis=1),
                                      )

In [None]:
print_confusion_matrix(cm, label_encoder.classes_)

In [None]:
plt.bar(*np.unique(y_test, return_counts=True))