In [22]:
import h5py, numpy, sklearn.linear_model, sklearn.cross_validation, sklearn.metrics

In [35]:
with h5py.File('../data/dataset.h5') as f:
    raw_astro_features = f['features'][:, :4]
    dist_features = f['features'][:, 4]
    image_features = f['features'][:, 5:]
    
    w1_w2 = raw_astro_features[:, 0] - raw_astro_features[:, 1]
    w2_w3 = raw_astro_features[:, 1] - raw_astro_features[:, 2]
    lrblob = numpy.hypot(w2_w3 - 4.5, w1_w2 - 0.5)
    urblob = numpy.hypot(w2_w3 - 3.0, w1_w2 - 1.2)
    
    features_linear = f['features'][:]
    features_nonlinear = numpy.hstack([
            raw_astro_features,
            dist_features.reshape((-1, 1)),
            w1_w2.reshape((-1, 1)),
            w2_w3.reshape((-1, 1)),
            lrblob.reshape((-1, 1)),
            urblob.reshape((-1, 1)),
            image_features,
    ])
    labels = f['labels'].value

In [31]:
x_train, x_test, t_train, t_test = sklearn.cross_validation.train_test_split(
        numpy.arange(features.shape[0]), labels, test_size=0.2)

In [33]:
lr = sklearn.linear_model.LogisticRegression(C=100.0, class_weight='balanced')
lr.fit(features_linear[x_train], t_train)
cm = sklearn.metrics.confusion_matrix(t_test, lr.predict(features_linear[x_test]))
tp = cm[1, 1]
n, p = cm.sum(axis=1)
tn = cm[0, 0]
ba = (tp / p + tn / n) / 2
print('Linear features, balanced accuracy: {:.02%}'.format(ba))

Linear features, balanced accuracy: 91.58%


In [37]:
lrnl = sklearn.linear_model.LogisticRegression(C=100.0, class_weight='balanced')
lrnl.fit(features_nonlinear[x_train], t_train)
cm = sklearn.metrics.confusion_matrix(t_test, lrnl.predict(features_nonlinear[x_test]))
tp = cm[1, 1]
n, p = cm.sum(axis=1)
tn = cm[0, 0]
ba = (tp / p + tn / n) / 2
print('Nonlinear features, balanced accuracy: {:.02%}'.format(ba))
print(cm)

Nonlinear features, balanced accuracy: 91.32%
[[3953  416]
 [  36  423]]
