In [1]:
DATA_PATH = '../../data/CRTS2/'

In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

Load Feature Dataframes

In [63]:
# Loead transient features
indir = DATA_PATH
filename = 'transient_features.pickle' 
inpath = indir + filename
df_feat_tran = pd.read_pickle(inpath)
df_feat_tran.shape

(4384, 20)

In [64]:
# Load permanent Features
indir = DATA_PATH
filename = 'permanent_features.pickle' 
inpath = indir + filename
df_feat_perm = pd.read_pickle(inpath)
df_feat_perm.shape

(4384, 20)

Create inputs and outputs

In [14]:
# Add output class '1' to transient objects
df_feat_tran['is_transient'] = 1
# Add output class '0' to permanent objects
df_feat_perm['is_transient'] = 0

In [16]:
# Merge dataframes
df = df_feat_tran.append(df_feat_perm, ignore_index=True)

In [18]:
# Remove IDs
df = df.drop(['ID'], axis=1)

In [68]:
# Obtain X and y
X = df.drop(['is_transient'], axis=1).as_matrix()
y = df['is_transient'].as_matrix()

Split in Test & Train Sets

In [103]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [104]:
X_train.shape, y_train.shape

((5874, 19), (5874,))

In [105]:
X_test.shape, y_test.shape

((2894, 19), (2894,))

In [106]:
# Create scaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Classify using SVC

In [119]:
clf = SVC(random_state=0)

In [130]:
pipe = make_pipeline(scaler, clf)
scores = cross_val_score(pipe, X, y, cv=10)
scores, scores.mean()

(array([ 0.8405467 ,  0.86446469,  0.82232346,  0.84396355,  0.85958904,
         0.81506849,  0.87442922,  0.83219178,  0.82534247,  0.8630137 ]),
 0.84409331086633177)

In [121]:
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [122]:
clf.score(X_test, y_test)

0.83655839668279197

Classify using RF

In [112]:
clf = RandomForestClassifier(max_depth=5, random_state=0)

In [131]:
pipe = make_pipeline(scaler, clf)
scores = cross_val_score(pipe, X, y, cv=10)
scores, scores.mean()

(array([ 0.83371298,  0.85876993,  0.82004556,  0.85193622,  0.85730594,
         0.81621005,  0.88127854,  0.82191781,  0.82648402,  0.84360731]),
 0.84112683454509507)

In [124]:
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [125]:
clf.score(X_test, y_test)

0.83655839668279197

Classify using NN

In [126]:
clf = MLPClassifier(alpha = 0.01, hidden_layer_sizes=(100, 5))

In [132]:
pipe = make_pipeline(scaler, clf)
scores = cross_val_score(pipe, X, y, cv=10)
scores, scores.mean()

(array([ 0.83599089,  0.87243736,  0.82118451,  0.85079727,  0.85502283,
         0.82077626,  0.89269406,  0.8196347 ,  0.81278539,  0.85273973]),
 0.84340629908155729)

In [128]:
clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 5), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [129]:
clf.score(X_test, y_test)

0.84174153420870768