In [3]:
DATA_PATH = '../../data/CRTS2/'

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

Load Feature Dataframes

In [5]:
# Loead transient features
indir = DATA_PATH
filename = 'transient_features.pickle' 
inpath = indir + filename
df_feat_tran = pd.read_pickle(inpath)
df_feat_tran.shape

(4384, 20)

In [6]:
# Load permanent Features
indir = DATA_PATH
filename = 'permanent_features.pickle' 
inpath = indir + filename
df_feat_perm = pd.read_pickle(inpath)
df_feat_perm.shape

(4384, 20)

Create inputs and outputs

In [7]:
# Add output class '1' to transient objects
df_feat_tran['is_transient'] = 1
# Add output class '0' to permanent objects
df_feat_perm['is_transient'] = 0

In [8]:
# Merge dataframes
df = df_feat_tran.append(df_feat_perm, ignore_index=True)

In [9]:
# Remove IDs
df = df.drop(['ID'], axis=1)

In [10]:
# Obtain X and y
X = df.drop(['is_transient'], axis=1).as_matrix()
y = df['is_transient'].as_matrix()

Split in Test, Validation & Train Sets

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=14
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.25, random_state=68
)

In [23]:
X_train.shape, y_train.shape

((5260, 19), (5260,))

In [24]:
X_valid.shape, y_valid.shape

((1754, 19), (1754,))

In [25]:
X_test.shape, y_test.shape

((1754, 19), (1754,))

In [26]:
# Create scaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

Classify using SVC

In [27]:
clf = SVC(random_state=0)

In [28]:
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [29]:
clf.score(X_valid, y_valid)

0.83010262257696699

In [30]:
clf.score(X_test, y_test)

0.8472063854047891

Classify using RF

In [31]:
clf = RandomForestClassifier(max_depth=5, random_state=0)

In [32]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [33]:
clf.score(X_valid, y_valid)

0.82782212086659068

In [34]:
clf.score(X_test, y_test)

0.81584948688711512

Classify using NN

In [35]:
clf = MLPClassifier(alpha = 0.01, hidden_layer_sizes=(100, 5))

In [36]:
clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 5), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [38]:
clf.score(X_valid, y_valid)

0.84150513112884839

In [39]:
clf.score(X_test, y_test)

0.84378563283922459