In [51]:
import numpy as np
import cPickle
import utils

In [10]:
# Fixed seed for repeatability: supplied as the single `random_state` value in
# the parameter grids of the randomized estimators tuned in this notebook.
random_state = 12932

In [62]:
# Load the train and test sets. Each pickle file unpickles to a two-element
# (X, y) pair: a feature table and a target table.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# come from a trusted source.
with open('train.pkl', 'rb') as fp:
    train_X_all, train_y_all = cPickle.load(fp)
with open('test.pkl', 'rb') as fp:
    test_X_all, test_y_all = cPickle.load(fp)

In [63]:
# Sanity check: display the shapes of the feature and target tables of both splits.
train_X_all.shape, train_y_all.shape, test_X_all.shape, test_y_all.shape

((20631, 71), (20631, 3), (100, 71), (100, 3))

### Regression

In [129]:
# Regression setup: the target is the 'RUL' column of the target tables.
train_y = train_y_all['RUL']
test_y = test_y_all['RUL']
# Pick 35 feature columns from the training features only, then apply the same
# column subset to both splits so train and test stay aligned.
# NOTE(review): select_top_features comes from the local `utils` module; its
# ranking criterion and how it resolves target_col='RUL' against train_X_all
# are not visible here -- confirm.
top_features = utils.select_top_features(train_X_all, target_col='RUL', n=35)
train_X = train_X_all[top_features]
test_X = test_X_all[top_features]

In [116]:
def sum_abs_ruldiff(estimator, X=None, y=None):
    """Return the sum of absolute differences between predictions and targets.

    Parameters
    ----------
    estimator : object
        Fitted model exposing ``predict(X)``.
    X : optional
        Features to predict on. Defaults to the notebook-global ``test_X``.
    y : optional
        Iterable of true target values aligned with ``X``. Defaults to the
        notebook-global ``test_y``.

    Returns
    -------
    The sum of ``abs(predicted - actual)`` over all paired samples
    (``0`` when there are no samples).
    """
    # Fall back to the notebook globals only when the caller passes nothing, so
    # existing one-argument calls behave exactly as before. Passing X/y
    # explicitly avoids depending on whichever cell last rebound test_X/test_y.
    if X is None:
        X = test_X
    if y is None:
        y = test_y
    return sum(abs(predicted - actual)
               for predicted, actual in zip(estimator.predict(X), y))

In [131]:
# Random forest regressor
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the
# old sklearn.grid_search module is deprecated and was removed in 0.20. Prefer
# the new location and fall back for older installs. Later cells reuse this
# GridSearchCV name, so this cell must run before them.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Find best hyperparameters with grid search
# Setting used in azureml sample: n_tree=8, max_depth=32, n_random_splits=128, min_n_sample=1
parameters = {
    'n_estimators': [6, 8, 10, 15],
    'max_depth': [16, 24, 32, 48],
    'min_samples_leaf': [1, 2, 4],
    # Single-value entry: pins the seed for every candidate in the grid.
    'random_state': [random_state],
}
estimator = GridSearchCV(RandomForestRegressor(), parameters, n_jobs=-1)
estimator.fit(train_X, train_y)
# Report the selected grid point and how it performs on the test split.
print('Best params', estimator.best_params_)
print('Score', estimator.score(test_X, test_y))
print('Sum abs RUL diff', sum_abs_ruldiff(estimator))

('Best params', {'n_estimators': 15, 'random_state': 12932, 'max_depth': 16, 'min_samples_leaf': 4})
('Score', 0.50960074417892909)
('Sum abs RUL diff', 2066.301179204298)


In [132]:
# Gradient boosting regression
# Relies on GridSearchCV imported in the random forest cell and on
# train_X/train_y/test_X/test_y from the regression setup cell.
from sklearn.ensemble import GradientBoostingRegressor

# Grid over loss, learning rate, ensemble size and tree depth; the seed is
# pinned through a single-value 'random_state' entry.
parameters = {
    'loss': ['ls', 'lad'],
    'learning_rate': [1e-2, 0.1, 0.2, 0.3],
    'n_estimators': [80, 100, 150],
    'max_depth': [3, 5, 10],
    'random_state': [random_state]
}
estimator = GridSearchCV(GradientBoostingRegressor(), parameters, n_jobs=-1)
estimator.fit(train_X, train_y)
# Report the selected grid point and how it performs on the test split.
print('Best params', estimator.best_params_)
print('Score', estimator.score(test_X, test_y))
print('Sum abs RUL diff', sum_abs_ruldiff(estimator))

('Best params', {'n_estimators': 80, 'loss': 'lad', 'learning_rate': 0.1, 'random_state': 12932, 'max_depth': 3})
('Score', 0.70571124000148366)
('Sum abs RUL diff', 1673.2544408767842)


In [130]:
# Ridge regression
# Relies on GridSearchCV imported in the random forest cell and on
# train_X/train_y/test_X/test_y from the regression setup cell.
from sklearn.linear_model import Ridge

# Only the regularization strength is searched; no seed is passed for this model.
parameters = {
    'alpha': [0.01, 0.1, 0.5, 1.0, 1.5, 2.0]
}
estimator = GridSearchCV(Ridge(), parameters, n_jobs=-1)
estimator.fit(train_X, train_y)
# Report the selected grid point and how it performs on the test split.
print('Best params', estimator.best_params_)
print('Score', estimator.score(test_X, test_y))
print('Sum abs RUL diff', sum_abs_ruldiff(estimator))

('Best params', {'alpha': 2.0})
('Score', 0.42731130175601495)
('Sum abs RUL diff', 2641.7131621300728)


### Binary Classification

In [120]:
# Binary classification setup: the target is the 'label1' column of the target tables.
# This rebinds train_X/train_y/test_X/test_y, replacing the regression versions.
# sum_abs_ruldiff reads test_X/test_y at call time, so after this cell it would
# evaluate against the classification data.
train_y = train_y_all['label1']
test_y = test_y_all['label1']
# Pick 35 feature columns from the training features only, then apply the same
# column subset to both splits.
# NOTE(review): the selection criterion inside utils.select_top_features is not
# visible here -- confirm.
top_features = utils.select_top_features(train_X_all, target_col='label1', n=35)
train_X = train_X_all[top_features]
test_X = test_X_all[top_features]

In [122]:
# Logistic regression
# Relies on GridSearchCV imported in the random forest cell and on
# train_X/train_y/test_X/test_y from the binary classification setup cell.
from sklearn.linear_model import LogisticRegression

# Grid over the 'tol' and 'C' settings; the seed is pinned through a
# single-value 'random_state' entry.
parameters = {
    'tol': [1e-5, 1e-4, 1e-3, 1e-2],
    'C': [1e-2, 1e-1, 1.0, 2.0, 5.0],
    'random_state': [random_state]
}
estimator = GridSearchCV(LogisticRegression(), parameters, n_jobs=-1)
estimator.fit(train_X, train_y)
# Report the selected grid point and its score on the test split.
# NOTE(review): the score is presumably mean accuracy (the classifier default) -- confirm.
print('Best params', estimator.best_params_)
print('Score', estimator.score(test_X, test_y))

('Best params', {'C': 5.0, 'random_state': 12932, 'tol': 1e-05})
('Score', 0.92000000000000004)


In [123]:
# Gradient boosting
# Relies on GridSearchCV imported in the random forest cell and on
# train_X/train_y/test_X/test_y from the binary classification setup cell.
from sklearn.ensemble import GradientBoostingClassifier

# Grid over learning rate, ensemble size and tree depth; 'loss' and
# 'random_state' are single-value entries, so they are fixed for every candidate.
parameters = {
    'loss': ['deviance'],
    'learning_rate': [1e-2, 0.1, 0.2, 0.3],
    'n_estimators': [80, 100, 150, 200],
    'max_depth': [3, 5, 10, 20],
    'random_state': [random_state]
}
estimator = GridSearchCV(GradientBoostingClassifier(), parameters, n_jobs=-1)
estimator.fit(train_X, train_y)
# Report the selected grid point and its score on the test split.
print('Best params', estimator.best_params_)
print('Score', estimator.score(test_X, test_y))

('Best params', {'n_estimators': 100, 'loss': 'deviance', 'learning_rate': 0.2, 'random_state': 12932, 'max_depth': 3})
('Score', 0.92000000000000004)


In [124]:
# Extra trees classifier
# Relies on GridSearchCV imported in the random forest cell and on
# train_X/train_y/test_X/test_y from the binary classification setup cell.
from sklearn.ensemble import ExtraTreesClassifier

# Grid over ensemble size, tree depth and minimum leaf size; the seed is
# pinned through a single-value 'random_state' entry.
parameters = {
    'n_estimators': [8,10,15, 20], 
    'max_depth': [16, 24, 32, 48], 
    'min_samples_leaf': [1,2,4],
    'random_state': [random_state] 
}
estimator = GridSearchCV(ExtraTreesClassifier(), parameters, n_jobs=-1)
estimator.fit(train_X, train_y)
# Report the selected grid point and its score on the test split.
print('Best params', estimator.best_params_)
print('Score', estimator.score(test_X, test_y))

('Best params', {'n_estimators': 15, 'random_state': 12932, 'max_depth': 48, 'min_samples_leaf': 2})
('Score', 0.90000000000000002)
