In [2]:
import preprocessing as pp

import numpy as np

from sklearn.preprocessing import RobustScaler, FunctionTransformer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression
from sklearn.feature_selection import SelectPercentile, SelectFromModel, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.pipeline import FeatureUnion, Pipeline

import lightgbm as lgb

from xgboost import XGBRegressor

In [3]:
# Load the training and test frames via the project's preprocessing helper.
train, test = pp.read_train_test(
    train_file='train.csv',
    test_file='test.csv',
)

In [4]:
# Keep the test identifiers aside; they are passed to the submission writer later.
ids = list(test['ID'])

In [5]:
# Regression label: the raw target is modelled on a log1p scale.
train_y = np.log1p(train['target']).values

# Predictor matrices: everything except the identifier (and the label for train).
train_X = train.drop(['ID', 'target'], axis='columns')
test_X = test.drop(['ID'], axis='columns')

In [6]:
# Optional scaling step (default RobustScaler settings).
scaler = RobustScaler()

In [7]:
# Cut-off has the form p * (1 - p) with p = 0.98; features whose variance
# does not exceed it are dropped by the selector below.
threshold = .98 * (1 - .98)
variance = VarianceThreshold(threshold=threshold)

In [8]:
# Univariate filter: keep the 4000 features with the best f_regression score.
anova_filter = SelectKBest(score_func=f_regression, k=4000)

In [9]:
# Random forest candidate: 50 trees with a fixed seed.
model_rforest = RandomForestRegressor(
    n_estimators=50,
    max_depth=20,
    max_features=0.7,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=2018,
)

In [10]:
# LightGBM candidate: 500 boosting rounds at a small learning rate, with
# row subsampling (bagging) and column subsampling (feature_fraction).
#
# The bagging frequency must be passed as `bagging_freq`. LightGBM does not
# recognise `bagging_frequency`, and while `bagging_freq` stays at its
# default of 0 bagging is disabled, so `bagging_fraction` and `bagging_seed`
# have no effect.
model_lgb = lgb.LGBMRegressor(objective='regression',
                              metric="rmse",
                              n_estimators = 500,
                              num_leaves = 30,
                              learning_rate = 0.01,
                              bagging_fraction = 0.7,
                              feature_fraction = 0.7,
                              bagging_freq = 5,  # re-sample the rows every 5 iterations
                              bagging_seed = 2018,
                              verbosity = -1)

In [11]:
# XGBoost candidate: 100 rounds with column subsampling per tree and per level.
model_xgb = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    colsample_bytree=0.7,
    colsample_bylevel=0.7,
)

In [12]:
# Linear candidate with default hyper-parameters.
model_byr = BayesianRidge()

# Univariate selector: keep the top 85 percent of features by mutual information.
percentile = SelectPercentile(score_func=mutual_info_regression, percentile=85)

# Model-based selector driven by an extra-trees regressor.
from_model = SelectFromModel(
    estimator=ExtraTreesRegressor(
        n_estimators=200,
        max_depth=20,
        max_features=0.5,
        n_jobs=-1,
        random_state=0,
    )
)

In [13]:
# Stateless transformer that applies log1p to its input.
log_transformer = FunctionTransformer(func=np.log1p)

In [14]:
# Named selectors whose outputs are concatenated side by side by the union.
feature_selection = [
    ('percentile', percentile),
    ('from_model', from_model),
]
feature_selection_union = FeatureUnion(transformer_list=feature_selection)

In [22]:
# Preprocessing steps in execution order. Commented entries are alternatives
# that are currently switched off.
estimators = [
    ('low_variance', variance),
    # ('scaler', scaler),
    # ('anova', anova_filter),
    # ('log_transform', log_transformer),
    # ('percentile', percentile),
    ('from_model', from_model),
    # ('feature_selection', feature_selection_union),
]

In [23]:
# Assemble the preprocessing pipeline and fit it on the training data only.
pipe = Pipeline(steps=estimators)
pipe.fit(train_X, train_y)

Pipeline(memory=None,
     steps=[('low_variance', VarianceThreshold(threshold=0.019600000000000017)), ('from_model', SelectFromModel(estimator=ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=20,
          max_features=0.5, max_leaf_nodes=None, min_impurity_decrease=0.0,
          min_impurity_split=None, min_... random_state=0, verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold=None))])

In [24]:
# Push both matrices through the fitted pipeline (train first, then test).
train_X_reduced, test_X_reduced = (
    pipe.transform(frame) for frame in (train_X, test_X)
)

In [25]:
# Sanity check: both reduced matrices should have the same number of columns.
print(train_X_reduced.shape, test_X_reduced.shape, sep='\n')

(4459, 1000)
(49342, 1000)


In [26]:
# (name, estimator) pairs to cross-validate. Commented entries are candidates
# that are currently switched off.
tree_models = [
    ("lgb", model_lgb),
    # ("rf", model_rforest),
    # ("xgb", model_xgb),
    # ("byr", model_byr),
]

In [None]:
# Score every listed model on the reduced training matrix. The trailing 5 is
# passed positionally to the helper (presumably the number of folds - confirm
# against pp.get_validation_scores).
cross_val_table = pp.get_validation_scores(
    tree_models,
    train_X_reduced,
    train_y,
    5,
)
print(cross_val_table)

In [21]:
# Write the submission file through the project helper.
# Needs train_X_reduced / test_X_reduced from the pipeline transform cell, so
# run the notebook top to bottom before executing this cell.
pp.make_submission(
    model_lgb,
    train_X_reduced,
    train_y,
    test_X_reduced,
    ids,
    filename='submission.csv',
)