The idea of this kernel is to establish baseline scores by taking the basic features (shown in inversion's kernel) and simply putting them through 5-fold CV. Any model with more extensive feature engineering and a more elaborate CV strategy can use these baseline scores as a starting point.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
## Parts of code taken from https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
## Basic features taken from https://www.kaggle.com/inversion/basic-feature-benchmark

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.display.precision = 15
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR
from sklearn.metrics import mean_absolute_error
from scipy import stats
from scipy.stats import norm, skew #for some statistics
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training data with explicit column dtypes: int16 for the acoustic
# signal (narrower than pandas' default int64) and float64 for the target.
train = pd.read_csv(
    '../input/train.csv',
    dtype={
        'acoustic_data': np.int16,
        'time_to_failure': np.float64,
    },
)


In [None]:
# Cut the training signal into consecutive, non-overlapping windows of `rows`
# samples each; a trailing partial window (if any) is not used.
rows = 150_000
segments = len(train) // rows

feature_names = ['ave', 'std', 'max', 'min']

# One row per window: four summary statistics as features, one target value.
X_train = pd.DataFrame(index=range(segments), dtype=np.float64, columns=feature_names)
y_train = pd.DataFrame(index=range(segments), dtype=np.float64, columns=['time_to_failure'])

for seg_idx in tqdm(range(segments)):
    start = seg_idx * rows
    window = train.iloc[start:start + rows]
    signal = window['acoustic_data'].values

    # The target for a window is the time-to-failure at its final sample.
    y_train.loc[seg_idx, 'time_to_failure'] = window['time_to_failure'].values[-1]

    window_stats = {
        'ave': signal.mean(),
        'std': signal.std(),
        'max': signal.max(),
        'min': signal.min(),
    }
    for name, value in window_stats.items():
        X_train.loc[seg_idx, name] = value

In [None]:
# Standardise the training features. The fitted `scaler` is kept so the same
# transformation can be applied to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

**Define Function for Cross Validation**

In [None]:
n_folds = 5


def mae_cv(model):
    """Return the per-fold mean absolute error of `model` under shuffled K-fold CV.

    The model is evaluated on the module-level `X_train_scaled` / `y_train`
    using `n_folds` folds.

    Parameters
    ----------
    model : estimator
        Any scikit-learn compatible regressor; `cross_val_score` clones it
        for every fold.

    Returns
    -------
    numpy.ndarray
        One MAE value per fold (positive; lower is better).
    """
    # Pass the KFold splitter itself. Calling `.get_n_splits(...)` on it
    # returns the plain integer number of folds, which makes
    # `cross_val_score` fall back to an unshuffled split and silently drops
    # `shuffle=True` / `random_state=42`.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    # Use a 1-D target (as the final model fits in this file do) so
    # estimators such as SVR do not receive a column vector.
    targets = y_train.values.ravel()

    # The scorer returns negated MAE, so flip the sign back.
    mae = -cross_val_score(
        model,
        X_train_scaled,
        targets,
        scoring="neg_mean_absolute_error",
        cv=kf,
    )
    return mae

Let's try some simple models and view their CV scores

In [None]:
# Lasso baseline. RobustScaler sits inside the pipeline, so it is fitted
# together with the regressor whenever the pipeline is fitted.
lasso = make_pipeline(
    RobustScaler(),
    Lasso(alpha=0.0005, random_state=1),
)
score = mae_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

In [None]:
# Elastic-net baseline (mostly L1: l1_ratio=0.9), again behind a RobustScaler.
ENet = make_pipeline(
    RobustScaler(),
    ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3),
)
score = mae_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

In [None]:
# Kernel ridge regression with a cubic polynomial kernel.
KRR = KernelRidge(
    alpha=0.8,
    kernel='polynomial',
    degree=3,
    coef0=3.5,
)
score = mae_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

In [None]:
# Epsilon-insensitive support vector regression.
SVReg = SVR(
    gamma='scale',
    C=1.0,
    epsilon=0.2,
)
score = mae_cv(SVReg)
print("SVR score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

In [None]:
# Nu-parameterised support vector regression.
NuSVReg = NuSVR(
    gamma='scale',
    C=1.0,
    nu=0.1,
)
score = mae_cv(NuSVReg)
print("NuSVR score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

**Create Class for simple averaging of models**

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor whose prediction is the plain mean of its base models.

    Parameters
    ----------
    models : sequence of estimators
        Template estimators. `fit` trains clones of them and stores the
        clones in `models_`; the templates themselves are not fitted here.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone each template model, fit every clone on (X, y) and return self."""
        # Work on fresh clones so the estimators passed in stay unfitted.
        self.models_ = list(map(clone, self.models))

        for estimator in self.models_:
            estimator.fit(X, y)

        return self

    def predict(self, X):
        """Return the row-wise mean of the fitted base models' predictions."""
        # One column per base model, one row per sample.
        per_model = [estimator.predict(X) for estimator in self.models_]
        return np.column_stack(per_model).mean(axis=1)

In [None]:
# Cross-validate the unweighted average of the three kernel-based models.
averaged_models = AveragingModels(
    models=(KRR, SVReg, NuSVReg),
)

score = mae_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

**Fit models to training data**

In [None]:
# Fit each model on the full (scaled) training set, using a 1-D target array.
full_targets = y_train.values.flatten()
for estimator in (NuSVReg, SVReg, KRR, ENet):
    estimator.fit(X_train_scaled, full_targets)

Make predictions

In [None]:
# The sample submission supplies the list of test segment ids (as its index).
submission = pd.read_csv('../input/sample_submission.csv', index_col='seg_id')
X_test = pd.DataFrame(columns=X_train.columns, dtype=np.float64, index=submission.index)

# Compute the same four summary statistics for every test segment file.
for seg_id in X_test.index:
    segment_frame = pd.read_csv('../input/test/' + seg_id + '.csv')
    signal = segment_frame['acoustic_data'].values

    segment_stats = {
        'ave': signal.mean(),
        'std': signal.std(),
        'max': signal.max(),
        'min': signal.min(),
    }
    for name, value in segment_stats.items():
        X_test.loc[seg_id, name] = value

# Reuse the scaler fitted on the training features.
X_test_scaled = scaler.transform(X_test)

# Final prediction: unweighted mean of the NuSVR, SVR and kernel ridge outputs.
pred_nusvr = NuSVReg.predict(X_test_scaled)
pred_svr = SVReg.predict(X_test_scaled)
pred_krr = KRR.predict(X_test_scaled)
submission['time_to_failure'] = (pred_nusvr + pred_svr + pred_krr) / 3.0

submission.to_csv('submission.csv')