In [1]:
# Ready to use kaggle api:
# https://github.com/Kaggle/kaggle-api

!kaggle competitions files -c house-prices-advanced-regression-techniques

name                    size  creationDate         
---------------------  -----  -------------------  
data_description.txt    13KB  2019-12-15 21:33:33  
sample_submission.csv   31KB  2019-12-15 21:33:33  
train.csv              450KB  2019-12-15 21:33:33  
test.csv               441KB  2019-12-15 21:33:33  


In [7]:
# You must agree the competition rules prior to the downloading dataset:
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data

!kaggle competitions download -c house-prices-advanced-regression-techniques -f train.csv -p data
!kaggle competitions download -c house-prices-advanced-regression-techniques -f test.csv -p data
!kaggle competitions download -c house-prices-advanced-regression-techniques -f data_description.txt -p data

Downloading train.csv to data
  0%|                                                | 0.00/450k [00:00<?, ?B/s]
100%|████████████████████████████████████████| 450k/450k [00:00<00:00, 7.99MB/s]
Downloading test.csv to data
  0%|                                                | 0.00/441k [00:00<?, ?B/s]
100%|████████████████████████████████████████| 441k/441k [00:00<00:00, 6.92MB/s]
Downloading data_description.txt to data
  0%|                                               | 0.00/13.1k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 13.1k/13.1k [00:00<00:00, 11.1MB/s]


In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, GridSearchCV
from sklearn import preprocessing

train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
data = pd.concat([train, test], sort=False).reset_index(drop=True)
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0


In [2]:
"""Preprocessing
"""

categorical_columns = []

for col, dtype in zip(data.columns, data.dtypes):
    if dtype == "object":
        data[col] = data[col].replace(float("nan"), "NaN")
        categorical_columns.append(col)
    else:
        data[col] = data[col].fillna(-1)

le = preprocessing.LabelEncoder()
for c in categorical_columns:
    le.fit(list(set(data[c])))
    data[c] = le.transform(data[c])
    
train, test = data[:len(train)], data[len(train):]

X = train.drop(columns=["SalePrice", "Id"])
y = train["SalePrice"].astype(float)

test = test.drop(columns=["SalePrice", "Id"])

In [3]:
"""Cut outliers
"""

from sklearn.ensemble import IsolationForest

clf = IsolationForest(max_samples=100, random_state=42)
clf.fit(X)
y_noano = clf.predict(X)
y_noano = pd.DataFrame(y_noano, columns=["Top"])
X = X.iloc[y_noano[y_noano["Top"] == 1].index.values].reset_index(drop=True)
y = y.iloc[y_noano[y_noano["Top"] == 1].index.values].reset_index(drop=True)

print("Number of Outliers:", y_noano[y_noano['Top'] == -1].shape[0])
print("Number of rows without outliers:", X.shape[0])

Number of Outliers: 111
Number of rows without outliers: 1349


In [4]:
"""Scaling
"""

# mat_train = np.matrix(train)
# mat_test = np.matrix(test)
# mat_y = np.array(train["SalePrice"]).reshape((len(train), 1))

mms_y = preprocessing.MinMaxScaler()
mms_y.fit(y.values.reshape((len(X), 1)))

mms = preprocessing.MinMaxScaler()
mms.fit(X.values)

mms_test = preprocessing.MinMaxScaler()
mms_test.fit(test.values)

X = pd.DataFrame(mms.transform(X.values), columns=X.columns)
test = pd.DataFrame(mms_test.transform(test.values), columns=test.columns)

X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.235294,0.8,0.210191,0.062802,1.0,0.5,1.0,1.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.333333,0.0,0.090909,0.5,1.0,0.8
1,0.0,0.8,0.257962,0.072904,1.0,0.5,1.0,1.0,0.0,0.5,...,0.0,0.0,1.0,1.0,0.333333,0.0,0.363636,0.25,1.0,0.8
2,0.235294,0.8,0.219745,0.087396,1.0,0.5,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.333333,0.0,0.727273,0.5,1.0,0.8
3,0.294118,0.8,0.194268,0.072464,1.0,0.5,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.333333,0.0,0.090909,0.0,1.0,0.0
4,0.235294,0.8,0.270701,0.113835,1.0,0.5,0.0,1.0,0.0,0.5,...,0.0,0.0,1.0,1.0,0.333333,0.0,1.0,0.5,1.0,0.8


In [6]:
%%time

from sklearn.linear_model import ElasticNet
import warnings
warnings.filterwarnings('ignore')

enet_model = ElasticNet(
    alpha=0.0005,
    l1_ratio=0.9,
    random_state=3,
)

enet_model.fit(X, y)
r2_score(enet_model.predict(X), y)

CPU times: user 276 ms, sys: 85.1 ms, total: 361 ms
Wall time: 68.8 ms


0.8955562579705032

In [6]:
%%time

from catboost import CatBoostRegressor

cb_model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=10,
    random_seed=42,
    bagging_temperature=0.2,
    od_type="Iter",
    metric_period=50,
    od_wait=20,
)

cb_model.fit(X, y)
r2_score(cb_model.predict(X), y)

0:	learn: 65718.3021685	total: 68ms	remaining: 33.9s
50:	learn: 22014.5119625	total: 946ms	remaining: 8.32s
100:	learn: 14727.6169486	total: 2.04s	remaining: 8.08s
150:	learn: 11667.9680263	total: 3.27s	remaining: 7.55s
200:	learn: 9587.5430110	total: 4.75s	remaining: 7.07s
250:	learn: 7806.1061150	total: 6.56s	remaining: 6.51s
300:	learn: 6447.1232646	total: 8.28s	remaining: 5.47s
350:	learn: 5470.8751770	total: 10.1s	remaining: 4.3s
400:	learn: 4598.8963814	total: 12s	remaining: 2.96s
450:	learn: 3931.3566761	total: 14s	remaining: 1.52s
499:	learn: 3460.4252215	total: 16s	remaining: 0us
CPU times: user 1min 3s, sys: 41.4 s, total: 1min 45s
Wall time: 16.2 s


0.99736437394909

In [22]:
%%time

import tensorflow as tf
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

# import logging

# logger = logging.getLogger(__name__)

# K = tf.keras.backend
# def root_mean_squared_error(y_true, y_pred):
#     """https://stackoverflow.com/questions/43855162/rmse-rmsle-loss-function-in-keras/43863854"""
#     return K.sqrt(K.mean(K.square(y_pred - y_true))) 

# learning_rate = 0.1
# warmup_epochs = 1

# def warmup(epochs):
#     _learning_rate = learning_rate
#     if epochs < warmup_epochs:
#         _learning_rate *= 0.1
#     logger.debug(f"Learning rate = {_learning_rate}")
#     return _learning_rate

inputs = tf.keras.Input((X.shape[1],), name="mlp_input")
x = tf.keras.layers.Dense(128, activation="relu", name="dense1")(inputs)
x = tf.keras.layers.Dropout(0.2, name="dropout1")(x)
x = tf.keras.layers.Dense(64, activation="relu", name="dense2")(x)
x = tf.keras.layers.Dropout(0.2, name="dropout2")(x)
x = tf.keras.layers.Dense(32, activation="relu", name="dense3")(x)
x = tf.keras.layers.Dropout(0.2, name="dropout3")(x)
outputs = tf.keras.layers.Dense(1, name="mlp_output")(x)
mlp_model = tf.keras.Model(inputs, outputs, name="mlp_model")
# mlp_model.compile(optimizer="rmsprop", loss=root_mean_squared_error, metrics=["mae", "mse", "acc"])
mlp_model.compile(
    optimizer=tf.keras.optimizers.RMSprop(0.001), 
    loss="mse", 
    metrics=["mae"],
)

mlp_model.summary()

history = mlp_model.fit(
    X, 
    y,
    epochs=1000,
    validation_split=0.2,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=20),
#         tf.keras.callbacks.LearningRateScheduler(warmup),
        tfdocs.modeling.EpochDots(),
    ],
    verbose=0,  # verbosity: 0-2
)



r2_score(mlp_model.predict(X), y)

Model: "mlp_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
mlp_input (InputLayer)       [(None, 79)]              0         
_________________________________________________________________
dense1 (Dense)               (None, 128)               10240     
_________________________________________________________________
dropout1 (Dropout)           (None, 128)               0         
_________________________________________________________________
dense2 (Dense)               (None, 64)                8256      
_________________________________________________________________
dropout2 (Dropout)           (None, 64)                0         
_________________________________________________________________
dense3 (Dense)               (None, 32)                2080      
_________________________________________________________________
dropout3 (Dropout)           (None, 32)                0 

0.8869135649166006

# Sacred based

In [11]:
# Ready to use nepture project: NEPTUNE_API_TOKEN and NEPTURE_PROJECT_NAME
# https://docs.neptune.ai/#get-started
# https://docs.neptune.ai/integrations/sacred.html
from sacred import Experiment
from neptunecontrib.monitoring.sacred import NeptuneObserver
import os
import warnings
warnings.filterwarnings('ignore')

ex = Experiment("elastic_net", interactive=True)

# ex.observers.append(FileStorageObserver("sacredruns"))
# Observerを追加するだけなのでカンタン
ex.observers.append(NeptuneObserver(api_token=os.environ["NEPTUNE_API_TOKEN"],
                                    project_name=os.environ["NEPTURE_PROJECT_NAME"]))

@ex.config
def parameters():
    alpha = 0.0005
    l1_ratio = 0.9
    random_state = 3

@ex.capture
def get_model(alpha: float, l1_ratio: float, random_state: int):
    from sklearn.linear_model import ElasticNet
    print("Parameters:", alpha, l1_ratio, random_state)
    return ElasticNet(
        alpha=alpha,
        l1_ratio=l1_ratio,
        random_state=random_state,
    )

@ex.main
def run():
    model = get_model()  # Parameters are injected implicitly.
    model.fit(X, y)
    return r2_score(model.predict(X), y)

ex.run(config_updates=dict(l1_ratio=0.01))

INFO - elastic_net - Running command 'run'


https://ui.neptune.ai/chck/ml-management-tools/e/MLMAN-10


INFO - elastic_net - Started
INFO - elastic_net - Result: 0.8940958224647267
INFO - elastic_net - Completed after 0:00:02


Parameters: 0.0005 0.01 3


<sacred.run.Run at 0x13170fd90>

In [8]:
# Ready to use nepture project: NEPTUNE_API_TOKEN and NEPTURE_PROJECT_NAME
# https://docs.neptune.ai/#get-started
# https://docs.neptune.ai/integrations/sacred.html
from sacred import Experiment
from neptunecontrib.monitoring.sacred import NeptuneObserver
import os
import warnings
warnings.filterwarnings('ignore')

ex = Experiment("catboost", interactive=True)

# ex.observers.append(FileStorageObserver("sacredruns"))
# Observerを追加するだけなのでカンタン
ex.observers.append(NeptuneObserver(api_token=os.environ["NEPTUNE_API_TOKEN"],
                                    project_name=os.environ["NEPTURE_PROJECT_NAME"]))

@ex.config
def parameters():
    learning_rate = 0.05
    depth = 10
    bagging_temperature = 0.2
    
@ex.capture
def get_model(learning_rate: float, depth: int, bagging_temperature: float):
    print("Parameters:", learning_rate, depth, bagging_temperature)
    from catboost import CatBoostRegressor
    return CatBoostRegressor(
        iterations=500,
        learning_rate=learning_rate,
        depth=depth,
        random_seed=42,
        bagging_temperature=bagging_temperature,
        od_type="Iter",
        metric_period=50,
        od_wait=20,
    )

@ex.main
def run():
    model = get_model()  # Parameters are injected implicitly.
    model.fit(X, y)
    return r2_score(model.predict(X), y)

ex.run()

INFO - catboost - Running command 'run'


https://ui.neptune.ai/chck/ml-management-tools/e/MLMAN-12


INFO - catboost - Started


Parameters: 0.05 10 0.2
0:	learn: 65718.3021685	total: 43.9ms	remaining: 21.9s
50:	learn: 22014.5119625	total: 1.9s	remaining: 16.7s
100:	learn: 14727.6169486	total: 3.72s	remaining: 14.7s
150:	learn: 11667.9680263	total: 5.71s	remaining: 13.2s
200:	learn: 9587.5430110	total: 7.65s	remaining: 11.4s
250:	learn: 7806.1061150	total: 9.54s	remaining: 9.47s
300:	learn: 6447.1232646	total: 11.6s	remaining: 7.66s
350:	learn: 5470.8751770	total: 13.6s	remaining: 5.77s
400:	learn: 4598.8963814	total: 15.7s	remaining: 3.88s
450:	learn: 3931.3566761	total: 17.6s	remaining: 1.91s


INFO - catboost - Result: 0.99736437394909
INFO - catboost - Completed after 0:00:21


499:	learn: 3460.4252215	total: 19.6s	remaining: 0us


<sacred.run.Run at 0x13465a5d0>

In [9]:
%reload_ext tensorboard

from tensorboard import notebook
notebook.list()

No known TensorBoard instances running.


In [10]:
%tensorboard --logdir /tmp/tensorboard/house_prices/logs --host=0.0.0.0 --port 6007

In [5]:
import tensorflow as tf
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
import neptune_tensorboard as neptune_tb
neptune_tb.integrate_with_tensorflow()
from sacred import Experiment
from neptunecontrib.monitoring.sacred import NeptuneObserver
import os
import warnings
warnings.filterwarnings('ignore')

ex = Experiment("mlp", interactive=True)

ex.observers.append(NeptuneObserver(api_token=os.environ["NEPTUNE_API_TOKEN"],
                                    project_name=os.environ["NEPTURE_PROJECT_NAME"]))


@ex.config
def parameters():
    hidden_unit: int = 128
    dropout_rate: float = 0.2
    learning_rate: float = 0.001

@ex.capture
def get_model(hidden_unit: int, dropout_rate: float, learning_rate: float):
    print("Parameters:", hidden_unit, dropout_rate, learning_rate)
    inputs = tf.keras.Input((X.shape[1],), name="mlp_input")
    x = tf.keras.layers.Dense(hidden_unit, activation="relu", name="dense1")(inputs)
    x = tf.keras.layers.Dropout(dropout_rate, name="dropout1")(x)
    x = tf.keras.layers.Dense(hidden_unit//2, activation="relu", name="dense2")(x)
    x = tf.keras.layers.Dropout(dropout_rate, name="dropout2")(x)
    x = tf.keras.layers.Dense(hidden_unit//4, activation="relu", name="dense3")(x)
    x = tf.keras.layers.Dropout(dropout_rate, name="dropout3")(x)
    outputs = tf.keras.layers.Dense(1, name="mlp_output")(x)
    mlp_model = tf.keras.Model(inputs, outputs, name="mlp_model")
    mlp_model.compile(
        optimizer=tf.keras.optimizers.RMSprop(learning_rate), 
        loss="mse", 
        metrics=["mae"],
    )
    return mlp_model

@ex.main
def run():
    model = get_model()  # Parameters are injected implicitly.
    model.fit(X, 
              y,
              epochs=1000,
              validation_split=0.2,
              callbacks=[
                  tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=20),
                  tfdocs.modeling.EpochDots(),
                  tf.keras.callbacks.TensorBoard(log_dir="/tmp/tensorboard/house_prices/logs"),
              ],
              verbose=0)
    return r2_score(model.predict(X), y)

ex.run()

INFO - mlp - Running command 'run'


https://ui.neptune.ai/chck/ml-management-tools/e/MLMAN-14


INFO - mlp - Started


Parameters: 128 0.2 0.001

Epoch: 0, loss:36477490533.7831,  mae:178359.5625,  val_loss:36446607633.0667,  val_mae:178780.1094,  
....................................................................................................
Epoch: 100, loss:1362170841.0306,  mae:27876.1465,  val_loss:665374545.7778,  val_mae:18393.5781,  
....................................................................................................
Epoch: 200, loss:1354088602.4541,  mae:27591.6230,  val_loss:571678412.8000,  val_mae:17575.8887,  
....................................................................................................
Epoch: 300, loss:1249802814.1019,  mae:26314.1211,  val_loss:602744613.9259,  val_mae:17881.9902,  
....................................................................................................
Epoch: 400, loss:1232381094.5542,  mae:26380.0762,  val_loss:533573413.9259,  val_mae:16652.0176,  
......................................

INFO - mlp - Result: 0.8817485109935098
INFO - mlp - Completed after 0:00:24


..

<sacred.run.Run at 0x1474aedd0>

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [27]:
import neptune
import lightgbm as lgb
import os

params = {
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'n_estimators': 500,
    
}

neptune.init(os.environ["NEPTURE_PROJECT_NAME"])
neptune.create_experiment(name="lightgbm", params=params)

def neptune_callback():
    def callback(env):
        for name, loss_name, loss_value, _ in env.evaluation_result_list:
            neptune.send_metric('{}_{}'.format(name, loss_name), x=env.iteration, y=loss_value)
    return callback

model = lgb.LGBMRegressor(**params)
model.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    eval_metric=["mae", "mse"],
    callbacks=[
        lgb.early_stopping(10),
        neptune_callback(),
    ]
)

neptune.stop()

https://ui.neptune.ai/chck/ml-management-tools/e/MLMAN-25
[1]	valid_0's l1: 0.461921	valid_0's l2: 0.410647
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l1: 0.441873	valid_0's l2: 0.374182
[3]	valid_0's l1: 0.425393	valid_0's l2: 0.340401
[4]	valid_0's l1: 0.40778	valid_0's l2: 0.312373
[5]	valid_0's l1: 0.390555	valid_0's l2: 0.283232
[6]	valid_0's l1: 0.374696	valid_0's l2: 0.257286
[7]	valid_0's l1: 0.359833	valid_0's l2: 0.236476
[8]	valid_0's l1: 0.347052	valid_0's l2: 0.215808
[9]	valid_0's l1: 0.334858	valid_0's l2: 0.197306
[10]	valid_0's l1: 0.322682	valid_0's l2: 0.181439
[11]	valid_0's l1: 0.311658	valid_0's l2: 0.166562
[12]	valid_0's l1: 0.301727	valid_0's l2: 0.153485
[13]	valid_0's l1: 0.290481	valid_0's l2: 0.141128
[14]	valid_0's l1: 0.281463	valid_0's l2: 0.130735
[15]	valid_0's l1: 0.273525	valid_0's l2: 0.12276
[16]	valid_0's l1: 0.264474	valid_0's l2: 0.113957
[17]	valid_0's l1: 0.25475	valid_0's l2: 0.105174
[18]	valid_0's l1: 0.24794

# Hydra + MLflow-Tracking based