In [1]:
import gc
import os
import random

import lightgbm as lgb
import numpy as np
import datetime
import pickle
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from tqdm import tqdm_notebook
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold,StratifiedKFold

In [2]:
## no memory limitation

In [3]:
seed = 2199

def seed_everything(seed=seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything()

In [4]:
building = pd.read_csv("../input/ashrae-energy-prediction/building_metadata.csv")
# sample_submission = pd.read_csv("../input/ashrae-energy-prediction/sample_submission.csv")
# test = pd.read_csv("../input/ashrae-energy-prediction/test.csv")
train = pd.read_csv("../input/ashrae-energy-prediction/train.csv")
# weather_test = pd.read_csv("../input/ashrae-energy-prediction/weather_test.csv")
weather_train = pd.read_csv("../input/ashrae-energy-prediction/weather_train.csv")


In [5]:
def set_localtime(df):
    
    zone_dict={0:4,1:0,2:7,3:4,4:7,5:0,6:4,7:4,8:4,9:5,10:7,11:4,12:0,13:5,14:4,15:4} 

    df.timestamp = pd.to_datetime(df.timestamp, format="%Y-%m-%d %H:%M:%S")

    for sid, zone in zone_dict.items():
        sids = df.site_id == sid   
        df.loc[sids, 'timestamp'] = df[sids].timestamp - pd.offsets.Hour(zone)
        
        # sort 
        df = df.sort_values(by=['timestamp']).reset_index(drop=True)
    df.timestamp = df.timestamp.astype(str)

    return df

In [6]:
def add_lag_feature(weather_df, window=3):
    group_df = weather_df.groupby('site_id')
    cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr']
    rolled = group_df[cols].rolling(window=window, min_periods=0)
    lag_mean = rolled.mean().reset_index().astype(np.float16)
    lag_max = rolled.max().reset_index().astype(np.float16)
    lag_min = rolled.min().reset_index().astype(np.float16)
    lag_std = rolled.std().reset_index().astype(np.float16)
    for col in cols:
        weather_df[f'{col}_mean_lag{window}'] = lag_mean[col]
#         weather_df[f'{col}_max_lag{window}'] = lag_max[col]
#         weather_df[f'{col}_min_lag{window}'] = lag_min[col]
        weather_df[f'{col}_std_lag{window}'] = lag_std[col]
    
    return weather_df

In [7]:
## fill na in weather dataset
weather_train = weather_train.groupby('site_id').apply(lambda group: group.interpolate(limit_direction='both'))

## shift the time zone
weather_train = set_localtime(weather_train)

weather_train['tem_diff'] = weather_train['air_temperature']-weather_train['dew_temperature']

In [8]:
# weather_train = add_lag_feature(weather_train)

In [9]:
weather_train.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,tem_diff
0,2,2015-12-31 17:00:00,15.6,6.0,-5.6,0.0,1015.3,270.0,3.6,21.2
1,4,2015-12-31 18:00:00,9.4,0.0,-2.2,0.0,1021.4,360.0,3.1,11.6
2,2,2015-12-31 18:00:00,13.9,4.0,-5.6,0.0,1015.6,270.0,4.1,19.5
3,10,2015-12-31 18:00:00,-10.6,0.0,-13.9,0.0,1036.7,0.0,0.0,3.3
4,2,2015-12-31 19:00:00,13.3,2.0,-5.6,0.0,1016.0,270.0,3.1,18.9


In [10]:
## Memory optimization

# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [11]:
## site 0
train = train.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

## site 13
train = train.query('not (building_id == 1099 & meter == 2)')
train = train.query('not (building_id == 1088 & meter == 1)')
train = train.query('not (building_id == 1169 & meter == 0)')

In [12]:
train = reduce_mem_usage(train)
building = reduce_mem_usage(building)
weather_train = reduce_mem_usage(weather_train)

Memory usage of dataframe is 756.97 MB
Memory usage after optimization is: 322.09 MB
Decreased by 57.4%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 65.4%
Memory usage of dataframe is 10.66 MB
Memory usage after optimization is: 5.05 MB
Decreased by 52.7%


In [13]:
le = LabelEncoder()
le.fit(building['primary_use'])
building['primary_use'] = le.transform(building['primary_use'])

In [14]:
def get_features(df,building,weather):
    
    df = df.merge(building, on='building_id', how='left')
    df = df.merge(weather, on=['site_id', 'timestamp'], how='left')

    df.square_feet = np.log1p(df.square_feet)
    
    holidays = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
                "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
                "2017-01-01", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
                "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
                "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
                "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
                "2019-01-01"]
    
    
    df.timestamp = pd.to_datetime(df.timestamp, format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df.timestamp.dt.hour
    df["weekday"] = df.timestamp.dt.weekday
    df["is_holiday"] = (df.timestamp.dt.date.astype("str").isin(holidays)).astype(int)
    
    drop_features = ["sea_level_pressure", "wind_direction", "wind_speed"]
    
#     df["month"] = df.timestamp.dt.month
        
    return df

In [15]:
train = get_features(train, building, weather_train)

In [16]:
train = train.sort_values(by=['timestamp']).reset_index(drop = True)

In [17]:
x = train.drop(columns = ['meter_reading','timestamp'])
y = np.log1p(train['meter_reading'])

In [18]:
columns = x.columns

In [19]:
columns

Index(['building_id', 'meter', 'site_id', 'primary_use', 'square_feet',
       'year_built', 'floor_count', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_direction', 'wind_speed', 'tem_diff', 'hour', 'weekday',
       'is_holiday'],
      dtype='object')

In [20]:
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 40,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse"
}

In [21]:
NFOLDS = 2

# kf = StratifiedKFold(n_splits=NFOLDS, random_state=seed)

kf = KFold(n_splits=NFOLDS,random_state =seed)
columns = x.columns
y_oof = np.zeros(x.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns

model_list = list()


for fold_n, (train_index, valid_index) in enumerate(kf.split(x,y)):
    x_train, x_valid = x[columns].iloc[train_index], x[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    categorical_features = ["building_id", "site_id", "meter", "primary_use"]

    dtrain = lgb.Dataset(x_train, label=y_train, categorical_feature=categorical_features, free_raw_data=False)
    dvalid = lgb.Dataset(x_valid, label=y_valid, categorical_feature=categorical_features, free_raw_data=False)


    clf = lgb.train(params, dtrain, 20000, valid_sets = [dtrain, dvalid], verbose_eval=200, early_stopping_rounds=200)
    model_list.append(clf)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(x_valid)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | RMSE: {np.sqrt(mean_squared_error(y_valid, y_pred_valid))}")
    
    score += mean_squared_error(y_valid, y_pred_valid) /NFOLDS
#     y_preds += clf.predict(x_test[columns]) /NFOLDS
    del x_train, x_valid, y_train, y_valid
    gc.collect()

print(f"\nMean RMSE = {score}")
print(f"Out of folds RMSE = {np.sqrt(mean_squared_error(y, y_oof))}")



Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.902219	valid_1's rmse: 1.13952
[400]	training's rmse: 0.845057	valid_1's rmse: 1.12882
[600]	training's rmse: 0.818842	valid_1's rmse: 1.12408
[800]	training's rmse: 0.793704	valid_1's rmse: 1.12279
Early stopping, best iteration is:
[724]	training's rmse: 0.801766	valid_1's rmse: 1.1225
Fold 1 | RMSE: 1.122504251309188




Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.860538	valid_1's rmse: 1.13342
[400]	training's rmse: 0.808721	valid_1's rmse: 1.12477
[600]	training's rmse: 0.783718	valid_1's rmse: 1.12407
Early stopping, best iteration is:
[503]	training's rmse: 0.794516	valid_1's rmse: 1.1232
Fold 2 | RMSE: 1.123201136926945

Mean RMSE = 1.2607982941005913
Out of folds RMSE = 1.1228527481647619


In [22]:
with open("./model_list.pkl","wb") as f:    
    pickle.dump(model_list,f)

In [23]:
del x, y ,train, dtrain, dvalid
gc.collect()

weather_test = pd.read_csv("../input/ashrae-energy-prediction/weather_test.csv")
sample = pd.read_csv("../input/ashrae-energy-prediction/sample_submission.csv")

In [24]:
## fill na in weather dataset
weather_test = weather_test.groupby('site_id').apply(lambda group: group.interpolate(limit_direction='both'))

## shift the time zone
weather_test = set_localtime(weather_test)

weather_test['tem_diff'] = weather_test['air_temperature']-weather_test['dew_temperature']

## add lag features
# weather_test = add_lag_feature(weather_test)

In [25]:
weather_test = reduce_mem_usage(weather_test)

Memory usage of dataframe is 21.15 MB
Memory usage after optimization is: 10.01 MB
Decreased by 52.7%


In [26]:
def load_test(chunk_size):
    for df in pd.read_csv('../input/ashrae-energy-prediction/test.csv', chunksize=chunk_size):
        yield df
        
CHUNK_SIZE = 10**6
        
     
test_ids = np.array([], dtype=np.int32)
preds = np.array([], dtype=np.float32)

for test_df in tqdm_notebook(load_test(CHUNK_SIZE)):
    
    test_df = get_features(test_df,building, weather_test)
    chunk_preds = np.expm1(model_list[0].predict(test_df[columns]))
    
    
    for model in model_list[1:]:
        chunk_preds += np.expm1(model.predict(test_df[columns]))
    chunk_preds /= len(model_list)

    preds = np.append(preds, chunk_preds)
    test_ids = np.append(test_ids, test_df["row_id"].values)

submission = pd.DataFrame({
    "row_id": test_ids,
    "meter_reading": preds
})

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [27]:
submission['meter_reading'] = submission['meter_reading'].apply(lambda x : x if x>0  else 0)

In [28]:
submission.to_csv("submission.csv", index=False)

In [29]:
submission.head()

Unnamed: 0,row_id,meter_reading
0,0,131.399004
1,1,69.303399
2,2,11.352753
3,3,197.722369
4,4,831.123332
