In [3]:
# !pip install catboost
# !pip install lightgbm
# !pip install feather-format

In [4]:
import numpy as np
import pandas as pd
from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import GroupShuffleSplit
import random
import gc
import lightgbm as lgb
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import feather
import os
import glob
import csv   
from sklearn.metrics import mean_squared_error

Set seed

In [5]:
# Single seed shared by the notebook; applied to both global RNGs
# (NumPy's legacy global state and Python's `random` module).
seed = 7
np.random.seed(seed)
random.seed(seed)

In [6]:
def plotImp(model, cols, num = 60):
    """Draw a horizontal bar chart of a model's largest feature importances.

    Parameters
    ----------
    model : object exposing ``feature_importance()``
        Its return value is paired element-wise with ``cols``.
    cols : sequence of str
        Feature names, in the same order as the importances.
    num : int, default 60
        Number of top-ranked features to show.
    """
    importances = pd.DataFrame({'Value': model.feature_importance(), 'Feature': cols})
    # Rank by importance and keep only the first `num` rows for the plot.
    top_features = importances.sort_values(by="Value", ascending=False)[0:num]

    plt.figure(figsize=(40, 20))
    sns.set(font_scale = 3)
    sns.barplot(x="Value", y="Feature", data=top_features, color='C0')
    plt.title('LightGBM Features')
    plt.tight_layout()
    plt.show()

# Load data

In [None]:
%%time
# Read the prepared training frame, parse timestamps and move the target
# onto a log1p scale.
train = feather.read_dataframe('../../data-vol-2/Prepared_data/train_prepared.feather')
train['timestamp'] = pd.to_datetime(train['timestamp'])
train['meter_reading'] = np.log1p(train['meter_reading'])

# Drop the columns this notebook does not use, then reclaim their memory.
dropped_columns = ['utc', 'utc_time', 'altitude', 'azimuth']
train = train.drop(columns=dropped_columns)
gc.collect()

# Interaction feature: irradiance multiplied by (1 - cloudCover).
train['irradiance_cloud'] = train['irradiance'] * (1 - train['cloudCover'])

In [None]:
# Collects one trained estimator per CV fold (appended in the training loop).
models = list()

# Weather impute

In [None]:
# Fill gaps in the raw weather columns with the pre-computed `new_<col>`
# values, then discard the helper columns.
#
# The previous `np.where(train[f].isna(), train[f], train[f'new_{f}'])` had
# its branches swapped: it kept NaN wherever the original was missing and
# overwrote every observed value with the `new_` one. `fillna` keeps observed
# values and only fills the missing ones (aligned on the shared row index).
weather_columns = ['sea_level_pressure',
                   'dew_temperature',
                   'wind_speed',
                   'cloud_coverage',
                   'wind_direction',
                   'precip_depth_1_hr',
                   'air_temperature']

for f in weather_columns:
    train[f] = train[f].fillna(train[f'new_{f}'])

# Drop all helper columns in one pass instead of copying the frame per column.
train = train.drop(columns=[f'new_{f}' for f in weather_columns])

gc.collect()

# Create train test split

Create CV folds (grouped by site and month)

In [None]:
# One group label per (site_id, calendar month); GroupShuffleSplit keeps every
# row of a group on the same side of each split.
group = train['site_id'].astype(str) + "_" + (train['timestamp'].dt.month).astype(str)

# Use the notebook-wide `seed` rather than a second hard-coded copy of it.
groupSplit = GroupShuffleSplit(5, test_size=0.2, random_state=seed)

# NOTE: `split` returns a generator, so `folds` can be iterated only once.
# Re-run this cell before re-running the training loop, otherwise the loop
# silently sees zero folds.
folds = groupSplit.split(train[['radiation']],
                         train['meter_reading'],
                         groups=group)


In [None]:
# Model inputs. The order is kept exactly as before because the training data
# columns are selected in this order.
features = [
    # building / meter identifiers and metadata
    'building_id', 'meter', 'site_id', 'primary_use',
    'square_feet', 'year_built', 'new_floor_count',
    # calendar and time encodings
    'is_holiday', 'weekday_cos', 'weekday_sin', 'weekday',
    'hour_cos', 'hour_sin', 'year_cos', 'year_sin',
    'night_time', 'is_day_saving',
    # weather columns imputed earlier in the notebook
    'air_temperature', 'cloud_coverage', 'dew_temperature',
    'precip_depth_1_hr', 'sea_level_pressure', 'wind_speed',
    # additional weather / irradiance columns
    'cloudCover', 'humidity', 'irradiance', 'feels_like',
    'irradiance_cloud',
    # Candidates currently excluded:
    #   'floor_count', 'new_year_built', 'square_feet_log',
    #   'new_sea_level_pressure', 'new_dew_temperature', 'new_wind_speed',
    #   'new_cloud_coverage', 'new_precip_depth_1_hr', 'new_air_temperature',
    #   'new_wind_direction', 'radiation', 'wind_direction',
    #   'latitude', 'longitude', 'morning', 'afternoon', 'evening',
]

# Column holding the (log1p-transformed) regression target.
target = 'meter_reading'

In [None]:
# Store these columns with pandas' `category` dtype.
categorical_columns = ['primary_use', 'is_day_saving', 'is_holiday']
for f in categorical_columns:
    train[f] = train[f].astype('category')

In [None]:
# Force a garbage-collection pass before the training cells run.
gc.collect()

# Train model

#### LightGBM

In [None]:
# Tag used in saved model file names and in the performance log.
version = 2

# LightGBM training parameters (key order kept as before).
lgb_params = dict(
    objective='regression',
    boosting_type='gbdt',
    metric='rmse',
    n_jobs=-1,
    learning_rate=0.05,
    num_leaves=2**8,
    max_depth=-1,
    tree_learner='serial',
    colsample_bytree=0.9,
    subsample_freq=1,
    subsample=0.5,
    n_estimators=2000,
    max_bin=255,
    verbose=2,
    seed=seed,
    early_stopping_rounds=100,
)

In [None]:
model_name = 'lgbm'

# Make sure the output directory exists before the first model is written.
os.makedirs('models', exist_ok=True)

# Train one LightGBM model per CV fold, pickle it to disk and keep it in
# `models` for the importance / performance cells below.
for i, (train_index, test_index) in enumerate(folds):
    print(f"\n\n\n ===== Fold numer {i} =====")
    # The splitter yields *positional* row indices, so select with .iloc;
    # .loc would only be correct while `train` has a default 0..n-1 index.
    tr_data = lgb.Dataset(train.iloc[train_index][features], label=train.iloc[train_index][target])
    vl_data = lgb.Dataset(train.iloc[test_index][features], label=train.iloc[test_index][target])
    gc.collect()
    estimator = lgb.train(
                lgb_params,
                tr_data,
                valid_sets = [tr_data,vl_data],
                verbose_eval = 50)
    model_filename = 'models/' + model_name + '_v_' + str(version) + '__fold_' + str(i)  + '.bin'
    # Context manager guarantees the file is flushed and closed.
    with open(model_filename, 'wb') as model_file:
        pickle.dump(estimator, model_file)

    models.append(estimator)

#### Feature importance

In [None]:
# Per-fold feature importances side by side, plus their rounded mean.
feature_imp = pd.DataFrame({'Feature':tr_data.feature_name})

# Include every fold's model; the previous `range(1, len(models))` silently
# left out the first one.
for i, fold_model in enumerate(models):
    feature_imp[f'Importance_{i}'] = fold_model.feature_importance()

importance_columns = [col for col in feature_imp.columns if 'Importance' in col]
feature_imp['mean'] = feature_imp[importance_columns].mean(axis=1).round(0)

feature_imp = feature_imp.sort_values('mean',ascending=False)

In [None]:
# Bar chart of the mean importance across folds.
# `feature_imp` has the columns 'Feature', 'Importance_<i>' and 'mean'; the
# previous code referenced a non-existent "Value" column and an undefined
# `num`, so the cell could not run.
num = 60  # number of top features to display (same default as plotImp)
top_features = feature_imp.sort_values(by="mean", ascending=False)[0:num]

plt.figure(figsize=(40, 20))
sns.set(font_scale = 3)
sns.barplot(x="mean", y="Feature", data=top_features, color='C0')
plt.title('LightGBM Features')
plt.tight_layout()
plt.show()

### Performance measure

On the validation folds (held-out part of each CV split)

In [None]:
# Average the best validation RMSE ('valid_1' is the second entry of
# `valid_sets`, i.e. the held-out fold) over all trained models.
fold_scores = [fold_model.best_score['valid_1']['rmse'] for fold_model in models]
mean_performance = sum(fold_scores) / len(fold_scores)
print(f"Mean performance: {round(mean_performance,3)}")

perf = [version, tr_data.feature_name, lgb_params, mean_performance]


# Append one row per run. `newline=''` is what the csv module expects for
# files it writes; without it extra blank lines appear on some platforms.
with open(r'./models/perf.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(perf)

On leaked data

In [14]:
%%time
# Load the prepared test frame and index it by row_id.
test = feather.read_dataframe("../../data-vol-2/Prepared_data/test_prepared.feather")

test = test.set_index('row_id')
test = test.sort_index()
gc.collect()

# (A second `%%time` used to sit here; a cell magic is only valid on the first
# line of a cell, so it has been removed.)
leaked = feather.read_dataframe("../../data-vol-2/Leaked_data/leaked_submission.feather")

# Keep only the rows for which a leaked reading exists, in `leaked` order.
test = test.loc[leaked['row_id']]
# Assign positionally: `test` is indexed by row_id while `leaked` carries the
# index it was read with, so a plain Series assignment would align on
# mismatching labels and attach readings to the wrong rows (or produce NaN).
test['meter_reading'] = leaked['meter_reading'].to_numpy()
test['meter_reading_log'] = np.log1p(test['meter_reading'])

del leaked
gc.collect()

In [None]:
# Score every saved model against the leaked readings and append the RMSE
# (on the log1p scale) to a CSV log.
# NOTE(review): the `*.txt` entries are also passed to pickle.load below -
# confirm those files really are pickles.
model_names = glob.glob('./models/*.bin') + glob.glob('./models/*.txt')[1:2]

step = 1000000
n_rows = test.shape[0]

for m in model_names:

    # SECURITY: unpickling executes arbitrary code; only load model files
    # produced by this project.
    with open(m, 'rb') as model_file:
        estimator = pickle.load(model_file)

    model_features = estimator.feature_name()
    preds = np.full(n_rows, np.nan)

    # Predict in positional chunks. The previous label-based
    # `test.loc[i:(i+step)]` walked row_id *labels* while the loop bound was
    # the row *count*, so after filtering `test` most rows were never
    # predicted (NaN) and chunk boundaries overlapped.
    for i in range(0, n_rows, step):
        print(i)
        subset = test.iloc[i:(i+step)][model_features]
        preds[i:(i+step)] = estimator.predict(subset)
        gc.collect()

    test['pred'] = preds

    error = np.sqrt(mean_squared_error(test['meter_reading_log'],test['pred']))

    with open(r'./models/perf_leaked.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([m,error])

    print(m)
    print(error)

# Predict test set

In [None]:
# step = 10000000
# i = 0
# test['pred'] = np.nan

# while i < test.shape[0]:
#     print(i)
#     subset = test.loc[i:(i+step)].reset_index().copy()
#     subset = pd.merge(subset, weather, how='left', left_on=['site_id','utc_time'], right_on=['site_id','timestamp'])
#     subset = subset.sort_values('row_id')
#     pred = estimator.predict(subset[features])
#     test.loc[i:(i+step),'pred'] = pred
#     i = i + step
#     gc.collect()

In [None]:
# Build the submission from the log-scale predictions in `test['pred']`.
# NOTE(review): this uses whatever `test` / `pred` currently hold; if the
# leaked-data cells above were run, `test` only contains the leaked rows.
submission = test[['pred']].copy()
# expm1 is the exact inverse of the log1p applied to the training target and
# is more accurate than exp(x) - 1 for predictions close to zero.
submission['meter_reading'] = np.expm1(submission['pred'])

submission[['meter_reading']].to_csv(f'../Prepared_data/submission_v{version}.csv')

In [None]:
# Force a garbage-collection pass after the submission has been written.
gc.collect()

#### Catboost

Quantize data

In [None]:
# train_dataset = Pool(train.loc[train_index,features], train.loc[train_index,target],
#                      cat_features=cat_features)
# train_dataset.quantize()
# train_dataset.save('../../data-vol-1/Prepared_data/train.bin')

In [None]:
# model = CatBoostRegressor()
# #train the model
# model.fit(train_dataset) 
# # # make the prediction using the resulting model
# # preds_class = model.predict(test_pool, prediction_type='Class')
# # preds_proba = model.predict(test_pool, prediction_type='Probability')
# # preds_raw_vals = model.predict(test_pool, prediction_type='RawFormulaVal')
# # print("Class", preds_class)
# # print("Proba", preds_proba)
# # print("Raw", preds_raw_vals)

In [None]:
# train_pool = Pool(train.loc[train_index,features], 
#                   train.loc[train_index,target])

# # test_pool = Pool(test_data) 
# # specify training parameters via map

# param = {'iterations':5}