In [None]:
# !pip install catboost
# !pip install lightgbm
# !pip install feather-format

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
import random
import gc
import lightgbm as lgb
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import feather
import os
import glob
import csv   
from sklearn.metrics import mean_squared_error

Set seed

In [5]:
# Single seed shared by Python's and NumPy's global random generators.
seed = 7
for seed_rng in (random.seed, np.random.seed):
    seed_rng(seed)

# Load data

In [10]:
%%time
path = '../data'
train = feather.read_dataframe(f'{path}/Prepared_data/train_prepared.feather')
# train['timestamp'] = pd.to_datetime(train['timestamp'])
train['meter_reading_log'] = np.log1p(train['meter_reading'])
train.drop(columns=['utc','altitude','azimuth'], inplace=True)
gc.collect()

train = train[(train['suspicious_1'] + train['suspicious_2']).eq(0)]
train.drop(columns=['suspicious_1','suspicious_2'],inplace=True)
gc.collect()

train['meter_median'] = train.groupby(['building_id','meter'])['meter_reading_log'].transform('median')
train['meter_min'] = train.groupby(['building_id','meter'])['meter_reading_log'].transform('min')
train['meter_max'] = train.groupby(['building_id','meter'])['meter_reading_log'].transform('max')
train['meter_std'] = train.groupby(['building_id','meter'])['meter_reading_log'].transform('std')

train['hour'] = train['timestamp'].dt.hour

train['meter_hour_median'] = train.groupby(['building_id','meter','hour'])['meter_reading_log'].transform('median')
train['meter_hour_min'] = train.groupby(['building_id','meter','hour'])['meter_reading_log'].transform('min')
train['meter_hour_max'] = train.groupby(['building_id','meter','hour'])['meter_reading_log'].transform('max')
train['meter_hour_std'] = train.groupby(['building_id','meter','hour'])['meter_reading_log'].transform('std')

%%time
# train.reset_index(drop=True).to_feather(f'../../data-vol-3/Prepared_data/train_prepared_subset.feather')

In [7]:
# %%time
# test = feather.read_dataframe(f'../../data-vol-3/Prepared_data/test_leaked_prepared.feather')
# test.drop(columns=['utc','utc_time','altitude','azimuth'], inplace=True)
# test.dropna(subset=['meter_reading'],inplace=True)
# test = test[test['meter_reading']!=0].reset_index(drop=True)
# gc.collect()

In [8]:
# %%time
# test = feather.read_dataframe(f"{path}/Prepared_data/test_prepared.feather")
# test = test.set_index('row_id')
# test = test.sort_index()
# gc.collect()

# leaked = feather.read_dataframe(f"../../data-vol-3/Leaked_data/leaked_submission.feather")
# test = test.iloc[leaked['row_id'].values]
# test['meter_reading'] = leaked['meter_reading']
# test['meter_reading_log'] = np.log1p(test['meter_reading'])
# test.reset_index(inplace=True)

# del leaked
# gc.collect()
# test.to_feather(f'../../data-vol-3/Prepared_data/test_leaked_prepared.feather')

# Create train test split

Features

In [None]:
# Columns fed to the model, in the order they are handed to LightGBM.
features = [
    'building_id', 'site_id', 'primary_use', 'square_feet',
    'floor_count', 'new_floor_count',
    'is_holiday', 'weekday_cos', 'weekday_sin', 'weekday',
    'hour_cos', 'hour_sin', 'year_cos', 'year_sin', 'is_day_saving',
    'air_temperature', 'age', 'dew_temperature', 'sea_level_pressure',
    'new_dew_temperature', 'new_air_temperature', 'humidity', 'latitude',
    'irradiance', 'feels_like', 'new_feels_like', 'radiation',
    'air_temperature_mean_lag72', 'air_temperature_max_lag72',
    'air_temperature_min_lag72', 'air_temperature_std_lag72',
    'cloudCover_mean_lag72', 'cloudCover_std_lag72',
    # Per-(building, meter) and per-(building, meter, hour) statistics of the
    # log target; 'meter_min' is deliberately left out (see the list below).
    'meter_median', 'meter_max', 'meter_std',
    'meter_hour_median', 'meter_hour_min', 'meter_hour_max', 'meter_hour_std',
]

# Columns currently excluded from the model (kept here for quick re-enabling):
#   new_wind_speed, new_sea_level_pressure, new_wind_direction_sin,
#   new_wind_direction_cos, meter_min, meter, wind_speed, cloudCover,
#   longitude, irradiance_cloud, cloudCover_max_lag72, wind_direction_cos,
#   wind_direction_sin, new_year_built, cloud_coverage, precip_depth_1_hr,
#   cloudCover_min_lag72, new_precip_depth_1_hr, square_feet_log,
#   beaufort_scale, suspicious_1, suspicious_2, year_built, meter_reading,
#   timestamp, night_time, morning, afternoon, evening, new_cloud_coverage,
#   uvIndex, meter_reading_log, split

# Regression target: the log1p-transformed meter reading.
target = 'meter_reading_log'

In [None]:
# Convert the listed columns to pandas 'category' dtype, one column at a time.
categorical_columns = ('primary_use', 'is_day_saving', 'is_holiday',
                       'building_id', 'meter', 'site_id')
for column in categorical_columns:
    train[column] = train[column].astype('category')

# Train model

In [None]:
# Run configuration: number of CV folds, experiment version tag, and the
# model-name prefix used in saved file names.
n_split, version, model_name = 5, 35, 'lgbm'

#### Create CV 

In [None]:
cv_method = 'stratified_site_id_week'

# Fold id = (site_id + ISO week of year - 1) mod n_split, so within a site
# consecutive weeks rotate through the folds.
# `Series.dt.weekofyear` is deprecated (removed in pandas 2.0); prefer
# `dt.isocalendar().week` and fall back only on pandas versions without it.
# The int64 cast assumes `utc_time` contains no NaT values.
utc_accessor = train['utc_time'].dt
if hasattr(utc_accessor, 'isocalendar'):
    week_of_year = utc_accessor.isocalendar().week.astype('int64')
else:
    week_of_year = utc_accessor.weekofyear
train['split'] = (train['site_id'].astype(int) + week_of_year - 1) % n_split

# LightGBM

In [None]:
# LightGBM training parameters shared by every model trained in this notebook.
# Alternative sampling that was tried and is currently disabled:
#      'boosting_type': 'goss',
#      'top_rate':0.5,
#      'other_rate':0.2,
#      'cat_l2':0.05,
lgb_params = dict(
    objective='regression',
    boosting_type='gbdt',
    metric='rmse',
    n_jobs=-1,
    learning_rate=0.05,
    num_leaves=50,
    max_depth=10,
    tree_learner='serial',
    subsample_freq=1,
    subsample=0.5,
    colsample_bytree=0.5,
    max_bin=50,
    verbose=2,
    n_estimators=1000,
    seed=seed,
    lambda_l2=0.1,
    early_stopping_rounds=100,
)

# Running sums of the per-model scores; they start at zero.
mean_performance_test = mean_performance_train = mean_performance_leaked = 0
gc.collect()

In [None]:
# Train one LightGBM model per (meter, fold): rows whose `split` differs from
# the fold are used for training, rows whose `split` equals it for validation.
# Each model is pickled under models/ and its scores are appended to
# models/perf_leaked.csv.
os.makedirs('models', exist_ok=True)

# Reset the running sums here so that re-running this cell does not add to
# the totals of a previous run.
mean_performance_test = 0
mean_performance_train = 0
n_models_trained = 0

for meter in range(4):
    print(f"\n===== Meter {meter} =====")
    # The meter mask does not depend on the fold, so compute it once per meter.
    is_meter = train['meter'].eq(meter)

    for fold in range(n_split):
        print(f"\n===== Fold numer {fold} =====\n")

        index_train = train['split'].ne(fold) & is_meter
        index_vl = train['split'].eq(fold) & is_meter

        # Select rows and columns in a single .loc call to avoid materialising
        # an intermediate full-width copy of the filtered frame.
        tr_data = lgb.Dataset(train.loc[index_train, features],
                              label=train.loc[index_train, target])
        vl_data = lgb.Dataset(train.loc[index_vl, features],
                              label=train.loc[index_vl, target])

        print(index_vl.sum(), index_train.sum())
        del index_train, index_vl
        gc.collect()

        estimator = lgb.train(
                    lgb_params,
                    tr_data,
                    valid_sets = [tr_data, vl_data],
                    verbose_eval = 50)

        model_filename = f'models/{model_name}_meter_{meter}_v_{version}_{cv_method}_fold_{fold}.bin'

        # Context manager guarantees the file is flushed and closed.
        with open(model_filename, 'wb') as model_file:
            pickle.dump(estimator, model_file)

        performance_test = estimator.best_score['valid_1']['rmse']
        performance_train = estimator.best_score['training']['rmse']

        mean_performance_test += performance_test
        mean_performance_train += performance_train
        n_models_trained += 1

        # Scoring against the leaked test labels is currently disabled, so the
        # leaked-score column is written as 0.
        # newline='' is what the csv module expects for files it writes to.
        with open(r'./models/perf_leaked.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([model_filename, fold, performance_train, performance_test, 0, version])

        # The sums run across all meters, so divide by the number of models
        # trained so far rather than by the fold index.
        print((mean_performance_train / n_models_trained),
              (mean_performance_test / n_models_trained))

In [None]:
# Turn the running sums into averages. The training loop adds one score per
# (meter, fold) model across 4 meters, so the divisor is the total number of
# models, not the number of folds alone.
# NOTE: this cell rescales the sums in place; run it once per training run.
n_meters = 4
n_models = n_meters * n_split

mean_performance_leaked = mean_performance_leaked / n_models
mean_performance_test = mean_performance_test / n_models
mean_performance_train = mean_performance_train / n_models

# `model_filename` still holds the name of the last model trained; it is
# written only as the row label of the 'avg' line.
with open(r'./models/perf_leaked.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow([model_filename, 'avg', mean_performance_train, mean_performance_test, 0, version])

# Feature importance

In [None]:
# Collect the feature importances of every saved (meter, fold) model, each
# scaled so that the model's most important feature equals 100.
importance_by_model = {}
feature_names = None

for meter in range(4):
    for i in range(n_split):
        m = f'models/{model_name}_meter_{meter}_v_{version}_{cv_method}_fold_{i}.bin'
        # These pickles are written by this notebook; do not point this at
        # untrusted files, since unpickling can execute arbitrary code.
        with open(m, 'rb') as model_file:
            model = pickle.load(model_file)

        # Take the feature names from the loaded models so this cell does not
        # depend on an `estimator` variable left over from the training cell.
        if feature_names is None:
            feature_names = model.feature_name()

        raw_importance = pd.Series(model.feature_importance())
        importance_by_model[f'Importance_{i}_{meter}'] = raw_importance * 100 / raw_importance.max()

feature_imp = pd.DataFrame({'Feature': feature_names, **importance_by_model})

# Reshape to long format: one row per (Feature, model) with the value in column 0.
feature_imp = feature_imp.set_index('Feature').stack().reset_index()

# Column labels look like 'Importance_<fold>_<meter>'.
label_parts = feature_imp['level_1'].str.split('_')
feature_imp['meter'] = label_parts.str[2]
feature_imp['fold'] = label_parts.str[1]

In [None]:
# Mean (and max) importance per meter and feature, then one column per meter
# holding the mean, ordered by the column labelled '3'.
importance_stats = (
    feature_imp
    .groupby(['meter', 'Feature'])[0]
    .agg(['mean', 'max'])
    .reset_index()
)
s = (
    importance_stats
    .pivot(index='Feature', values='mean', columns='meter')
    .sort_values('3', ascending=False)
)

In [None]:
# Grouped bar chart: one group per feature, one bar per meter.
ax = s.plot(kind='bar', figsize=(40, 20))
ax.set_title('LightGBM Features')
plt.show()

In [None]:
# The long-format `feature_imp` frame has no 'max' column (its values live in
# column 0), so build the per-feature maximum across all models explicitly
# before sorting and plotting.
importance_max = (
    feature_imp
    .groupby('Feature')[0]
    .max()
    .rename('max')
    .reset_index()
    .sort_values(by='max', ascending=False)
)

plt.figure(figsize=(40, 20))
sns.set(font_scale = 3)
sns.barplot(x="max", y="Feature", data=importance_max, color='C0')

plt.title('LightGBM Features')
plt.tight_layout()
plt.show()

In [None]:
# Write the importance table to the models folder, tagged with the run version.
importance_csv_path = f'./models/feature_importance_v_{version}.csv'
feature_imp.to_csv(importance_csv_path)

# Retrain with all data

In [None]:
# Refit one model per meter on all of that meter's rows (no held-out fold)
# and pickle it under models/.
os.makedirs('models', exist_ok=True)

for m in range(4):

    # Compute the meter mask once and select rows/columns in a single .loc call.
    is_meter = train['meter'].eq(m)
    tr_data = lgb.Dataset(train.loc[is_meter, features],
                          label=train.loc[is_meter, target])
    del is_meter

    gc.collect()
    # NOTE(review): lgb_params still carries 'early_stopping_rounds' while the
    # only evaluation set here is the training data itself - confirm the
    # resulting number of boosting rounds is what is wanted.
    estimator = lgb.train(
                lgb_params,
                tr_data,
                valid_sets = [tr_data],
                verbose_eval = 50)

    model_filename = f'models/{model_name}_meter_{m}_v_{version}_all.bin'

    # Context manager guarantees the file is flushed and closed.
    with open(model_filename, 'wb') as model_file:
        pickle.dump(estimator, model_file)

    gc.collect()