In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

In [2]:
# Load the preprocessed training frame from the project's processed-data folder.
# NOTE(review): pickle files execute code on load; only read files produced by this project.
train_df = pd.read_pickle("../data/processed/train_data.pkl")

# Pull the target column out of the feature frame (pop removes it in place) and
# move it to log1p space, so `label` is log(1 + meter_reading).
label = np.log1p(train_df.pop("meter_reading"))

In [3]:
# Toy example: four samples with two features each, split by a 3-fold KFold,
# printing the index arrays and the rows they select for every fold.
X = np.arange(1, 9).reshape(4, 2)
y = np.arange(1, 5)
kf = KFold(n_splits=3)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Select the feature rows for this fold.
    X_train = X[train_index]
    X_test = X[test_index]
    print("X_train:", X_train, "X_test:", X_test)
    # Select the matching targets with the same index arrays.
    y_train = y[train_index]
    y_test = y[test_index]
    print("y_train:", y_train, "y_test:", y_test)

TRAIN: [2 3] TEST: [0 1]
X_train: [[5 6]
 [7 8]] X_test: [[1 2]
 [3 4]]
y_train: [3 4] y_test: [1 2]
TRAIN: [0 1 3] TEST: [2]
X_train: [[1 2]
 [3 4]
 [7 8]] X_test: [[5 6]]
y_train: [1 2 4] y_test: [3]
TRAIN: [0 1 2] TEST: [3]
X_train: [[1 2]
 [3 4]
 [5 6]] X_test: [[7 8]]
y_train: [1 2 3] y_test: [4]


In [4]:
# Show only the first (train, test) index pair of the KFold split.
# kf.split() yields folds one at a time, so taking the first item avoids
# building the index arrays of every fold just to discard all but one.
next(iter(kf.split(train_df)))

(array([ 6738700,  6738701,  6738702, ..., 20216097, 20216098, 20216099]),
 array([      0,       1,       2, ..., 6738697, 6738698, 6738699]))

In [5]:
# Positional row numbers 0 .. n_rows-1 for the training frame.
indices = np.arange(len(train_df))

In [6]:
# Keep the row numbers that are divisible by two.
is_even = (indices % 2) == 0
indices[is_even]

array([       0,        2,        4, ..., 20216094, 20216096, 20216098])

In [7]:
# Keep the row numbers that are not divisible by two.
is_odd = (indices % 2) != 0
indices[is_odd]

array([       1,        3,        5, ..., 20216095, 20216097, 20216099])

In [8]:
# Interleaved folds: row number modulo `splits` decides which fold a row is
# held out in; every other row goes to that fold's training set.
splits = 3
for i in range(splits):
    # One boolean mask per fold; its complement is the training selection.
    held_out = (indices % splits) == i
    test_indices = indices[held_out]
    train_indices = indices[~held_out]
    print("train_indices:",train_indices)
    print("test_indices:",test_indices)

train_indices: [       1        2        4 ... 20216096 20216098 20216099]
test_indices: [       0        3        6 ... 20216091 20216094 20216097]
train_indices: [       0        2        3 ... 20216096 20216097 20216099]
test_indices: [       1        4        7 ... 20216092 20216095 20216098]
train_indices: [       0        1        3 ... 20216095 20216097 20216098]
test_indices: [       2        5        8 ... 20216093 20216096 20216099]


In [9]:
# Rows selected by the training positions left over from the last fold of the
# loop above (positional lookup, so .iloc rather than .loc).
train_rows = train_df.iloc[train_indices]
train_rows

Unnamed: 0,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,hour,weekday,month
0,0,0,2016-01-01 00:00:00,0,0,8.913685,11.0,,25.000000,6.0,20.000000,,0,4,1
1,1104,1,2016-01-01 00:00:00,13,9,12.204687,,,-8.296875,8.0,-12.203125,,0,4,1
3,1103,0,2016-01-01 00:00:00,13,7,10.081801,,,-8.296875,8.0,-12.203125,,0,4,1
4,1102,2,2016-01-01 00:00:00,13,6,11.375822,,,-8.296875,8.0,-12.203125,,0,4,1
6,1101,1,2016-01-01 00:00:00,13,6,11.183546,,,-8.296875,8.0,-12.203125,,0,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20216092,601,0,2016-12-31 23:00:00,4,1,9.385218,116.0,2.0,12.796875,2.0,3.300781,0.0,23,5,12
20216094,603,0,2016-12-31 23:00:00,4,9,10.277256,15.0,3.0,12.796875,2.0,3.300781,0.0,23,5,12
20216095,604,0,2016-12-31 23:00:00,4,0,9.043577,,3.0,12.796875,2.0,3.300781,0.0,23,5,12
20216097,606,0,2016-12-31 23:00:00,4,14,10.061901,20.0,2.0,12.796875,2.0,3.300781,0.0,23,5,12


In [10]:
# Rows whose calendar day-of-year is an exact multiple of `splits`.
day_is_multiple = (train_df["timestamp"].dt.dayofyear % splits) == 0
train_df[day_is_multiple]

Unnamed: 0,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,hour,weekday,month
110280,1097,1,2016-01-03 00:00:00,13,6,11.967473,,,-2.800781,0.0,-8.296875,0.0,0,6,1
110281,1097,2,2016-01-03 00:00:00,13,6,11.967473,,,-2.800781,0.0,-8.296875,0.0,0,6,1
110282,1098,0,2016-01-03 00:00:00,13,6,11.788631,,,-2.800781,0.0,-8.296875,0.0,0,6,1
110283,1098,1,2016-01-03 00:00:00,13,6,11.788631,,,-2.800781,0.0,-8.296875,0.0,0,6,1
110284,1098,2,2016-01-03 00:00:00,13,6,11.788631,,,-2.800781,0.0,-8.296875,0.0,0,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20216095,604,0,2016-12-31 23:00:00,4,0,9.043577,,3.0,12.796875,2.0,3.300781,0.0,23,5,12
20216096,605,0,2016-12-31 23:00:00,4,0,10.839993,95.0,5.0,12.796875,2.0,3.300781,0.0,23,5,12
20216097,606,0,2016-12-31 23:00:00,4,14,10.061901,20.0,2.0,12.796875,2.0,3.300781,0.0,23,5,12
20216098,590,0,2016-12-31 23:00:00,4,0,9.049819,99.0,2.0,12.796875,2.0,3.300781,0.0,23,5,12


In [11]:
# Index labels of the rows whose day-of-year is an exact multiple of `splits`.
# The mask is applied to the index itself, so no filtered copy of the whole
# frame is built just to read its index.
day_is_multiple = (train_df["timestamp"].dt.dayofyear % splits == 0).values
train_df.index[day_is_multiple]

Int64Index([  110280,   110281,   110282,   110283,   110284,   110285,
              110286,   110287,   110288,   110289,
            ...
            20216090, 20216091, 20216092, 20216093, 20216094, 20216095,
            20216096, 20216097, 20216098, 20216099],
           dtype='int64', length=6740037)

In [12]:
# Build day-based folds: all readings from the same calendar day land in the
# same fold, chosen by day-of-year modulo `splits`.
# The bucket of every row is loop-invariant, so it is computed once here
# instead of twice per fold.
day_bucket = (train_df["timestamp"].dt.dayofyear % splits).values

# NOTE: `indices` is rebound from the row-number array to a list of
# (train_index, test_index) pairs; the cell that reports fold sizes iterates it.
indices = []
for i in range(splits):
    in_test_fold = day_bucket == i
    # Mask the index directly rather than filtering the full frame: the result
    # is the same index object contents without copying every column.
    train_indices = train_df.index[~in_test_fold]
    test_indices = train_df.index[in_test_fold]
    indices.append((train_indices, test_indices))
indices

[(Int64Index([       0,        1,        2,        3,        4,        5,
                     6,        7,        8,        9,
              ...
              20159379, 20159380, 20159381, 20159382, 20159383, 20159384,
              20159385, 20159386, 20159387, 20159388],
             dtype='int64', length=13476063),
  Int64Index([  110280,   110281,   110282,   110283,   110284,   110285,
                110286,   110287,   110288,   110289,
              ...
              20216090, 20216091, 20216092, 20216093, 20216094, 20216095,
              20216096, 20216097, 20216098, 20216099],
             dtype='int64', length=6740037)),
 (Int64Index([   55121,    55122,    55123,    55124,    55125,    55126,
                 55127,    55128,    55129,    55130,
              ...
              20216090, 20216091, 20216092, 20216093, 20216094, 20216095,
              20216096, 20216097, 20216098, 20216099],
             dtype='int64', length=13478638),
  Int64Index([       0,        1,    

In [13]:
# Report the size of each fold: training count first, then held-out count,
# one number per line.
for train_indices, test_indices in indices:
    print(len(train_indices), len(test_indices), sep="\n")

13476063
6740037
13478638
6737462
13477499
6738601
