In [1]:
# make SLDB datasets for BSCTRFM from individual time series

# first, use this code for trimmed datasets only
# later, generalize for all the time series in the electricity dataset

In [2]:
import numpy as np
import pandas as pd

In [3]:
import json

In [4]:
import os

In [5]:
import tensorflow as tf

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

import joblib

In [7]:
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
from bokeh.layouts import row, gridplot, layout
from bokeh.palettes import d3
output_notebook()

In [8]:
# a function to encode float values for serialized examples
def _float_feature_from_list_of_values(list_of_values):
    """Wrap a list of floats / doubles in a tf.train.Feature that carries a FloatList."""
    float_list = tf.train.FloatList(value=list_of_values)
    return tf.train.Feature(float_list=float_list)

In [9]:
# the main source is the electricity dataset LD2011-2014 from UCI

In [10]:
# it resides in
dataset_path = '/home/developer/gcp/cbidmltsf/datasets/electricity'

In [11]:
os.listdir(dataset_path)

['LD2011_2014.txt',
 'separated_preprocessed',
 'separated_raw',
 'hourly_electricity_complete.pkl',
 'hourly_electricity.csv',
 'LD2011_2014.txt.zip',
 'hourly_electricity_filtered_academic_papers.pkl']

In [12]:
# 'LD2011_2014.txt'                                          source from UCI
# 'LD2011_2014.txt.zip'                                      source from UCI, compressed
# 'hourly_electricity.csv'                                   complete dataset in CSV
# 'hourly_electricity_complete.pkl'                          complete dataset in Pandas
# 'hourly_electricity_filtered_academic_papers.pkl'          filtered dataset for benchmarking
# 'separated_raw/'                                           pickles per customer, raw data
# 'separated_preprocessed/'                                  pickles per customer, outliers removed

In [13]:
# a SLDB is produced from separated time series (raw or preprocessed)

# SLDB contents are:
# TFRecord files for training
# TFRecord files for evaluation (if eval required)
# time series pickles for testing

In [14]:
# constant values for positional encodings
hours_in_day = 24
days_in_week = 7
days_in_month = 30
days_in_year = 365

In [15]:
# a constant to make sin/cos functions from hours_from_start (the 'age' covariate)
total_hours = 32303

In [16]:
# define global dataset intervals (they might not be precise when missing values exist)

# split the time series in seen (train, eval) and unseen (test) data
# according to academic papers:

# 243 days on seen data, 7 days on unseen data 

# seen data:      '2014-01-01 00:00:00' to '2014-08-31 23:00:00', 243*24 = 5832 lectures

# train/eval split is 0.9/0.1, then

# train data:     '2014-01-01 00:00:00' to '2014-08-07 15:00:00', 5248 lectures
# eval data:      '2014-08-07 16:00:00' to '2014-08-31 23:00:00', 584 lectures

# unseen data:    '2014-09-01 00:00:00' to '2014-09-07 23:00:00', 7*24 = 168 lectures

dates = {
    'train': {
        'start': '2014-01-01 00:00:00',
        'end': '2014-08-07 15:00:00',
    },
    'eval': {
        'start': '2014-08-07 16:00:00',
        'end': '2014-08-31 23:00:00',
    },
    'test': {
        'start': '2014-09-01 00:00:00',
        'end': '2014-09-07 23:00:00',
    },
}

In [17]:
# build sub-series to be persisted as serialized training examples

# dimensionality of the encoder input
m = 168

# dimensionality of the decoder output 
t = 168

In [18]:
# columns to be included in the SLDB

# use 7D encoder (age, hour-day, day-week)

sldb_columns = [
    'date',
    'token_id',
    'kw_scaled',
    'sin_hours_from_start',
    'cos_hours_from_start',
    'sin_hour_day',
    'cos_hour_day',
    'sin_day_week',
    'cos_day_week',
    # 'sin_day_month',
    # 'cos_day_month',
    # 'sin_day_year',
    # 'cos_day_year'    
]

In [19]:
sldb = {
    'ts': 'LD2011-2014_SEPARATED_FULL',
    'embedding': {
        'hourly': 168
    },
    'tau': {
        'hourly': 1
    },
    'no_targets': 168,
    'BSCTRFM': 1,
    'preprocessed': 0
}

In [20]:
sldb

{'ts': 'LD2011-2014_SEPARATED_FULL',
 'embedding': {'hourly': 168},
 'tau': {'hourly': 1},
 'no_targets': 168,
 'BSCTRFM': 1,
 'preprocessed': 0}

In [21]:
# a string with the basic specifications of the SLDB, as part of the SLDB identifier

# the suffix '07DB' identifies the 7D encoder input (see sldb_columns) and differentiates this SLDB from the original one, which is 9D

# add the suffix MMX to indicate the scaler used was MinMax
# add the suffix STD to indicate the scaler used was Standard

sldb_specs = 'BSCTRFM_{:03d}_{:03d}_07DB_MMX'.format(sldb['embedding']['hourly'], sldb['no_targets'])
sldb_specs

'BSCTRFM_168_168_07DB_MMX'

In [22]:
# build the SLDB identifier from the time series name and the SLDB specs
sldb_identifier = '{}_{}'.format(sldb['ts'], sldb_specs)
sldb_identifier

'LD2011-2014_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX'

In [23]:
sldb_dir = '/home/developer/gcp/cbidmltsf/sldbs/{}'.format(sldb_identifier)
sldb_dir

'/home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX'

In [24]:
# get a path to the scalers sub-directory
scalers_dir = '{}/scalers'.format(sldb_dir)
scalers_dir

'/home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX/scalers'

In [25]:
# DO NOT CREATE SLDB FOLDERS, THEY WERE CREATED BEFORE

In [26]:
encoder_input_columns = [
    'kw_scaled',
    'sin_hours_from_start',
    'cos_hours_from_start',
    'sin_hour_day',
    'cos_hour_day',
    'sin_day_week',
    'cos_day_week',
    # 'sin_day_month',
    # 'cos_day_month',
    # 'sin_day_year',
    # 'cos_day_year'
]

In [27]:
# both the encoder input and the decoder input use the same columns from the source sub_series dataframe
decoder_input_columns = encoder_input_columns

In [28]:
target_columns = ['kw_scaled']

In [29]:
id_columns = ['token_id']

In [30]:
# a dictionary to manage data per individual customer_id
data = dict()

In [31]:
# a dictionary to store the number of examples per customer_id, stage
count = dict()

In [32]:
# the number of cores available for training in Cloud TPU
num_cores = 8

In [33]:
# there are 21 datasets to be trimmed for SLDB production

token_ids = [66,
             106, 107, 108, 109, 110, 111, 112, 113, 115, 116,
             117, 120, 121, 122, 133, 160, 178, 181, 337, 347]

In [34]:
customer_ids = ['MT_{:03d}'.format(token_id) for token_id in token_ids]

In [35]:
# are we training over raw data or preprocessed data?
state = 'raw'

In [36]:
# read the pickle of every selected customer (raw or preprocessed, according to state)
# and register the resulting dataframe in the data dictionary under its customer_id
data.update(
    (customer_id, pd.read_pickle('{}/separated_{}/{}.pkl'.format(dataset_path, state, customer_id)))
    for customer_id in customer_ids
)

In [37]:
len(data.keys())

21

In [38]:
# a dictionary for trimming dates per customer_id
train_start_date = {
    'MT_066': '2014-07-15 16:00:00',
    'MT_106': '2014-01-14 00:00:00',
    'MT_107': '2014-01-14 00:00:00',
    'MT_108': '2014-01-14 00:00:00',
    'MT_109': '2014-02-18 00:00:00',
    'MT_110': '2014-01-14 00:00:00',
    'MT_111': '2014-01-14 00:00:00',
    'MT_112': '2014-02-12 00:00:00',
    'MT_113': '2014-01-14 00:00:00',
    'MT_115': '2014-01-14 00:00:00',
    'MT_116': '2014-02-18 00:00:00',
    'MT_117': '2014-01-14 00:00:00',
    'MT_120': '2014-01-14 00:00:00',
    'MT_121': '2014-01-14 00:00:00',
    'MT_122': '2014-01-14 00:00:00',
    'MT_133': '2014-03-13 16:00:00',
    'MT_160': '2014-02-04 00:00:00',
    'MT_178': '2014-07-18 00:00:00',
    'MT_181': '2014-03-05 00:00:00',
    'MT_337': '2014-01-17 00:00:00',
    'MT_347': '2014-02-28 00:00:00',
}

In [51]:
# build and persist the SLDB contents for every trimmed customer:
#   - fit a MinMax scaler on train data only and persist it
#   - persist the test time series as a pickle (inference is iterative, no TFRecord needed)
#   - window-roll the train and eval intervals into examples and serialize them as TFRecord files


def _index_of_date(frame, date):
    """Return the index label of the first row of frame whose 'date' column equals date.

    Raises:
        ValueError: when the date is not present in the time series (instead of the
            uninformative IndexError raised by `.index[0]` on an empty selection).
    """
    matches = frame.index[(frame['date'] == pd.to_datetime(date)).to_numpy()]
    if len(matches) == 0:
        raise ValueError('date {} was not found in the time series'.format(date))
    return matches[0]


def _rolling_examples(frame, first_index, last_index):
    """Window-roll frame between two index labels (both inclusive) into a list of examples.

    Every window spans (m + t) consecutive index labels and is split into:
        encoder_input: the first m rows of encoder_input_columns, flattened
        decoder_input: t rows of decoder_input_columns starting at position m - 1, flattened
        target:        the last t rows of target_columns, flattened
        id:            id_columns of the first row
    As in the label-based slicing below, the index is assumed to hold consecutive integers.
    """
    window = m + t
    rolled = []
    for starting_index in np.arange(first_index, last_index - window + 2):
        # subtract 1 at the end of the slice because loc includes its upper bound
        sub_series_df = frame.loc[starting_index:starting_index + window - 1]
        rolled.append(
            {
                'encoder_input': sub_series_df[encoder_input_columns][:m].to_numpy().flatten().tolist(),
                'decoder_input': sub_series_df[decoder_input_columns][m-1:m-1+t].to_numpy().flatten().tolist(),
                'target': sub_series_df[target_columns][m:m+t].to_numpy().flatten().tolist(),
                'id': sub_series_df[id_columns][:1].to_numpy().flatten().tolist(),
            }
        )
    return rolled


# make sure the output sub-directories exist (a no-op when they were created before)
for output_dir in [scalers_dir] + ['{}/{}'.format(sldb_dir, stage) for stage in ['train', 'eval', 'test']]:
    os.makedirs(output_dir, exist_ok=True)

# the time series used to build the test dataset must start (m + t - 1) hours before
# the first test target, in order to extract features with targets
# (the last element in the decoder output)
# ranging from dates['test']['start'] to dates['test']['end'];
# for m = t = 168 this is '2014-08-18 01:00:00'
test_ts_start_date = pd.to_datetime(dates['test']['start']) - pd.Timedelta(hours=m + t - 1)

for customer_id in customer_ids:

    # initialize the examples dictionary for each customer
    examples = {
        'train': [],
        'eval': [],
        # test dataset is not passed to SLDB
        # 'test': []
    }

    # use now a reference to the dataframe in the data dictionary
    data_df = data[customer_id]

    # a sub-dictionary to keep the number of examples per customer_id, stage
    count[customer_id] = dict()

    # expand with positional encodings
    data_df['sin_hours_from_start'] = np.sin(2*np.pi*data_df.hours_from_start/total_hours)
    data_df['cos_hours_from_start'] = np.cos(2*np.pi*data_df.hours_from_start/total_hours)
    data_df['sin_hour_day'] = np.sin(2*np.pi*data_df.hour_of_day/hours_in_day)
    data_df['cos_hour_day'] = np.cos(2*np.pi*data_df.hour_of_day/hours_in_day)
    data_df['sin_day_week'] = np.sin(2*np.pi*data_df.day_of_week/days_in_week)
    data_df['cos_day_week'] = np.cos(2*np.pi*data_df.day_of_week/days_in_week)

    print('Added positional encodings to {}.'.format(customer_id))

    # get the time series indexes that delimit train, eval, and test intervals

    # train interval goes from the trimming date of the customer (see train_start_date)
    # to '2014-08-07 15:00:00'
    train_start_index = _index_of_date(data_df, train_start_date[customer_id])
    train_end_index = _index_of_date(data_df, dates['train']['end'])
    print('{} train interval: from {} on {} to {} on {}, {} lectures'.\
         format(customer_id,
                train_start_index, train_start_date[customer_id],
                train_end_index, dates['train']['end'],
                train_end_index - train_start_index + 1))

    eval_start_index = _index_of_date(data_df, dates['eval']['start'])
    eval_end_index = _index_of_date(data_df, dates['eval']['end'])
    print('{} eval interval: from {} on {} to {} on {}, {} lectures'.\
         format(customer_id,
                eval_start_index, dates['eval']['start'],
                eval_end_index, dates['eval']['end'],
                eval_end_index - eval_start_index + 1))

    # get a series for the power usage variable on the training dataset, to fit the scaler
    # set up the upper limit of this series based on a fixed date, not on a fixed value!!!
    lectures_train_data = data_df['power_usage'].loc[train_start_index:train_end_index]

    # fit a scaler only on train data
    # it is required to pass the power usage time series to a (?, 1) NumPy array
    lectures_train_data_array = np.array(lectures_train_data).reshape(-1, 1)

    # get MinMaxScaler on train data
    scaler_type = 'min_max'
    scaler = MinMaxScaler()
    fitted_scaler = scaler.fit(lectures_train_data_array)
    print('Scaler {} generated on training data for {}'.format(scaler_type, customer_id))

    # persist the scaler
    scaler_filename = '{}/{}_{}.save'.format(scalers_dir, scaler_type, customer_id)
    joblib.dump(fitted_scaler, scaler_filename)
    print('Scaler {} persisted for {}'.format(scaler_type, customer_id))

    # get an array from the variable time series (seen and unseen)
    all_data_variable_array = np.array(data_df.power_usage).reshape(-1, 1)

    # apply the scaler over all data (seen and unseen)
    # rescale, and squeeze to drop the extra dimension, then assign to the new column kw_scaled
    data_df['kw_scaled'] = np.squeeze(fitted_scaler.transform(all_data_variable_array))

    # at this moment, the individual time series are ready to be window-rolled to produce
    # sub-series/examples to serialize

    # select the SLDB columns only once per customer, instead of once per rolled window
    sldb_df = data_df[sldb_columns]

    # BSCTRFM inference process is not direct, but iterative, therefore
    # no TFRecord SLDB is required for test dataset,

    test_start_index = _index_of_date(data_df, dates['test']['start'])
    test_end_index = _index_of_date(data_df, dates['test']['end'])
    test_ts_start_index = _index_of_date(data_df, test_ts_start_date)

    # persist only the time series corresponding to the inference interval as test dataset
    test_time_series = sldb_df.loc[test_ts_start_index:test_end_index]

    print('{} test interval: from {} on {} to {} on {}, {} lectures'.\
         format(customer_id,
                test_ts_start_index, test_ts_start_date,
                test_end_index, dates['test']['end'],
                test_end_index - test_ts_start_index + 1))

    # path to persist the time series dataframe corresponding to test dataset
    path = '{}/test/{}.pkl'.format(sldb_dir, customer_id)

    test_time_series.to_pickle(path)
    print('Test dataset persisted as a time series pickle for {}'.format(customer_id))

    # make SLDB training dataset with all the possible sub-series in the train interval
    examples['train'] = _rolling_examples(sldb_df, train_start_index, train_end_index)
    print('{} processed. The number of examples in {} dataset is {}'.\
          format(customer_id, 'train', len(examples['train'])))

    # make SLDB evaluation dataset with all the possible sub-series in the eval interval
    examples['eval'] = _rolling_examples(sldb_df, eval_start_index, eval_end_index)
    print('{} processed. The number of examples in {} dataset is {}'.\
          format(customer_id, 'eval', len(examples['eval'])))

    # DO NOT PRODUCE A TEST DATASET FOR SLDB, AS INFERENCE PROCESS IS NOT DIRECT
    # (IT IS ITERATIVE OVER UNSEEN DATA TIME SERIES, ALREADY PERSISTED AS A PICKLE FILE)

    # on each customer dataset, adjust the number of examples to the number of training cores
    for stage in ['train', 'eval']:
        # how many examples/rows must be removed from examples[stage] to comply with the number of cores
        examples_to_remove = len(examples[stage])%num_cores

        # remove the last 'examples_to_remove' examples from the dataset
        # (guarded, because a [-0:] slice would delete the whole list)
        if examples_to_remove:
            del examples[stage][-examples_to_remove:]

        # keep a record of the number of training and evaluation examples
        count[customer_id][stage] = len(examples[stage])
        print('For {} cores in Cloud TPU, the number of {} examples for {} was adjusted to {}'.\
             format(num_cores, stage, customer_id, len(examples[stage])))

    # serialize the rows in examples['train'] and examples['eval']
    # process each customer, then release data structures to avoid excessive memory consumption

    # write a TFRecord file for each customer_id/stage
    for stage in ['train', 'eval']:
        filename = '{}/{}/{}.tfrecord'.format(sldb_dir, stage, customer_id)

        with tf.io.TFRecordWriter(filename) as writer:
            for example_row in examples[stage]:
                example = tf.train.Example(
                    # features within the example, all of them encoded as float lists
                    features=tf.train.Features(
                        feature={
                            key: _float_feature_from_list_of_values(example_row[key])
                            for key in ['encoder_input', 'decoder_input', 'target', 'id']
                        }
                    )
                )
                writer.write(example.SerializeToString())

        # report TFRecord file as completed (the writer is already closed at this point)
        print('Persisted {} TFRecord file for {}'.format(stage, customer_id))


Added positional encodings to MT_066.
MT_066 train interval: from 1648317 on 2014-07-15 16:00:00 to 1648868 on 2014-08-07 15:00:00, 552 lectures
MT_066 eval interval: from 1648869 on 2014-08-07 16:00:00 to 1649452 on 2014-08-31 23:00:00, 584 lectures
Scaler min_max generated on training data for MT_066
Scaler min_max persisted for MT_066
MT_066 test interval: from 1649118 on 2014-08-18 01:00:00 to 1649620 on 2014-09-07 23:00:00, 503 lectures
Test dataset persisted as a time series pickle for MT_066
MT_066 processed. The number of examples in train dataset is 217
MT_066 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_066 was adjusted to 216
For 8 cores in Cloud TPU, the number of eval examples for MT_066 was adjusted to 248
Persisted train TFRecord file for MT_066
Persisted eval TFRecord file for MT_066
Added positional encodings to MT_106.
MT_106 train interval: from 2668763 on 2014-01-14 00:00:00 to 2673698 on 2014

MT_115 processed. The number of examples in train dataset is 4601
MT_115 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_115 was adjusted to 4600
For 8 cores in Cloud TPU, the number of eval examples for MT_115 was adjusted to 248
Persisted train TFRecord file for MT_115
Persisted eval TFRecord file for MT_115
Added positional encodings to MT_116.
MT_116 train interval: from 2769573 on 2014-02-18 00:00:00 to 2773668 on 2014-08-07 15:00:00, 4096 lectures
MT_116 eval interval: from 2773669 on 2014-08-07 16:00:00 to 2774252 on 2014-08-31 23:00:00, 584 lectures
Scaler min_max generated on training data for MT_116
Scaler min_max persisted for MT_116
MT_116 test interval: from 2773918 on 2014-08-18 01:00:00 to 2774420 on 2014-09-07 23:00:00, 503 lectures
Test dataset persisted as a time series pickle for MT_116
MT_116 processed. The number of examples in train dataset is 3761
MT_116 processed. The number of examples in ev

Persisted train TFRecord file for MT_181
Persisted eval TFRecord file for MT_181
Added positional encodings to MT_337.
MT_337 train interval: from 9596915 on 2014-01-17 00:00:00 to 9601778 on 2014-08-07 15:00:00, 4864 lectures
MT_337 eval interval: from 9601779 on 2014-08-07 16:00:00 to 9602362 on 2014-08-31 23:00:00, 584 lectures
Scaler min_max generated on training data for MT_337
Scaler min_max persisted for MT_337
MT_337 test interval: from 9602028 on 2014-08-18 01:00:00 to 9602530 on 2014-09-07 23:00:00, 503 lectures
Test dataset persisted as a time series pickle for MT_337
MT_337 processed. The number of examples in train dataset is 4529
MT_337 processed. The number of examples in eval dataset is 249
For 8 cores in Cloud TPU, the number of train examples for MT_337 was adjusted to 4528
For 8 cores in Cloud TPU, the number of eval examples for MT_337 was adjusted to 248
Persisted train TFRecord file for MT_337
Persisted eval TFRecord file for MT_337
Added positional encodings to M

In [52]:
count

{'MT_106': {'train': 4600, 'eval': 248},
 'MT_066': {'train': 216, 'eval': 248},
 'MT_107': {'train': 4600, 'eval': 248},
 'MT_108': {'train': 4600, 'eval': 248},
 'MT_109': {'train': 3760, 'eval': 248},
 'MT_110': {'train': 4600, 'eval': 248},
 'MT_111': {'train': 4600, 'eval': 248},
 'MT_112': {'train': 3904, 'eval': 248},
 'MT_113': {'train': 4600, 'eval': 248},
 'MT_115': {'train': 4600, 'eval': 248},
 'MT_116': {'train': 3760, 'eval': 248},
 'MT_117': {'train': 4600, 'eval': 248},
 'MT_120': {'train': 4600, 'eval': 248},
 'MT_121': {'train': 4600, 'eval': 248},
 'MT_122': {'train': 4600, 'eval': 248},
 'MT_133': {'train': 3192, 'eval': 248},
 'MT_160': {'train': 4096, 'eval': 248},
 'MT_178': {'train': 160, 'eval': 248},
 'MT_181': {'train': 3400, 'eval': 248},
 'MT_337': {'train': 4528, 'eval': 248},
 'MT_347': {'train': 3520, 'eval': 248}}

In [53]:
# a dataframe to keep track of training examples count

In [56]:
# one [customer_id, number of training examples] row per customer in the count dictionary
buffer_list = [
    [customer_id, stage_counts['train']]
    for customer_id, stage_counts in count.items()
]

In [59]:
trimmed_df = pd.DataFrame(buffer_list, columns=['customer_id', 'train'])

In [62]:
trimmed_df = trimmed_df.set_index('customer_id')

In [65]:
trimmed_df = trimmed_df.sort_index()

In [76]:
all_customer_ids = ['MT_{:03d}'.format(token_id) for token_id in np.arange(1, 370 + 1)]

In [77]:
irregular_token_ids = [
    66, 106, 107, 108, 109, 110, 111, 112, 113, 115, 116,
    117, 120, 121, 122, 133, 160, 178, 181, 223, 337, 347
]

In [78]:
irregular_customer_ids = ['MT_{:03d}'.format(token_id) for token_id in irregular_token_ids]

In [80]:
# keep the customers that are not irregular, preserving the order of all_customer_ids
# so the list is deterministic across runs (the iteration order of a set of strings is not)
irregular_customer_id_set = set(irregular_customer_ids)
regular_customer_ids = [
    customer_id for customer_id in all_customer_ids
    if customer_id not in irregular_customer_id_set
]

In [82]:
len(regular_customer_ids)

348

In [83]:
# every regular customer gets the same number of training examples
# NOTE(review): 4912 presumably is 5248 train lectures - (m + t) + 1 = 4913 windows,
# cut down to a multiple of num_cores (8) -- confirm against the untrimmed SLDB
buffer_list = [[customer_id, 4912] for customer_id in regular_customer_ids]

In [85]:
regular_df = pd.DataFrame(buffer_list, columns=['customer_id', 'train'])

In [88]:
regular_df = regular_df.set_index('customer_id')

In [89]:
regular_df = regular_df.sort_index()

In [90]:
regular_df

Unnamed: 0_level_0,train
customer_id,Unnamed: 1_level_1
MT_001,4912
MT_002,4912
MT_003,4912
MT_004,4912
MT_005,4912
...,...
MT_366,4912
MT_367,4912
MT_368,4912
MT_369,4912


In [92]:
trainable_df = pd.concat([trimmed_df, regular_df])

In [120]:
pd.set_option('display.max_rows', 200)

In [105]:
trainable_df = trainable_df.sort_index()

In [121]:
trainable_df

Unnamed: 0_level_0,train
customer_id,Unnamed: 1_level_1
MT_001,4912
MT_002,4912
MT_003,4912
MT_004,4912
MT_005,4912
...,...
MT_366,4912
MT_367,4912
MT_368,4912
MT_369,4912


In [117]:
np.sum(trainable_df.loc['MT_001':'MT_370']['train']) == np.sum(regular_df['train']) + np.sum(trimmed_df['train'])

True

In [115]:
# persist the dataframe
# build the path from sldb_dir, so it cannot drift from the SLDB identifier defined above
trainable_df.to_pickle('{}/example_count.pkl'.format(sldb_dir))

In [8]:
# reload the example counts persisted in the SLDB directory
# NOTE(review): the execution counter restarts at this cell, so it presumably runs on a fresh
# kernel where sldb_dir is not defined, hence the literal path -- confirm
trainable_df = pd.read_pickle(
    '/home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX/example_count.pkl'
)

In [9]:
pd.set_option('display.max_rows', 400)

In [10]:
trainable_df

Unnamed: 0_level_0,train
customer_id,Unnamed: 1_level_1
MT_001,4912
MT_002,4912
MT_003,4912
MT_004,4912
MT_005,4912
MT_006,4912
MT_007,4912
MT_008,4912
MT_009,4912
MT_010,4912


In [11]:
np.sum(trainable_df.loc['MT_001':'MT_100']['train'])

486504

In [12]:
np.sum(trainable_df.loc['MT_001':'MT_200']['train'])

962160

In [13]:
np.sum(trainable_df.loc['MT_001':'MT_370']['train'])

1790512