In [1]:
import os
import re

import numpy as np
import pandas as pd

In [36]:
# Directory that holds the raw yearly input files.
DATA_DIR = '../DATA'
# Directory the split X/y CSV files are written to.
OUTPUT_DIR = '%s/Xy_internal_split_data' % DATA_DIR
# File names of the form input_data_<4 digits>.csv (re.match anchors the start,
# the trailing `$` anchors the end).
FILE_FORMAT = r'input_data_[0-9]{4}\.csv$'
# os.listdir() returns entries in arbitrary order; sort them so that
# input_files[0] refers to the same file on every run.
input_files = sorted(f for f in os.listdir(DATA_DIR)
                     if re.match(FILE_FORMAT, f))
input_files

['input_data_1998.csv']

In [30]:
# Load the first matching input file. The year is whatever follows the last
# underscore in the path, with the '.csv' suffix removed.
file_path = '%s/%s' % (DATA_DIR, input_files[0])
year = file_path.rsplit('_', 1)[-1].replace('.csv', '')
print(year)
dat = pd.read_csv(file_path)
dat.head()

1998


Unnamed: 0,btl_t,x,y,lon,lat,etopo1,btl_t1,btl_t2,vgt,age,...,summerP2,Pmean,POctSep,PcumOctSep,PPT,drop0,drop5,ddAugJul,ddAugJun,year
0,0,2690000,-4510000,-77.292019,5.124395,67,0,0,0,0,...,3002.060034,434.518054,5915.816675,11886.133334,35787.400305,73,16,9612.244141,8780.226562,1998
1,0,2700000,-4510000,-77.208582,5.099891,61,0,0,0,0,...,3052.161845,447.904489,6051.102149,12214.262818,36553.805033,73,16,9715.162109,8873.277344,1998
2,0,2710000,-4510000,-77.125153,5.075297,67,0,0,0,0,...,3069.528098,455.863507,6158.877756,12404.550359,37061.989844,59,30,9726.484375,8884.317383,1998
3,0,2720000,-4510000,-77.041733,5.050615,26,0,0,0,0,...,3098.452174,466.844118,6302.817738,12666.490204,37755.201136,56,33,9827.635742,8975.438477,1998
4,0,2730000,-4510000,-76.958322,5.025843,43,0,0,0,0,...,3056.942266,468.097308,6329.933573,12694.101486,37835.465638,52,37,9743.973633,8900.458984,1998


In [4]:
# Extent of the raw (not yet 0-based) coordinates.
print('X range: [%d, %d]' % (dat['x'].min(), dat['x'].max()))
print('Y range: [%d, %d]' % (dat['y'].min(), dat['y'].max()))

X range: [-5730000, 5000000]
Y range: [-4510000, 4480000]


In [5]:
def convert_data_to_0_base(data, cell_dim):
    '''
    Rescale the x and y columns to 0-based integer cell indices.

    Each coordinate is shifted so the column minimum becomes 0, divided by
    cell_dim, and cast to int (which truncates any fractional part).

    Args:
      data: DataFrame with numeric `x` and `y` columns.
      cell_dim: number: size of one grid cell in the units of x and y
        (presumably the spacing of the coordinate grid -- confirm with caller).
    Returns: DataFrame: a copy of `data` with x and y replaced; the frame that
      was passed in is left unmodified.
    '''
    # Work on a copy so the caller's frame keeps its original coordinates.
    rescaled = data.copy()
    for axis in ('x', 'y'):
        coords = rescaled[axis]
        # Series.min() is vectorised, unlike the builtin min().
        rescaled[axis] = ((coords - coords.min()) / cell_dim).astype(int)
    return rescaled

In [6]:
def get_proportional_internal_block(height, width, proportion):
    '''
    Compute a centred rectangle inside a height x width grid.

    Each side of the rectangle is `proportion` times the corresponding side
    of the grid (truncated to int), and the rectangle is centred using
    floor division for the offsets.

    NOTE(review): because both sides are scaled by `proportion`, the
    rectangle covers roughly proportion**2 of the grid's cells. `deficit`
    reports how many cells short it is of round(height * width * proportion).

    Args:
      height, width: number: grid dimensions in cells.
      proportion: float: scale factor applied to each side.
    Returns: dict with
      'x_range': [start, stop] along x,
      'y_range': [start, stop] along y,
      'deficit': target cell count minus the rectangle's cell count.
    '''
    target_cells = int(round(height * width * proportion))
    block_h = int(proportion * height)
    block_w = int(proportion * width)
    left = (width - block_w) // 2
    top = (height - block_h) // 2
    return {
        'x_range': [left, left + block_w],
        'y_range': [top, top + block_h],
        'deficit': target_cells - (block_h * block_w),
    }

In [7]:
def split_data_by_xy_ranges(dat, x_range, y_range):
    '''
    Separate rows lying inside a rectangle from rows lying outside it.

    A row is "inside" when x_range[0] <= x < x_range[1] and
    y_range[0] <= y < y_range[1]; it is "outside" when at least one of
    those four bounds is violated.

    Args:
      dat: DataFrame with `x` and `y` columns (not modified).
      x_range, y_range: indexable pairs [start, stop]; stop is exclusive.
    Returns: (inner_data, outer_data): two DataFrames with all columns.
    '''
    frame = dat.copy()
    x_lo, x_hi = x_range[0], x_range[1]
    y_lo, y_hi = y_range[0], y_range[1]
    xs, ys = frame.x, frame.y
    inside = xs.ge(x_lo) & xs.lt(x_hi) & ys.ge(y_lo) & ys.lt(y_hi)
    outside = xs.lt(x_lo) | xs.ge(x_hi) | ys.lt(y_lo) | ys.ge(y_hi)
    return frame.loc[inside, :], frame.loc[outside, :]

In [8]:
def print_ranges(data):
    '''
    Print the minimum and maximum of the x and y columns on a single line.

    Args: data: DataFrame with `x` and `y` columns
    Returns: None
    '''
    x_col, y_col = data.x, data.y
    bounds = (x_col.min(), x_col.max(), y_col.min(), y_col.max())
    print(' x range: [%s, %s]\ty range: [%s, %s]' % bounds)

In [9]:
def print_data_split(X_train, y_train, X_valid, y_valid, X_test, y_test):
    '''
    Report the shape of every X/y pair, each followed by the x/y extent of
    its predictors (via print_ranges).

    Args: (All): DataFrame: predictors (X_*) and response (y_*) of the
      train, validation, and test sets
    Returns: None
    '''
    train_shapes = (X_train.shape, y_train.shape)
    print('Data split into:\n X_train: %s  y_train: %s' % train_shapes)
    print_ranges(X_train)

    remaining = (
        ('\n X_valid: %s  y_valid: %s', X_valid, y_valid),
        ('\n X_test:  %s  y_test:  %s', X_test, y_test),
    )
    for template, predictors, target in remaining:
        print(template % (predictors.shape, target.shape))
        print_ranges(predictors)

In [40]:
def split_predictors_response(dat, response):
    '''
    Separate the response column(s) from the remaining predictor columns.

    Args:
      dat: DataFrame: full data set (not modified; a copy is used).
      response: column label(s) to pull out as the response.
    Returns: (X, y): X is the frame without `response`; y is always a
      DataFrame, even when `response` is a single label.
    '''
    frame = dat.copy()
    target = frame.loc[:, response]
    predictors = frame.drop(response, axis=1)
    return predictors, pd.DataFrame(target)

In [41]:
def split_data_internal(dat, response, proportions, cell_dim):
    '''
    Split `dat` into three spatially nested sets and separate each into
    predictors and response.

    The x/y coordinates are first converted to 0-based cell indices. Two
    centred rectangles are then computed with
    get_proportional_internal_block():
      * an outer box scaled by (valid + test), and
      * an inner box inside it, scaled by test / (test + valid).
    Rows inside the inner box form the "test" set, rows inside the outer box
    but not the inner one form the "valid" set, and all remaining rows form
    the "train" set.

    NOTE(review): the original docstring deferred to a split_data() wrapper
    that is not defined in the visible part of this notebook.
    NOTE(review): the grid width/height passed to the box helper are the
    maximum 0-based indices, i.e. one less than the number of cells.

    Args:
      dat: DataFrame with `x`, `y` and `response` columns (not modified).
      response: column label of the response variable.
      proportions: sequence of three numbers (train, valid, test); only the
        valid and test entries are used to size the boxes.
      cell_dim: number: grid cell size passed to convert_data_to_0_base().
    Returns: list: [[X_train, y_train], [X_valid, y_valid], [X_test, y_test]]
    '''
    # Copy first so the caller's coordinates are never rewritten.
    data = convert_data_to_0_base(dat.copy(), cell_dim)
    _train, valid, test = proportions

    test_valid_box = get_proportional_internal_block(
        width=data.x.max(), height=data.y.max(), proportion=valid + test)
    tv_x = test_valid_box['x_range']
    tv_y = test_valid_box['y_range']

    # The inner box is computed relative to the outer box's own origin...
    test_box = get_proportional_internal_block(
        width=tv_x[1] - tv_x[0],
        height=tv_y[1] - tv_y[0],
        proportion=test / (test + valid))
    # ...so shift it back into grid coordinates. This is done element-wise:
    # `list += offset` only works when the offset is a NumPy scalar and
    # raises TypeError for a plain int.
    test_x = [edge + tv_x[0] for edge in test_box['x_range']]
    test_y = [edge + tv_y[0] for edge in test_box['y_range']]

    test_set, tv_set = split_data_by_xy_ranges(data, test_x, test_y)
    valid_set, train_set = split_data_by_xy_ranges(tv_set, tv_x, tv_y)

    X_train, y_train = split_predictors_response(train_set, response)
    X_valid, y_valid = split_predictors_response(valid_set, response)
    X_test,  y_test  = split_predictors_response(test_set,  response)

    return [[X_train, y_train], [X_valid, y_valid], [X_test, y_test]]

In [42]:
# Proportions for the three regions; `a` is whatever remains after b and c.
c = 0.62
b = 0.22
a = 1 - b - c

# split_data_internal() unpacks the array as (train, valid, test), so `a`
# fills its train slot, `b` its valid slot and `c` its test slot; only the
# valid and test entries are used to size the nested boxes.
# cell_dim=10000 matches the 10000-unit step between consecutive x values
# shown by dat.head().
split_data = split_data_internal(
    dat, 'btl_t', np.array([a, b, c]), cell_dim=10000)

In [43]:
# NOTE(review): split_data_internal() returns the pairs in the order
# [train, valid, test], but they are unpacked here in the reverse order, so
# X_test/y_test receive the first pair and X_train/y_train the last one.
# save_files() labels the pairs in this same test, valid, train order.
[[X_test, y_test], [X_valid, y_valid], [X_train, y_train]] = split_data
print_data_split(X_train, y_train, X_valid, y_valid, X_test, y_test)

Data split into:
 X_train: (194820, 39)  y_train: (194820, 1)
 x range: [204, 868]	y range: [171, 727]

 X_valid: (41785, 39)  y_valid: (41785, 1)
 x range: [141, 986]	y range: [72, 826]

 X_test:  (41305, 39)  y_test:  (41305, 1)
 x range: [0, 1073]	y range: [0, 899]


In [44]:
# split_predictors_response() wraps the response in pd.DataFrame, so this is
# a DataFrame rather than a Series.
type(y_train)

pandas.core.frame.DataFrame

In [45]:
# Preview the predictors; x and y here are the 0-based cell indices.
X_train.head()

Unnamed: 0,x,y,lon,lat,etopo1,btl_t1,btl_t2,vgt,age,density,...,summerP2,Pmean,POctSep,PcumOctSep,PPT,drop0,drop5,ddAugJul,ddAugJun,year
38558,461,171,-110.753897,23.897786,37,0,0,0,0,3897.362549,...,260.140003,27.3025,300.776673,671.006675,883.836676,43,46,8429.333008,7625.910156,1998
38559,462,171,-110.658523,23.90971,68,0,0,0,0,3513.256348,...,297.770211,28.710834,317.206631,697.693745,975.010218,43,46,8420.001953,7615.243652,1998
38560,463,171,-110.563132,23.921529,125,0,0,0,0,4133.384277,...,345.010408,31.867925,355.08339,791.117443,1135.790492,43,46,8361.196289,7555.813477,1998
38561,464,171,-110.467724,23.933242,174,0,0,0,0,3205.625244,...,380.35265,33.705774,378.87803,854.480301,1224.445529,43,46,8351.758789,7541.419922,1998
38562,465,171,-110.372299,23.944849,96,0,0,0,0,4849.293457,...,385.795531,33.724199,383.239286,876.431076,1211.50791,42,47,8430.083984,7608.864258,1998


In [47]:
# Preview the response column that was split off from the predictors.
y_train.head()

Unnamed: 0,btl_t
38558,0
38559,0
38560,0
38561,0
38562,0


In [37]:
def save_files(year, split_data):
    '''
    Write every X/y frame of a split to its own CSV file in OUTPUT_DIR.

    Files are named <X|y>_<set>_<year>.csv. The set label is taken by
    position: the first pair in `split_data` is written as 'test', the
    second as 'valid' and the third as 'train'. Frames are written without
    their index.

    Args:
      year: value interpolated into the file names.
      split_data: sequence of three [X, y] pairs of DataFrames.
    Returns: True
    '''
    if not os.path.exists(OUTPUT_DIR):
        # makedirs also creates missing parent directories.
        os.makedirs(OUTPUT_DIR)
    set_names = ['test', 'valid', 'train']
    xy_names = ['X', 'y']
    for data_set, set_name in zip(split_data, set_names):
        for xy, xy_name in zip(data_set, xy_names):
            path = '%s/%s_%s_%s.csv' % (OUTPUT_DIR, xy_name, set_name, year)
            # %-formatting prints one plain line under both Python 2 and 3;
            # print('...', path) prints a tuple under Python 2.
            print('Writing data to %s' % path)
            xy.to_csv(path, index=False)
    return True

In [38]:
# Write the six CSV files. save_files() labels the pairs of split_data by
# position as test, valid, train.
save_files(year, split_data)

('Writing data to ', '../DATA/Xy_internal_split_data/X_test_1998.csv')
('Writing data to ', '../DATA/Xy_internal_split_data/y_test_1998.csv')
('Writing data to ', '../DATA/Xy_internal_split_data/X_valid_1998.csv')
('Writing data to ', '../DATA/Xy_internal_split_data/y_valid_1998.csv')
('Writing data to ', '../DATA/Xy_internal_split_data/X_train_1998.csv')
('Writing data to ', '../DATA/Xy_internal_split_data/y_train_1998.csv')
