In [1]:
import os

import pandas as pd

In [2]:
# List the contents of the shared data directory (IPython shell escape).
!ls ../../data/

README.rmd             [34mcluster[m[m                input_data_1998.csv
[34mXy_internal_split_data[m[m [34mdev[m[m


In [42]:
# Directory that is scanned for the per-split X_* / y_* CSV files.
DATA_DIR = '../../data/Xy_internal_split_data'

# For each split tag, collect the sorted names of the files in DATA_DIR
# whose name contains that tag.
(train_X_files, valid_X_files, test_X_files,
 train_y_files, valid_y_files, test_y_files) = [
    sorted(name for name in os.listdir(DATA_DIR) if tag in name)
    for tag in ('X_train', 'X_valid', 'X_test',
                'y_train', 'y_valid', 'y_test')]

# Show which files were matched for every split.
print('Train:\n ', train_X_files, '\n ', train_y_files)
print('Valid:\n ', valid_X_files, '\n ', valid_y_files)
print('Test:\n ', test_X_files, '\n ', test_y_files)

Train:
  ['X_train_1998.csv'] 
  ['y_train_1998.csv']
Valid:
  ['X_valid_1998.csv'] 
  ['y_valid_1998.csv']
Test:
  ['X_test_1998.csv'] 
  ['y_test_1998.csv']


In [4]:
# Sanity check: read the first file of each split and confirm that stacking
# them yields the expected total number of rows.
# NOTE(review): this cell originally referenced `train_files`, `valid_files`
# and `test_files`, which are not defined anywhere in the notebook (it fails
# on a fresh kernel). The X_* file lists are assumed to be the intended
# inputs -- confirm.
tr = pd.read_csv(os.path.join(DATA_DIR, train_X_files[0]))
vl = pd.read_csv(os.path.join(DATA_DIR, valid_X_files[0]))
ts = pd.read_csv(os.path.join(DATA_DIR, test_X_files[0]))

print(tr.shape, vl.shape, ts.shape)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames
# row-wise and, like append, keeps each frame's original index.
out = pd.concat([tr, vl, ts])
print(out.shape)

(194820, 39) (41785, 39) (41305, 39)
(277910, 39)


In [43]:
def load_data_set(set_files, data_dir=None):
    """Read one or more CSV files and stack them row-wise into one DataFrame.

    Args:
        set_files: sequence of CSV file names, relative to ``data_dir``.
            The sequence is not modified.
        data_dir: directory containing the files. Defaults to the
            notebook-level ``DATA_DIR`` when omitted.

    Returns:
        A DataFrame with the rows of every file, in the order the files
        are listed. Each chunk keeps its own row index (no re-indexing).

    Raises:
        ValueError: if ``set_files`` is empty.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    # Copy the names so the caller's list is left intact; the previous
    # implementation pop()-ed from it, which emptied the caller's list and
    # placed the last file ahead of the others.
    file_names = list(set_files)
    if not file_names:
        raise ValueError('load_data_set() needs at least one file name')
    print('Loading data from %s...' % (file_names,))
    chunks = [pd.read_csv(os.path.join(data_dir, name)) for name in file_names]
    # Single concat instead of repeated DataFrame.append (removed in
    # pandas 2.0, and quadratic when called in a loop).
    return pd.concat(chunks)

In [44]:
# Load the feature frames first, then the target frames, one per split.
X_train, X_valid, X_test = [
    load_data_set(names)
    for names in (train_X_files, valid_X_files, test_X_files)]
y_train, y_valid, y_test = [
    load_data_set(names)
    for names in (train_y_files, valid_y_files, test_y_files)]

# Pair every feature frame with its target frame: [train, valid, test].
data_sets = [
    list(pair)
    for pair in zip((X_train, X_valid, X_test), (y_train, y_valid, y_test))]

Loading data from ['X_train_1998.csv']...
Loading data from ['X_valid_1998.csv']...
Loading data from ['X_test_1998.csv']...
Loading data from ['y_train_1998.csv']...
Loading data from ['y_valid_1998.csv']...
Loading data from ['y_test_1998.csv']...


In [45]:
# Columns that receive a quadratic term: add_squares() adds '<name>_sq' for
# every entry that is present in the data set it is given.
SQUARE = [
    'lon', 'lat', 'etopo1', 'age', 'density', 'JanTmin', 'MarTmin', 
    'TMarAug', 'summerTmean', 'AugTmean', 'AugTmax', 'GSP', 'PMarAug',
    'summerP0', 'OctTmin', 'fallTmean', 'winterTmin', 'Tmin', 'Tmean', 
    'Tvar', 'TOctSep', 'summerP1', 'summerP2', 'Pmean', 'POctSep', 
    'PcumOctSep', 'PPT', 'drop0', 'drop5', 'ddAugJul', 'ddAugJun']
# Columns that receive a cubic term: add_cubes() adds '<name>_cub' for every
# entry that is present in the data set it is given.
CUBE = ['age', 'density', 'drop0', 'drop5']
# Pairwise interaction terms written as 'a:b'. add_interactions() splits each
# entry on ':' and, when both columns are present, adds a column named after
# the full 'a:b' string holding the element-wise product.
INTERACTIONS = [
    'lon:TMarAug', 'lon:AugTmean', 'lon:AugTmax', 'lon:OctTmin',
    'lon:Tmean', 'lon:TOctSep', 'lat:ddAugJul', 'lat:ddAugJun', 
    'density:summerP0', 'density:summerP1', 'density:summerP2', 
    'JanTmin:summerTmean', 'JanTmin:AugTmean', 'JanTmin:AugTmax',
    'JanTmin:ddAugJul', 'JanTmin:ddAugJun', 'MarTmin:AugTmean', 
    'MarTmin:AugTmax', 'TMarAug:ddAugJul', 'TMarAug:ddAugJun', 
    'summerTmean:OctTmin', 'summerTmean:winterTmin', 'summerTmean:Tmin',
    'summerTmean:ddAugJul', 'summerTmean:ddAugJun', 'AugTmean:winterTmin',
    'AugTmean:ddAugJul', 'AugTmean:ddAugJun', 'AugTmax:ddAugJun',
    'GSP:summerP0', 'GSP:summerP1', 'GSP:summerP2', 'GSP:Pmean',
    'GSP:POctSep', 'GSP:PcumOctSep', 'GSP:PPT', 'PMarAug:summerP0',
    'PMarAug:summerP2', 'PMarAug:POctSep', 'PMarAug:PcumOctSep', 
    'PMarAug:PPT', 'OctTmin:ddAugJul', 'OctTmin:ddAugJun', 
    'fallTmean:ddAugJun', 'winterTmin:ddAugJul', 'winterTmin:ddAugJun',
    'Tmin:ddAugJun', 'summerP1:POctSep', 'summerP1:PcumOctSep', 
    'summerP1:PPT']

# Check for typos in the term lists above: print every base column name used
# in SQUARE, CUBE or INTERACTIONS that is not a column of the training
# features. No output means every name was found.
# NOTE(review): the original compared against `train`, which is never defined
# in this notebook; `X_train` is assumed to be the intended frame -- confirm.
for term in SQUARE + CUBE + INTERACTIONS:
    for t in term.split(':'):
        if t not in X_train.columns:
            print(t)

In [46]:
def add_squares(data_set, fields=None):
    """Add a squared copy of selected columns to ``data_set``.

    For every name in ``fields`` that is a column of ``data_set``, a new
    column ``'<name>_sq'`` holding the element-wise square is added. Names
    that are not columns are skipped silently.

    Args:
        data_set: DataFrame to extend. It is modified in place.
        fields: iterable of column names to square. Defaults to the
            notebook-level ``SQUARE`` list when omitted.

    Returns:
        The same ``data_set`` object, with the new columns added.
    """
    if fields is None:
        fields = SQUARE
    print('Adding quadratic terms...')
    for field in fields:
        # Membership is tested on the column index directly rather than on
        # a list rebuilt from the frame on every iteration.
        if field in data_set.columns:
            data_set['%s_sq' % field] = data_set[field] ** 2
    return data_set

In [47]:
def add_cubes(data_set, fields=None):
    """Add a cubed copy of selected columns to ``data_set``.

    For every name in ``fields`` that is a column of ``data_set``, a new
    column ``'<name>_cub'`` holding the element-wise cube is added. Names
    that are not columns are skipped silently.

    Args:
        data_set: DataFrame to extend. It is modified in place.
        fields: iterable of column names to cube. Defaults to the
            notebook-level ``CUBE`` list when omitted.

    Returns:
        The same ``data_set`` object, with the new columns added.
    """
    if fields is None:
        fields = CUBE
    print('Adding cubic terms...')
    for field in fields:
        # Membership is tested on the column index directly rather than on
        # a list rebuilt from the frame on every iteration.
        if field in data_set.columns:
            data_set['%s_cub' % field] = data_set[field] ** 3
    return data_set

In [48]:
def add_interactions(data_set, interactions=None):
    """Add pairwise product columns to ``data_set``.

    Each interaction is a string of the form ``'a:b'``. When both ``a`` and
    ``b`` are columns of ``data_set``, a new column named exactly ``'a:b'``
    is added holding the element-wise product; otherwise the term is skipped.

    Args:
        data_set: DataFrame to extend. It is modified in place.
        interactions: iterable of ``'a:b'`` strings. Defaults to the
            notebook-level ``INTERACTIONS`` list when omitted.

    Returns:
        The same ``data_set`` object, with the new columns added.

    Raises:
        ValueError: if a term does not split into exactly two names on ':'.
    """
    if interactions is None:
        interactions = INTERACTIONS
    print('Adding interactions...')
    for term in interactions:
        left, right = term.split(':')
        if left in data_set.columns and right in data_set.columns:
            data_set[term] = data_set[left] * data_set[right]
    return data_set

In [49]:
def add_all_cols(data_set):
    """Run the square, cube and interaction expansions on ``data_set``.

    The three helpers are applied in that order, each receiving the result
    of the previous one; the final result is returned.
    """
    for expand in (add_squares, add_cubes, add_interactions):
        data_set = expand(data_set)
    return data_set

In [50]:
# Tiny hand-checkable frame to eyeball the derived columns.
test = add_all_cols(pd.DataFrame({
    'lon': [1, 2, 3],
    'lat': [4, 5, 6],
    'age': [-2, 2, 3],
    'Tmean': [0, 1, 5],
}))
test.head()

Adding quadratic terms...
Adding cubic terms...
Adding interactions...


Unnamed: 0,Tmean,age,lat,lon,lon_sq,lat_sq,age_sq,Tmean_sq,age_cub,lon:Tmean
0,0,-2,4,1,1,16,4,0,-8,0
1,1,2,5,2,4,25,4,1,8,2
2,5,3,6,3,9,36,9,25,27,15


In [53]:
def construct_model_matrices():
    """Locate, load and expand the train / valid / test data sets.

    Lists the files in ``DATA_DIR`` whose names contain ``X_<split>`` or
    ``y_<split>``, prints what was found, loads each group with
    ``load_data_set`` and runs ``add_all_cols`` on every feature frame.

    Returns:
        A list ``[[X_train, y_train], [X_valid, y_valid], [X_test, y_test]]``
        in which each X frame carries the added columns.
    """
    def files_matching(tag):
        # Sorted names of the files in DATA_DIR that contain `tag`.
        return sorted(name for name in os.listdir(DATA_DIR) if tag in name)

    splits = ('train', 'valid', 'test')
    X_files = {split: files_matching('X_%s' % split) for split in splits}
    y_files = {split: files_matching('y_%s' % split) for split in splits}

    headings = (('Train:\n ', 'train'),
                ('Valid:\n ', 'valid'),
                ('Test:\n ', 'test'))
    for heading, split in headings:
        print(heading, X_files[split], '\n ', y_files[split])

    # All feature frames are loaded before any target frame.
    X_frames = [load_data_set(X_files[split]) for split in splits]
    y_frames = [load_data_set(y_files[split]) for split in splits]

    return [[add_all_cols(X), y] for X, y in zip(X_frames, y_frames)]

In [54]:
# Build the [X, y] pairs for the train, valid and test splits in one call;
# the X frames come back with the square, cube and interaction columns added.
data_sets = construct_model_matrices()

Train:
  ['X_train_1998.csv'] 
  ['y_train_1998.csv']
Valid:
  ['X_valid_1998.csv'] 
  ['y_valid_1998.csv']
Test:
  ['X_test_1998.csv'] 
  ['y_test_1998.csv']
Loading data from ['X_train_1998.csv']...
Loading data from ['X_valid_1998.csv']...
Loading data from ['X_test_1998.csv']...
Loading data from ['y_train_1998.csv']...
Loading data from ['y_valid_1998.csv']...
Loading data from ['y_test_1998.csv']...
Adding quadratic terms...
Adding cubic terms...
Adding interactions...
Adding quadratic terms...
Adding cubic terms...
Adding interactions...
Adding quadratic terms...
Adding cubic terms...
Adding interactions...


In [58]:
# Sanity check: uncomment to list the column names of the training X and y frames.
#list(data_sets[0][0])
#list(data_sets[0][1])