In [1]:
import os
import sys
import pandas as pd

sys.path.append('../models')

import model_utils as util

In [2]:
# Relative path of the directory that util.load_data reads from and that
# save_xy is later pointed at for the re-split CSV files.
DATA_DIR = '../../data/cluster/year/'
# First year (inclusive) iterated by make_new_data_sets.
START_YEAR = 2000
# Last year (inclusive) iterated by make_new_data_sets.
# Original note: cannot add next year's data to 2014, so 2013 is last here.
# NOTE(review): make_new_data_sets uses the final year only as the source of
# 'next_year_beetle' for the year before it, so the last year that is emitted
# with features is END_YEAR - 1 -- confirm that 2013 (not 2014) is intended.
END_YEAR = 2013

In [3]:
# Unpack the (features, labels) pair of each split returned by load_data.
# The shapes shown in the cell output are presumably printed by load_data
# itself (nothing in this cell prints) -- TODO confirm.
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = util.load_data(DATA_DIR)

X_train: (529623, 24)
X_valid: (176541, 24)
X_test: (176541, 24)
y_train: (529623, 1)
y_valid: (176541, 1)
y_test: (176541, 1)


In [6]:
# Stack the three splits back into one frame so they can be re-split by year.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; row order and index labels are the
# same as with chained .append() calls.
data = pd.concat([X_train, X_valid, X_test])
y = pd.concat([y_train, y_valid, y_test])
# Attach the target as a 'beetle' column next to the features.
data['beetle'] = y

(882705, 24)
(882705, 25)


In [7]:
def make_yearly_data(data, year):
    """Split the rows of one year into features and target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least a ``year`` column and a ``beetle`` column.
    year : int
        Value of ``data.year`` to select.

    Returns
    -------
    (X_year, y_year)
        ``X_year`` holds the selected rows without the ``beetle`` column;
        ``y_year`` is a one-column DataFrame holding ``beetle`` for the same
        rows. Both keep the index labels of ``data``.

    ``data`` itself is not modified.
    """
    # Boolean-mask selection already returns a new frame, so the caller's
    # data is safe without deep-copying the whole input on every call.
    year_rows = data.loc[data.year == year, :]
    y_year = pd.DataFrame(year_rows['beetle'])
    X_year = year_rows.drop(['beetle'], axis=1)
    return X_year, y_year

In [24]:
def merge_sets(data_sets):
    """Concatenate a sequence of ``[X, y]`` pairs row-wise.

    Parameters
    ----------
    data_sets : sequence of (X, y) pairs
        Each ``X`` and ``y`` is a pandas DataFrame; pairs are stacked in the
        order given.

    Returns
    -------
    (X, y)
        All ``X`` frames stacked into one DataFrame and all ``y`` frames
        stacked into another. Original index labels are kept.

    Raises
    ------
    ValueError
        If ``data_sets`` is empty.
    """
    if not data_sets:
        raise ValueError('merge_sets() needs at least one [X, y] pair')
    X_parts = [X_part for X_part, _ in data_sets]
    y_parts = [y_part for _, y_part in data_sets]
    # One concat per side: DataFrame.append was removed in pandas 2.0, and
    # appending inside a loop re-copied the accumulated frame every iteration.
    return pd.concat(X_parts), pd.concat(y_parts)

In [25]:
def make_new_data_sets(data):
    """Re-split ``data`` by year and add next year's target as a feature.

    For every year from ``START_YEAR`` to ``END_YEAR - 1`` the features of
    that year gain a ``next_year_beetle`` column holding the ``beetle`` values
    of the following year. The final year (``END_YEAR``) is used only as the
    source of that column for the year before it.

    The first ``TEST`` years form the test split, the next ``VALID`` years the
    validation split, and the remaining years the training split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``year``, ``x``, ``y`` and ``beetle`` columns. Every year
        must list the same ``x``/``y`` values in the same row order.

    Returns
    -------
    list
        ``[[X_train, y_train], [X_valid, y_valid], [X_test, y_test]]``.

    Raises
    ------
    AssertionError
        If the number of years does not match TRAIN + VALID + TEST + 1, or if
        two consecutive years do not share the same ``x``/``y`` sequence.
    """
    TEST = 2
    VALID = 2
    TRAIN = 9
    yearly_data = [make_yearly_data(data, year)
                   for year in range(START_YEAR, END_YEAR + 1)]
    # One extra year is needed because the last year only supplies labels.
    assert TRAIN + VALID + TEST == len(yearly_data) - 1, (
        'expected %d years between START_YEAR and END_YEAR, got %d'
        % (TRAIN + VALID + TEST + 1, len(yearly_data)))

    with_beetle_data = []
    for (x_now, y_now), (x_next, y_next) in zip(yearly_data, yearly_data[1:]):
        # Rows of consecutive years must describe the same cells in the same
        # order, since the next-year values are attached by position.
        assert list(x_now.x) == list(x_next.x), 'x coordinates differ between consecutive years'
        assert list(x_now.y) == list(x_next.y), 'y coordinates differ between consecutive years'
        # .values makes the assignment positional. Assigning the Series itself
        # would align on index labels, which belong to different rows of
        # `data` for different years (NaNs, or an error on duplicate labels).
        x_now['next_year_beetle'] = y_next['beetle'].values
        with_beetle_data.append([x_now, y_now])

    test = with_beetle_data[:TEST]
    valid = with_beetle_data[TEST : TEST + VALID]
    train = with_beetle_data[TEST + VALID:]

    X_test, y_test = merge_sets(test)
    X_valid, y_valid = merge_sets(valid)
    X_train, y_train = merge_sets(train)

    return [[X_train, y_train], [X_valid, y_valid], [X_test, y_test]]

In [26]:
# Replace the splits loaded earlier with the year-based ones; each element
# returned by make_new_data_sets is an [X, y] pair.
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = make_new_data_sets(data)

In [29]:
# Show "<X shape> <y shape>" for the train, valid and test splits, one per line.
print('\n'.join(
    '{} {}'.format(features.shape, labels.shape)
    for features, labels in (
        (X_train, y_train),
        (X_valid, y_valid),
        (X_test, y_test),
    )
))

(529623, 25) (529623, 1)
(117694, 25) (117694, 1)
(117694, 25) (117694, 1)


In [37]:
def save_xy(xy, path, suffix):
    """Write a features/target pair to ``X_<suffix>.csv`` and ``y_<suffix>.csv``.

    Parameters
    ----------
    xy : pair (X, y)
        Two objects exposing ``to_csv`` (pandas DataFrames in this notebook).
    path : str
        Directory to write into, with or without a trailing separator.
    suffix : str
        Text placed between the ``X_``/``y_`` prefix and the ``.csv`` extension.
    """
    X, y = xy
    # os.path.join inserts the separator only when it is missing, so a
    # directory given without a trailing slash no longer produces a sibling
    # file such as '<dir>X_<suffix>.csv'.
    X.to_csv(os.path.join(path, 'X_' + suffix + '.csv'))
    y.to_csv(os.path.join(path, 'y_' + suffix + '.csv'))

In [38]:
# Persist each year-based split next to the source data; the '_full' suffix
# keeps these files apart from the other X_*/y_* CSVs in DATA_DIR.
save_xy(xy=(X_train, y_train), path=DATA_DIR, suffix='train_full')
save_xy(xy=(X_valid, y_valid), path=DATA_DIR, suffix='valid_full')
save_xy(xy=(X_test, y_test), path=DATA_DIR, suffix='test_full')

In [39]:
# Display the contents of DATA_DIR as the cell output, e.g. to check that the
# X_*_full.csv / y_*_full.csv files requested from save_xy are present.
os.listdir(DATA_DIR)

['tensor2000.pkl.bz2',
 'tensor2001.pkl.bz2',
 'tensor2002.pkl.bz2',
 'tensor2003.pkl.bz2',
 'tensor2004.pkl.bz2',
 'tensor2005.pkl.bz2',
 'tensor2006.pkl.bz2',
 'tensor2007.pkl.bz2',
 'tensor2008.pkl.bz2',
 'tensor2009.pkl.bz2',
 'tensor2010.pkl.bz2',
 'tensor2011.pkl.bz2',
 'tensor2012.pkl.bz2',
 'tensor2013.pkl.bz2',
 'tensor2014.pkl.bz2',
 'tensor20_2000.pkl.bz2',
 'tensor20_2001.pkl.bz2',
 'tensor20_2002.pkl.bz2',
 'tensor20_2003.pkl.bz2',
 'tensor20_2004.pkl.bz2',
 'tensor20_2005.pkl.bz2',
 'tensor20_2006.pkl.bz2',
 'tensor20_2007.pkl.bz2',
 'tensor20_2008.pkl.bz2',
 'tensor20_2009.pkl.bz2',
 'tensor20_2010.pkl.bz2',
 'tensor20_2011.pkl.bz2',
 'tensor20_2012.pkl.bz2',
 'tensor20_2013.pkl.bz2',
 'weights.bestNN.hdf5',
 'X_big_test.csv',
 'X_big_train.csv',
 'X_big_valid.csv',
 'X_test.csv',
 'X_test_full.csv',
 'X_train.csv',
 'X_train_full.csv',
 'X_valid.csv',
 'X_valid_full.csv',
 'y_big_test.csv',
 'y_big_train.csv',
 'y_big_valid.csv',
 'y_matrix2000.pkl.bz2',
 'y_matrix2001.