In [46]:
#from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from google.cloud import storage
import pandas as pd
import numpy as np
from io import StringIO

In [49]:
def split_data(dest_bucket, dest_file, split_time, preprocess=False,
               train_dest_file="train.csv", test_dest_file="test.csv"):
    """Split an indexed CSV stored in GCS into train/test CSVs at ``split_time``.

    The CSV at ``gs://<dest_bucket>/<dest_file>`` is read with its first column
    as the index, sorted by that index and cut at ``split_time``. Both splits
    are uploaded back to the same bucket as CSV objects.

    Args:
        dest_bucket: Name of the GCS bucket holding the input file; the two
            output files are written to this bucket as well.
        dest_file: Object name of the input CSV inside ``dest_bucket``.
        split_time: Index label at which to cut. Label slicing with ``.loc``
            is inclusive on both ends, so a row whose index equals
            ``split_time`` appears in both the train and the test split.
        preprocess: If true, a ``MinMaxScaler`` is fitted on the train split
            only and then applied to both splits; otherwise the raw values
            are written unchanged.
        train_dest_file: Object name for the uploaded train split.
        test_dest_file: Object name for the uploaded test split.

    Returns:
        The tuple ``(dest_bucket, train_dest_file, test_dest_file)``.
    """
    # Read in the data from the GCS bucket and order it by its index.
    data_loc = "gs://{0}/{1}".format(dest_bucket, dest_file)
    data = pd.read_csv(data_loc, index_col=0).sort_index()

    # Split on split_time. The train slice is open-ended on the left: taking
    # the lower bound from the first row of the *unsorted* file would drop
    # every row that sorts before it whenever the file is not already ordered.
    train_data = data.loc[:split_time]
    test_data = data.loc[split_time:]

    # Preprocess the data (if applicable). The scaler is fitted on the train
    # split only, and the same fitted scaler is applied to the test split.
    if preprocess:
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(train_data)
        X_test = scaler.transform(test_data)
    else:
        X_train = train_data.to_numpy()
        X_test = test_data.to_numpy()

    # The frames are rebuilt from bare arrays, so the written CSVs carry a
    # fresh 0..n-1 index rather than the original index labels.
    scaled_train_data = pd.DataFrame(X_train, columns=data.columns)
    scaled_test_data = pd.DataFrame(X_test, columns=data.columns)

    # Save the data splits to the GCS bucket. The bucket is resolved once and
    # the CSV text is handed over directly instead of via a text-mode buffer.
    bucket = storage.Client().get_bucket(dest_bucket)
    bucket.blob(train_dest_file).upload_from_string(
        scaled_train_data.to_csv(), content_type='text/csv')
    bucket.blob(test_dest_file).upload_from_string(
        scaled_test_data.to_csv(), content_type='text/csv')

    # Return the location of the new data splits
    return (dest_bucket, train_dest_file, test_dest_file)
    

In [50]:
# Parameters for this run: the bucket and object to read, the index label to
# split on, and whether the splits should be min-max scaled before upload.
dest_bucket = "rrusson-kubeflow-test"
dest_file = "raw_data_v2.csv"
split_time = "2004-02-15 12:52:39"
preprocess = True

# Last expression in the cell, so the returned tuple is shown as cell output.
split_data(
    dest_bucket=dest_bucket,
    dest_file=dest_file,
    split_time=split_time,
    preprocess=preprocess,
)

('rrusson-kubeflow-test', 'train.csv', 'test.csv')

In [62]:
import time

# Print a "file_<n>" string whose suffix is the current perf_counter reading.
# NOTE(review): perf_counter has no defined reference point, so the suffix is
# not a wall-clock timestamp - confirm that is acceptable for its intended use.
counter_value = time.perf_counter()
print("file_{}".format(counter_value))

file_3735.047098667
