In [4]:
import numpy as np
import pandas as pd

In [39]:
def evaluate(actual, pred):
    """Computes the signed R value of the predictions given the actual values.

    R squared is computed as 1 - SS_res / SS_tot, and the returned value is
    sign(R^2) * sqrt(|R^2|), so predictions that do worse than predicting
    the mean of actual yield a negative score.

    Both actual and pred should be of the same length where each index
    corresponds to the actual and predicted value of the same row of data.
    Values are paired strictly by position; any pandas index on the inputs
    is ignored.

    Args:
        actual: A sequence (list, NumPy array or pandas Series) of actual
            values.
        pred: A sequence (list, NumPy array or pandas Series) of predicted
            values.

    Returns:
        Signed R value of the predictions given the actual values. The
        result is not finite when every actual value is identical, because
        the total sum of squares is then zero.

    Raises:
        AssertionError: If actual and pred differ in length.
    """
    assert len(actual) == len(pred), "actual and pred must have equal length"
    # Coerce to float arrays: plain Python lists do not support element-wise
    # subtraction, and pandas objects would be aligned by index label rather
    # than paired by position.
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    nume = np.sum(np.square(actual - pred))
    denom = np.sum(np.square(actual - np.mean(actual)))
    r_sq = 1 - nume / denom
    return np.sign(r_sq) * np.sqrt(np.abs(r_sq))

In [41]:
def train_model(train_df):
    """Builds a model from the training data frame.

    This is currently a placeholder: no training is performed and the
    returned model is always None.

    Args:
        train_df: DataFrame object holding the training data.

    Returns:
        The trained model (None in this placeholder implementation).
    """
    # Placeholder until a real training routine is plugged in.
    model = None
    return model

In [42]:
def run_infer(model, features, target):
    """Inference hook: meant to fill in the target from the features and
    then update the model.

    This is currently a placeholder: the model and features are not used
    and the target is handed back unchanged.

    Args:
        model: A model for prediction and then for updating with new data.
        features: DataFrame object with the new rows to predict on and then
                  to update the existing model with.
        target: DataFrame object containing id and the y value to be
                filled in.

    Returns:
        The target object that was passed in.
    """
    # Placeholder until real prediction / model updating is plugged in.
    populated = target
    return populated

In [40]:
# A cell to perform local E2E training and prediction that simulates
# the KaggleGym behavior.
def localE2E(data_path="train.h5", split_row=806298):
    """A function to simulate KaggleGym behavior of predicting per timestep.

    The "train" frame stored in the HDF5 file is split by row position:
    rows before split_row are passed to train_model, and the remaining rows
    are replayed through run_infer one batch at a time, where a batch is a
    run of consecutive rows sharing the same timestamp. The y values
    returned for every batch are collected in order and scored against the
    held-out y column with evaluate.

    Args:
        data_path: Path of the HDF5 file containing the "train" DataFrame.
        split_row: Row position at which the frame is split into the
            training part (before) and the validation part (from it on).

    Returns:
        The score returned by evaluate for the validation rows.
    """
    # Here's an example of loading the CSV using Pandas's built-in HDF5 support:
    with pd.HDFStore(data_path, "r") as train:
        # Note that the "train" dataframe is the only dataframe in the file
        df = train.get("train")
        train_df = df.iloc[:split_row]
        valid_df = df.iloc[split_row:]

    # NOTE: Training goes here.
    model = train_model(train_df)
    predictions = []

    # Locate the y column by name instead of assuming it is the last
    # column, so the true target can never slip into the features.
    y_pos = valid_df.columns.get_loc('y')
    feature_cols = valid_df.columns.delete(y_pos)

    def flush(rows, targets):
        """Runs inference on one buffered batch and records its predictions."""
        # Nothing buffered yet (first row, or an empty validation set).
        if not rows:
            return
        # NOTE: Inference and updating goes here.
        new_targets = run_infer(
            model,
            pd.DataFrame(rows, columns=feature_cols),
            pd.DataFrame(targets, columns=['id', 'y']))
        predictions.extend(list(new_targets['y']))

    curr_timestamp = None
    curr_data = []
    curr_target = []
    for row in valid_df.itertuples(index=False):
        # If it is a new timestamp, predict on what was buffered so far.
        if row.timestamp != curr_timestamp:
            flush(curr_data, curr_target)
            # Reset arrays.
            curr_data = []
            curr_target = []
            curr_timestamp = row.timestamp
        # Keep every column except y as features.
        values = list(row)
        del values[y_pos]
        curr_data.append(values)
        # The y slot starts at 0 and is meant to be filled in by run_infer.
        curr_target.append([row.id, 0])

    # Infer for the last timestamp.
    flush(curr_data, curr_target)
    return evaluate(valid_df['y'], predictions)

print(localE2E())

-0.0083508942268


In [None]:
# Read data from Kagglegym
import kagglegym

def kaggleE2E():
    """Runs Kaggle training and predictions E2E.

    Creates the Kagglegym environment, trains a model with train_model on
    the training frame of the first observation, then repeatedly passes
    run_infer's output to env.step until the environment reports it is
    done. The info value from the final step is printed.
    """
    # Create the environment; the first observation carries the train data.
    gym_env = kagglegym.make()
    obs = gym_env.reset()
    model = train_model(obs.train)

    # Predict using Kagglegym until the environment signals completion.
    finished = False
    while not finished:
        step_predictions = run_infer(model, obs.features, obs.target)
        # reward is unpacked but not used.
        obs, reward, finished, info = gym_env.step(step_predictions)
    print(info)

kaggleE2E()