In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.preprocessing import Imputer, RobustScaler

In [2]:
def evaluate(actual, pred):
    """Computes the signed R value of the predictions given the actual values.

    The score is derived from the coefficient of determination
    R^2 = 1 - SS_res / SS_tot and returned as sign(R^2) * sqrt(|R^2|), so
    predictions that do worse than always predicting the mean of actual
    yield a negative value.

    Both inputs are converted to float64 NumPy arrays and compared position
    by position. This makes plain Python lists work and ignores any pandas
    index labels, so a Series and a list (or two Series with different
    indexes) are matched row by row rather than label by label.

    The score is undefined when all actual values are identical, because the
    total sum of squares in the denominator is then zero.

    Args:
        actual: A sequence (list, array or Series) of actual values.
        pred: A sequence of predicted values, in the same order as actual.

    Returns:
        R value of the predictions given the actual values.
    """
    # Positional arrays: avoids list-minus-list TypeErrors and pandas
    # label alignment silently turning mismatched rows into NaN.
    actual = np.asarray(actual, dtype=np.float64)
    pred = np.asarray(pred, dtype=np.float64)
    assert len(actual) == len(pred)

    residual_ss = np.sum(np.square(actual - pred))
    total_ss = np.sum(np.square(actual - np.mean(actual)))
    r_sq = 1 - residual_ss / total_ss
    # Keep the sign of R^2 so that worse-than-mean models score negative.
    return np.sign(r_sq) * np.sqrt(np.abs(r_sq))

In [3]:
class Model(object):
    """A generic Model interface to encapsulate modeling and data processing.

    The wrapper strips the bookkeeping columns (id, timestamp and, for
    training, y) from a data frame, imputes missing values, optionally
    normalizes the features and then delegates to the underlying model.

    Attributes:
        model_: A sklearn model with two function interfaces - fit, predict.
        imputer_: A sklearn preprocessing model to perform imputation for
            missing values, or None to fill missing values with 0.
        normalizer_: A sklearn preprocessing model to normalize feature
            values, or None to skip normalization.
        columns_: Column names corresponding to the features, in the order
            seen during fit. Only set once fit has been called.
    """
    def __init__(self, model=None, imputer=None, normalizer=None):
        """Initialize the Model class with the given model.

        Args:
            model: A sklearn model.
            imputer: Transformer for missing values.
            normalizer: Normalizer for the feature values.
        """
        self.model_ = model
        self.imputer_ = imputer
        self.normalizer_ = normalizer

    def _preprocess(self, X, fit):
        """Applies imputation and normalization to a feature frame.

        Args:
            X: DataFrame holding only the feature columns.
            fit: If True the transformers are fitted on X before being
                applied (training); otherwise they are only applied.

        Returns:
            The transformed features, ready to be passed to the model.
        """
        # Imputes the missing value.
        if self.imputer_ is not None:
            if fit:
                X = self.imputer_.fit_transform(X)
            else:
                X = self.imputer_.transform(X)
        else:
            # Fill with 0 if there is no imputer.
            X = X.fillna(0)

        # Normalizes the feature values.
        if self.normalizer_ is not None:
            if fit:
                X = self.normalizer_.fit_transform(X)
            else:
                X = self.normalizer_.transform(X)
        return X

    def fit(self, df):
        """Fit the underlying model with the given data frame.

        Args:
            df: Dataframe object containing the underlying data. It has three
                special columns - id, y (label), timestamp.
        """
        # Drops the three special columns.
        X_train = df.drop(['id', 'y', 'timestamp'], axis=1)
        # Remember the feature layout so predict can reproduce it.
        self.columns_ = X_train.columns

        X_train = self._preprocess(X_train, fit=True)
        self.model_.fit(X_train, df['y'])

    def predict(self, df):
        """Infers the predictions for the given df features.

        When the model has been fitted through fit, the features are selected
        by name in the training column order, so extra columns are ignored
        and a different column order in df does not matter.

        Args:
            df: Dataframe object containing the data features to be predicted. It has
                two special columns - id, timestamp.

        Returns:
            The predictions for each of the row in df, in the same order as given.

        Raises:
            ValueError: If df lacks a feature column that was seen during fit.
        """
        X_pred = df.drop(['id', 'timestamp'], axis=1)

        # columns_ only exists after fit; without it keep the frame as given.
        columns = getattr(self, 'columns_', None)
        if columns is not None:
            missing = [col for col in columns if col not in X_pred.columns]
            if missing:
                raise ValueError(
                    "Missing feature columns for prediction: %s" % missing)
            # Positional models need the exact training column order.
            X_pred = X_pred[list(columns)]

        X_pred = self._preprocess(X_pred, fit=False)
        return self.model_.predict(X_pred)

    def visualize(self):
        """Visualize the underlying learned model.

        The base implementation does nothing; subclasses override it.
        """
        pass

In [4]:
class LinearModel(Model):
    """Wrapper around sklearn.linear_model.

    Adds a textual visualization that lists the learned intercept and the
    coefficient attached to every feature column.
    """
    def visualize(self):
        """Prints the bias term followed by one line per feature coefficient.

        Raises:
            AssertionError: If the number of coefficients does not match the
                number of feature columns recorded during fit.
        """
        # Meta-estimators such as RANSACRegressor do not expose coef_
        # themselves; the fitted linear model lives on `estimator_`.
        linear = self.model_
        if not hasattr(linear, 'coef_') and hasattr(linear, 'estimator_'):
            linear = linear.estimator_

        assert len(self.columns_) == len(linear.coef_), \
            "# columns: %d vs # coeff: %d" % (len(self.columns_), len(linear.coef_))
        print('Bias: ', linear.intercept_)
        for column, coef in zip(self.columns_, linear.coef_):
            print(column, ': ', coef)

In [5]:
def train_model(train_df):
    """Builds a fresh LinearModel and fits it on the given data frame.

    The model combines a LinearRegression estimator with median imputation
    of missing values and RobustScaler normalization of the features.

    Args:
        train_df: DataFrame object corresponding to training data.

    Returns:
        The LinearModel after it has been fitted on train_df.
    """
    regressor = LinearRegression()
    median_imputer = Imputer(strategy='median')
    robust_scaler = RobustScaler()

    fitted = LinearModel(
        model=regressor, imputer=median_imputer, normalizer=robust_scaler)
    fitted.fit(train_df)
    return fitted

In [6]:
def run_infer(model, features, target):
    """Populates the target's y column with the model's predictions.

    The model is only used for prediction here; it is not refit. The target
    frame is modified in place and also returned.

    Args:
        model: A model exposing predict(features).
        features: DataFrame object corresponding to new data to predict on.
        target: DataFrame object containing id and the y value to be updated
                with.

    Returns:
        The same target object, with its y column set to the predictions.

    Raises:
        AssertionError: If features and target differ in length or their id
            columns do not match row by row.
    """
    # Verify that the features and targets are aligned. The ids are compared
    # by position: comparing the two Series directly raises when their index
    # labels differ, even if the rows line up.
    assert len(features) == len(target)
    assert np.array_equal(np.asarray(features['id']), np.asarray(target['id']))

    # Set the target value to the one given.
    target['y'] = model.predict(features)
    return target

In [7]:
# A cell to perform local E2E training and prediction that simulates
# the KaggleGym behavior.
def localE2E(run_validation=True, visualize_model=False,
             data_path="train.h5", split_index=806298):
    """A function to simulate KaggleGym behavior of predicting per timestep.

    Loads the "train" frame from an HDF5 store, trains on the first
    split_index rows and, if requested, predicts the remaining rows one
    timestamp at a time before scoring all predictions with evaluate.

    Args:
        run_validation: If False, stop after training and return 0.
        visualize_model: If True, call visualize() on the trained model.
        data_path: Path of the HDF5 store holding the "train" frame.
        split_index: Row position separating training rows (before it) from
            validation rows (from it onwards).

    Returns:
        0 when run_validation is False, otherwise the value returned by
        evaluate for the validation labels and the predictions.
    """
    # Load the data frame using Pandas's built-in HDF5 support.
    with pd.HDFStore(data_path, "r") as train:
        # Note that the "train" dataframe is the only dataframe in the file
        df = train.get("train")
        train_df = df[:split_index]
        valid_df = df[split_index:]

    # NOTE: Training goes here.
    model = train_model(train_df)
    if visualize_model:
        model.visualize()

    if not run_validation:
        return 0

    # The row slicing below drops the label by position, so make the
    # "y is the last column" assumption explicit.
    assert valid_df.columns[-1] == 'y', "expected 'y' to be the last column"
    feature_columns = valid_df.columns[:-1]

    def infer_batch(rows, targets):
        """Predicts one timestamp's worth of rows and returns the y values."""
        # NOTE: Inference and updating goes here.
        new_targets = run_infer(
            model,
            pd.DataFrame(rows, columns=feature_columns),
            pd.DataFrame(targets, columns=['id', 'y']))
        return list(new_targets['y'])

    # Validation goes here.
    predictions = []
    curr_timestamp = 0
    curr_data = []
    curr_target = []
    for row in valid_df.itertuples():
        # If it is a new timestamp, predict the rows collected so far.
        if row.timestamp != curr_timestamp:
            if curr_data:
                predictions.extend(infer_batch(curr_data, curr_target))
            # Reset arrays.
            curr_data = []
            curr_target = []
            curr_timestamp = row.timestamp
        # Drop the first (index) and last (y) into features.
        curr_data.append(list(row)[1:-1])
        # Placeholder y of 0; run_infer overwrites it with the prediction.
        curr_target.append([row.id, 0])

    # Infer for the last timestamp (nothing to do for an empty validation set).
    if curr_data:
        predictions.extend(infer_batch(curr_data, curr_target))

    return evaluate(valid_df['y'], predictions)

# Run the local end-to-end simulation with validation and model
# visualization enabled, and print the value it returns.
print(localE2E(run_validation=True, visualize_model=True)) 

Bias:  0.00024932422384
derived_0 :  1.24728870099e-59
derived_1 :  0.0
derived_2 :  9.81129486803e-77
derived_3 :  -2.14049726543e-76
derived_4 :  -2.85539287609e-71
fundamental_0 :  4.94684807344e-77
fundamental_1 :  -6.47728069998e-67
fundamental_2 :  4.62102330091e-76
fundamental_3 :  2.50318867411e-77
fundamental_5 :  -3.3155452102e-77
fundamental_6 :  1.33367145396e-76
fundamental_7 :  2.41974940588e-74
fundamental_8 :  -3.41603944235e-78
fundamental_9 :  5.24742428328e-77
fundamental_10 :  -5.39733151988e-77
fundamental_11 :  8.09948403311e-76
fundamental_12 :  -1.91109606675e-74
fundamental_13 :  -8.92350385487e-77
fundamental_14 :  6.8232319373e-75
fundamental_15 :  -1.78399085607e-76
fundamental_16 :  1.02871276072e-76
fundamental_17 :  3.65881442347e-62
fundamental_18 :  6.0172080221e-76
fundamental_19 :  -3.82617813835e-77
fundamental_20 :  -8.57868114968e-76
fundamental_21 :  -5.28136739657e-77
fundamental_22 :  8.42783304228e-77
fundamental_23 :  1.20469180215e-74
fundame

In [None]:
# Read data from Kagglegym
import kagglegym

def kaggleE2E():
    """Trains on the Kagglegym training data and predicts until the env is done.

    The model is trained once on the training frame of the first
    observation. Each step then fills the observation's target with
    predictions and submits it to the environment; the reward of each step
    is not used. The info object of the final step is printed.
    """
    # Create the environment and fetch the first observation.
    gym_env = kagglegym.make()
    obs = gym_env.reset()

    # Train once on the data that comes with the first observation.
    fitted_model = train_model(obs.train)

    # Predict step by step until the environment reports completion.
    finished = False
    while not finished:
        filled_target = run_infer(fitted_model, obs.features, obs.target)
        obs, _reward, finished, step_info = gym_env.step(filled_target)
    print(step_info)

# Kick off the Kagglegym end-to-end run; this needs the kagglegym module
# imported at the top of this cell.
kaggleE2E()