In [49]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer, MaxAbsScaler, RobustScaler, StandardScaler

In [2]:
def evaluate(actual, pred):
    """Computes the R value of the predictions given the actual values.

    The R value is the signed square root of the coefficient of
    determination: sign(R^2) * sqrt(|R^2|), with
    R^2 = 1 - sum((actual - pred)^2) / sum((actual - mean(actual))^2).

    Both actual and pred should be of the same length where each index
    corresponds to the actual and predicted value of the same row of data.
    Values are paired by position; any pandas index on the inputs is ignored.

    Args:
        actual: A list, array or Series of actual values.
        pred: A list, array or Series of predicted values.

    Returns:
        R value of the predictions given the actual values. If every actual
        value is identical the denominator is zero and the result is not
        finite.
    """
    assert len(actual) == len(pred)
    # Work on plain float arrays: Python lists do not support subtraction,
    # and pandas objects would otherwise be aligned by index label instead
    # of by position.
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    nume = np.sum(np.square(actual - pred))
    denom = np.sum(np.square(actual - np.mean(actual)))
    r_sq = 1 - nume / denom
    return np.sign(r_sq) * np.sqrt(np.abs(r_sq))

In [36]:
class Model(object):
    """A generic Model interface to encapsulate modeling and data processing.

    Attributes:
        model_: A sklearn model with two function interfaces - fit, predict.
        imputer_: A sklearn preprocessing model to perform imputation for missing values.
        normalizer_: A sklearn preprocessing model to normalize feature values.
        columns_: List of column names corresponding to the features. None
            until fit() has been called.
    """
    def __init__(self, model=None, imputer=None, normalizer=None):
        """Initialize the Model class with the given model.

        Args:
            model: A sklearn model.
            imputer: Transformer for missing values. If None, missing values
                are filled with 0.
            normalizer: Normalizer for the feature values. If None, no
                normalization is applied.
        """
        self.model_ = model
        self.imputer_ = imputer
        self.normalizer_ = normalizer
        # Populated by fit(); predict() uses it to pick the feature columns.
        self.columns_ = None

    def _preprocess(self, X, fit):
        """Imputes then normalizes X; fits the transformers first when fit is True."""
        # Compare against None instead of relying on truthiness: some
        # estimators define __len__, which makes `if estimator:` unreliable.
        if self.imputer_ is not None:
            if fit:
                X = self.imputer_.fit_transform(X)
            else:
                X = self.imputer_.transform(X)
        else:
            # Fill with 0 if there is no imputer.
            X = X.fillna(0)

        if self.normalizer_ is not None:
            if fit:
                X = self.normalizer_.fit_transform(X)
            else:
                X = self.normalizer_.transform(X)
        return X

    def fit(self, df):
        """Fit the underlying model with the given data frame.

        Args:
            df: Dataframe object containing the underlying data. It has three
                special columns - id, y (label), timestamp.
        """
        # Drops the three special columns.
        X_train = df.drop(['id', 'y', 'timestamp'], axis=1)
        self.columns_ = X_train.columns

        X_train = self._preprocess(X_train, fit=True)

        Y_train = df['y']
        self.model_.fit(X_train, Y_train)

    def predict(self, df):
        """Infers the predictions for the given df features.

        Args:
            df: Dataframe object containing the data features to be predicted. It has
                two special columns - id, timestamp. Extra columns (for
                example y) are ignored once the model has been fitted.

        Returns:
            The predictions for each of the row in df, in the same order as given.
        """
        if self.columns_ is not None:
            # Use exactly the columns seen during fit, in the same order, so
            # extra or reordered columns cannot shift the feature matrix.
            X_pred = df[list(self.columns_)]
        else:
            # fit() was never called on this wrapper: fall back to dropping
            # the special columns.
            X_pred = df.drop(['id', 'timestamp'], axis=1)

        X_pred = self._preprocess(X_pred, fit=False)

        return self.model_.predict(X_pred)

    def visualize(self):
        """Visualize the underlying learned model."""
        pass

In [37]:
class LinearRegressionModel(Model):
    """Model whose visualize() prints the learned linear regression weights."""
    def visualize(self):
        """Prints the intercept, then one line per feature column with its coefficient."""
        n_columns = len(self.columns_)
        n_coefs = len(self.model_.coef_)
        # Every feature column must have exactly one coefficient.
        assert n_columns == n_coefs, \
            "# columns: %d vs # coeff: %d" % (n_columns, n_coefs)
        print('Bias: ', self.model_.intercept_)
        for position in range(n_columns):
            print(self.columns_[position], ': ', self.model_.coef_[position])

In [54]:
def train_model(train_df):
    """Builds a linear regression model and fits it on the given data frame.

    Missing values are imputed with the column mean and features are scaled
    with RobustScaler before the regression is fitted.

    Args:
        train_df: DataFrame object corresponding to training data.

    Returns:
        Model trained using the data frame.
    """
    # NOTE(review): Imputer was removed from sklearn.preprocessing in
    # scikit-learn 0.22; newer versions provide sklearn.impute.SimpleImputer.
    regressor = LinearRegression()
    mean_imputer = Imputer(strategy='mean')
    robust_scaler = RobustScaler()

    trained = LinearRegressionModel(
        model=regressor, imputer=mean_imputer, normalizer=robust_scaler)
    trained.fit(train_df)
    return trained

In [6]:
def run_infer(model, features, target):
    """Populates the target's y column with the model's predictions.

    The target frame is modified in place and also returned. The model is
    only used for prediction here; it is not updated with the new data.

    Args:
        model: A model exposing predict(features).
        features: DataFrame object corresponding to new data to predict
                  on. Must contain an id column.
        target: DataFrame object containing id and the y value to be updated
                with.

    Returns:
        The same target object, with y set to the predictions.
    """
    # Verify that the features and targets are aligned. Ids are compared by
    # position on the raw values, because comparing two Series with different
    # index labels raises instead of answering the question.
    assert len(features) == len(target)
    assert (features['id'].values == target['id'].values).all()

    # Overwrite the target values with the model's predictions.
    target['y'] = model.predict(features)
    return target

In [55]:
# A cell to perform local E2E training and prediction that simulates
# the KaggleGym behavior.
def _infer_batch(model, feature_columns, rows, targets):
    """Predicts the rows accumulated for one timestamp.

    Args:
        model: Trained model handed to run_infer.
        feature_columns: Column names matching the values in each row.
        rows: List of per-row value lists (all columns except y).
        targets: List of [id, y] pairs, one per row.

    Returns:
        The predicted y values as a list, in the same order as rows.
    """
    batch_targets = run_infer(
        model,
        pd.DataFrame(rows, columns=feature_columns),
        pd.DataFrame(targets, columns=['id', 'y']))
    return list(batch_targets['y'])


def localE2E(run_validation=True, visualize_model=False,
             data_path="train.h5", train_rows=806298):
    """A function to simulate KaggleGym behavior of predicting per timestep.

    The first train_rows rows of the "train" frame stored in data_path are
    used for training. The remaining rows are replayed one timestamp at a
    time through run_infer, and the collected predictions are scored with
    evaluate.

    Args:
        run_validation: If False, only train (and optionally visualize) and
            return 0 without predicting.
        visualize_model: If True, call model.visualize() after training.
        data_path: Path of the HDF5 file holding the "train" frame.
        train_rows: Number of leading rows used for training; the rest is
            the validation split.

    Returns:
        The R value on the validation rows, or 0 when run_validation is False.

    Raises:
        ValueError: If validation is requested but the split leaves no
            validation rows.
    """
    # Load the data frame using Pandas's built-in HDF5 support.
    with pd.HDFStore(data_path, "r") as train:
        # Note that the "train" dataframe is the only dataframe in the file
        df = train.get("train")
        train_df = df[:train_rows]
        valid_df = df[train_rows:]

    # NOTE: Training goes here.
    model = train_model(train_df)
    if visualize_model:
        model.visualize()

    if not run_validation:
        return 0

    if len(valid_df) == 0:
        raise ValueError(
            "No validation rows left after taking %d training rows" % train_rows)

    # Validation goes here.
    # The positional slicing below relies on y being the last column.
    feature_columns = valid_df.columns[:-1]
    predictions = []
    curr_timestamp = 0
    curr_data = []
    curr_target = []
    for row in valid_df.itertuples():
        # If it is a new timestamp, predict everything gathered so far.
        if row.timestamp != curr_timestamp:
            if curr_data:
                # NOTE: Inference and updating goes here.
                predictions.extend(
                    _infer_batch(model, feature_columns, curr_data, curr_target))
            # Reset arrays.
            curr_data = []
            curr_target = []
            curr_timestamp = row.timestamp
        # Drop the first (index) and last (y) into features.
        curr_data.append(list(row)[1:-1])
        curr_target.append([row.id, 0])

    # Infer for the last timestamp.
    predictions.extend(
        _infer_batch(model, feature_columns, curr_data, curr_target))
    return evaluate(valid_df['y'], predictions)

print(localE2E(run_validation=True, visualize_model=True))

Bias:  -3.84225154579e-05
derived_0 :  0.0151059797238
derived_1 :  -0.00395201371901
derived_2 :  0.00538675428021
derived_3 :  0.00514174909599
derived_4 :  0.0161932535441
fundamental_0 :  0.000501628003804
fundamental_1 :  18126.3066486
fundamental_2 :  -0.101077739328
fundamental_3 :  -0.000103815948183
fundamental_5 :  0.00736938062179
fundamental_6 :  0.000798820447922
fundamental_7 :  0.0115993382782
fundamental_8 :  0.00128767567396
fundamental_9 :  0.000604576797994
fundamental_10 :  0.00331272159282
fundamental_11 :  0.00480294412597
fundamental_12 :  0.00155168714328
fundamental_13 :  0.000965376688291
fundamental_14 :  -0.0311213536972
fundamental_15 :  -0.0335742640051
fundamental_16 :  0.0180404644843
fundamental_17 :  0.00529361001883
fundamental_18 :  -0.0235061447511
fundamental_19 :  0.000579897467811
fundamental_20 :  116.678032663
fundamental_21 :  0.000877126584698
fundamental_22 :  0.0477748186829
fundamental_23 :  -0.0543540821123
fundamental_24 :  2.87277117943

## LinearRegression Results
#### In general, LinearRegression does not perform very well: the best result has an R value of only about -0.002833.

<pre>

|R value   | Model            | Normalization | Features | Use Id | Online | Missing |
|----------|------------------|---------------|----------|--------|--------|---------|
|-0.008351 | Always set to 0  |               |          |        |        |         |
|-0.005666 | Linear regression| None          | Original |   No   |   No   |    0    |
|-0.074787 | Linear regression| None          | Original |   No   |   No   |  mean   |
|-0.060731 | Linear regression| None          | Original |   No   |   No   | median  |
|-0.293456 | Linear regression| normalize=true| Original |   No   |   No   |    0    |
|-0.329960 | Linear regression| normalize=true| Original |   No   |   No   |  mean   |
|-0.002833 | Linear regression| RS            | Original |   No   |   No   |    0    |
|-0.005737 | Linear regression| RS            | Original |   No   |   No   |  mean   |
|-0.002833 | Linear regression| RS            | Original |   No   |   No   | median  |
|-0.345930 | Linear regression| SS            | Original |   No   |   No   |    0    |
|-0.329960 | Linear regression| SS            | Original |   No   |   No   |  mean   |
|-0.348907 | Linear regression| SS            | Original |   No   |   No   | median  |
|-0.345930 | Linear regression| MAS           | Original |   No   |   No   |    0    |
|-0.329960 | Linear regression| MAS           | Original |   No   |   No   |  mean   |
|-0.348907 | Linear regression| MAS           | Original |   No   |   No   | median  |

</pre>

Column descriptions:
- Normalization: Whether we perform normalization on the feature values.
  - RS: RobustScaler
  - SS: StandardScaler
  - MAS: MaxAbsScaler
- Features: The features used by model.
- Use Id: Whether we use the id to train a different model per row.
- Online: Whether we use the eval data to update the model parameters.
- Missing: The value used to replace missing values (0, or the column mean/median).

## Details

For linear regression without any normalization, the coefficient for each feature is very low (around e^-20 to e^-30), probably because the features are not normalized and their values are therefore too large. It also seems like one of the weights (derived_3) is 0. The hypothesis is that the derived features are computed from the fundamental features.

We tried several imputation strategies for NaN values and several normalization techniques.

Normalization:
- Not doing any normalization actually performs relatively well, especially since it is a linear regression. Only RobustScaler performs better.
- normalize=True for the LinearRegression model seems to perform a lot worse than not doing normalization (most likely it is a StandardScaler).
- RobustScaler performs relatively well.
- StandardScaler and MaxAbsScaler perform badly.

Imputation:
- Missing value = 0 is actually fairly good.
- Missing value = "most frequent" needs a really long time for training, not sure why.


In [None]:
# Read data from Kagglegym
import kagglegym

def kaggleE2E():
    """Runs Kaggle training and predictions E2E.

    Trains a model on the training frame of the first observation, then
    keeps submitting predictions to the environment until it reports done,
    and finally prints the info returned by the last step.
    """
    # Create the environment and fetch the first observation.
    gym_env = kagglegym.make()
    observation = gym_env.reset()

    # Train on the data shipped with the first observation.
    model = train_model(observation.train)

    # Predict using Kagglegym, one step at a time.
    done = False
    while not done:
        predictions = run_infer(model, observation.features, observation.target)
        # reward is returned by every step but is not used here.
        observation, reward, done, info = gym_env.step(predictions)
    print(info)

kaggleE2E()