In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor


import kagglegym

%matplotlib inline

In [None]:
# Load the complete training frame from the HDF5 file (the input is HDF5,
# not CSV) using pandas' HDFStore support.
# NOTE: inside this cell `train` is the HDFStore handle; a later cell rebinds
# `train` to the kagglegym training frame, while `df` keeps the frame read here.
import pandas as pd

with pd.HDFStore("../input/train.h5", "r") as train:
    # Note that the "train" dataframe is the only dataframe in the file
    df = train.get("train")

In [None]:
# Create an environment
env = kagglegym.make()

# Get first observation
observation = env.reset()

# Get the train dataframe
# NOTE: this rebinds `train`, which the loading cell used for the HDFStore
# handle; from here on `train` is the frame exposed by the observation.
train = observation.train

In [None]:
def getidtraindata(instrument, data=None):
    """Return the rows that belong to a single instrument id.

    Parameters
    ----------
    instrument : scalar
        Value compared against the ``id`` column.
    data : pandas.DataFrame, optional
        Frame to filter. When omitted, the notebook-level ``train`` frame is
        used, which keeps the original one-argument call working unchanged.

    Returns
    -------
    pandas.DataFrame
        All columns of the rows whose ``id`` equals ``instrument``.
    """
    if data is None:
        # Fall back to the notebook-global frame only when no frame is given.
        data = train
    return data.loc[data.id == instrument, :]

# NOTE: despite the variable name, this selects the rows of instrument id 11.
train10 = getidtraindata(11)

In [None]:
def scale(values):
    """Min-max scale ``values`` into the range [0, 1].

    Parameters
    ----------
    values : numpy.ndarray or pandas.Series
        One-dimensional numeric container exposing ``min()`` and ``max()``.

    Returns
    -------
    list
        ``(value - min) / (max - min)`` for every element, in input order.
        An empty input yields an empty list. When all elements are equal the
        range is zero and each result is the outcome of a 0/0 division (NaN
        for numpy scalars), exactly as element-wise division would produce.
    """
    if len(values) == 0:
        # Nothing to scale; also avoids calling min()/max() on an empty array.
        return []
    # Compute the extremes once instead of once per element.
    lowest = values.min()
    span = values.max() - lowest
    return [(value - lowest) / span for value in values]

def scale_all_features(data):
    """Min-max scale every feature column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``timestamp`` column; the columns ``id``, ``timestamp``
        and ``y`` are not scaled.

    Returns
    -------
    pandas.DataFrame
        A frame that starts with the unmodified ``timestamp`` column and then
        holds one scaled column (named ``str(col)``) per remaining column.
    """
    scaled_data = pd.DataFrame(data.timestamp)
    # DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
    # (column name, Series) pairs and is available in older versions as well.
    for col, old_values in data.items():
        if col not in ['id', 'timestamp', 'y']:
            scaled_data[str(col)] = scale(old_values)
    return scaled_data

# Scale the feature columns of the selected instrument and preview the result.
scaled_train10 = scale_all_features(train10)
scaled_train10.head()

In [None]:
# Plot every scaled feature of the selected instrument against its timestamp.
plt.figure()
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
for col, values in scaled_train10.items():
    if col not in ['id','timestamp','y']:
        plt.plot(scaled_train10.timestamp, values, '.')
plt.xlabel('timestamp')
plt.ylabel('scaled feature values')

In [None]:
def find_groups(features, data, limit):
    """Greedily partition feature names into groups of highly correlated columns.

    The first name still unassigned acts as a seed. Every unassigned feature
    whose Pearson correlation with the seed, rounded to two decimals, is at
    least ``limit`` joins the seed's group (the seed matches itself). The
    group's members are then taken out of the pool and the next unassigned
    name becomes the seed.

    Parameters
    ----------
    features : iterable of column labels
        Candidate feature names. The argument is not modified.
    data : pandas.DataFrame
        Frame providing ``data[name].values`` for every feature name.
    limit : float
        Minimum (rounded) correlation with the seed to join its group.

    Returns
    -------
    (groups, singles) : tuple of lists
        ``groups`` holds the groups with more than one member; ``singles``
        holds the seeds that matched only themselves. A seed whose correlation
        with itself is below ``limit`` (e.g. NaN for a constant column)
        appears in neither list.
    """
    # Work on a private copy: removing items from the list being iterated
    # makes the loop skip the element after every processed seed, and it
    # also destroys the caller's list.
    remaining = list(features)
    groups = []
    singles = []
    while remaining:
        col = remaining[0]
        group = []
        for feature in remaining:
            coeff = np.corrcoef(data[col].values, data[feature].values)[0, 1]
            coeff = np.round(coeff, decimals=2)
            if coeff >= limit:
                group.append(feature)
        if len(group) > 1:
            groups.append(group)
        elif len(group) == 1:
            singles.append(col)
        # Always retire the seed, even when it matched nothing, so the loop
        # is guaranteed to terminate.
        assigned = set(group)
        assigned.add(col)
        remaining = [name for name in remaining if name not in assigned]
    return groups, singles

In [None]:
# Candidate features are all scaled columns except the id/timestamp/target names.
features = [col for col in scaled_train10.columns if col not in ['id','timestamp','y']]
# First pass: group features using a correlation threshold of 0.90.
groups, singles = find_groups(features, scaled_train10, 0.90)
  

In [None]:
# Second pass: regroup the features left ungrouped at 0.90 with a looser
# threshold of 0.80.
softgroups, residuals = find_groups(singles, scaled_train10, 0.80)
        

In [None]:
def show_group_dynamic(group, data):
    """Plot each column named in ``group`` against ``data.timestamp``.

    A new figure is opened and every member is drawn as a series of dots.
    """
    plt.figure()
    for name in group:
        plt.plot(data.timestamp, data.loc[:, name], '.')

In [None]:
# Plot the members of the first correlated group over time.
# NOTE(review): raises IndexError if the 0.90 pass produced no groups.
show_group_dynamic(groups[0], scaled_train10)

In [None]:
# Display the feature names that make up the first group.
groups[0]

In [None]:
def get_group_mean_values(group, data):
    """Return the row-wise mean over the columns named in ``group``.

    Parameters
    ----------
    group : sequence of column labels
        Must contain at least one name. A name listed more than once is
        counted once.
    data : pandas.DataFrame
        Frame holding the named columns.

    Returns
    -------
    numpy.ndarray
        One value per row of ``data``. Missing values are skipped by pandas'
        default ``mean`` behaviour, so a row is NaN only when every member is
        missing in that row.

    Raises
    ------
    IndexError
        If ``group`` is empty.
    """
    if len(group) == 0:
        raise IndexError("group must contain at least one feature name")
    # Select the columns directly instead of rebuilding a table column by
    # column; dict.fromkeys drops repeated names while keeping their order.
    members = list(dict.fromkeys(group))
    return data[members].mean(axis=1).values

In [None]:
def show_mean_feature_groups(groups, data, y_values):
    """Plot the mean series of every feature group together with ``y_values``.

    Parameters
    ----------
    groups : sequence of sequences of column labels
        Each entry is passed to ``get_group_mean_values``.
    data : pandas.DataFrame
        Frame providing the group columns and the ``timestamp`` column that
        is used as the x-axis.
    y_values : array-like
        Series drawn on the same axes; must have one value per row of ``data``.
    """
    plt.figure()
    # Take the x-axis from the frame that was passed in rather than from a
    # notebook-level variable, so the function works for any instrument.
    timestamps = data.timestamp
    for group in groups:
        plt.plot(timestamps, get_group_mean_values(group, data), '.-')
    plt.plot(timestamps, y_values)
        

In [None]:
# NOTE(review): y_scaled is computed here but not used anywhere in this
# notebook; the plot below receives the unscaled frame and the unscaled y
# values -- confirm whether the scaled versions were intended.
y_scaled = scale(train10.y.values)
show_mean_feature_groups(groups, train10, train10.y.values)

In [None]:
def get_y_groups_correlations(y, groups, data):
    """Correlate ``y`` with the mean series of each feature group.

    Returns a list with one Pearson coefficient per entry of ``groups``, in
    the same order, each taken from the off-diagonal of ``np.corrcoef``.
    """
    return [
        np.corrcoef(y, get_group_mean_values(members, data))[0, 1]
        for members in groups
    ]

In [None]:
# Target values of the selected instrument, as a plain numpy array.
y = train10.y.values
# Correlation between the target and each group's mean series (unscaled data).
corr_y_groups = get_y_groups_correlations(y, groups, train10)
corr_y_groups

In [None]:
# Display the groups found with the 0.90 threshold.
groups

In [None]:
def prepare_features(groups, data):
    """Build the model input matrix from group means.

    The result has one row per entry of ``data.timestamp`` and one column per
    group; column ``k`` holds the row-wise mean of the features in
    ``groups[k]``.
    """
    n_rows = len(data.timestamp)
    n_groups = len(groups)
    matrix = np.zeros(shape=(n_rows, n_groups))
    for column, members in enumerate(groups):
        matrix[:, column] = get_group_mean_values(members, data)
    return matrix

# Training inputs: one column of group means per group, one row per train10 row.
X = prepare_features(groups, train10)

In [None]:
# One hidden layer of 50 logistic units, optimised with L-BFGS.
# random_state fixes the weight initialisation so that re-running the
# notebook from a fresh kernel reproduces the same fitted model.
net = MLPRegressor(hidden_layer_sizes=(50,), solver="lbfgs", activation="logistic", random_state=0)

In [None]:
# Fit the network on the group-mean features against the instrument's target.
net.fit(X, y)

In [None]:
# Number of distinct timestamps available for the selected instrument.
len(train10.timestamp.unique())


In [None]:
# Preview the first rows of the training frame taken from the observation.
train.head()

In [None]:
# Look up, in the full frame read from train.h5, the row of id 11 whose
# timestamp equals (number of distinct timestamps in scaled_train10) + 1.
# NOTE(review): this presumes the instrument's timestamps are contiguous and
# that count + 1 is the timestamp of the first observation -- confirm, e.g.
# against observation.features.timestamp.
df[ (df["id"]==11) & (df["timestamp"]==(len(scaled_train10.timestamp.unique()) + 1))]

In [None]:
def get_target_features(data, groups, instrument):
    """Build the group-mean feature vector of one instrument from one observation.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``id`` column and the feature columns named in
        ``groups``; it must hold exactly one row for ``instrument``.
    groups : sequence of sequences of column labels
        Feature groups; one output value is produced per group.
    instrument : scalar
        Value compared against the ``id`` column.

    Returns
    -------
    numpy.ndarray
        One mean per group, in group order. Missing feature values are
        skipped (pandas' default ``mean``), matching how the training
        features are averaged; a group is NaN only when all of its members
        are missing.

    Raises
    ------
    ValueError
        If ``data`` does not contain exactly one row for ``instrument``.
    """
    rows = data.loc[data.id == instrument]
    if len(rows) != 1:
        # A scalar per feature is only defined for exactly one matching row.
        raise ValueError(
            "expected exactly one row for instrument %r, found %d"
            % (instrument, len(rows))
        )
    X = []
    for group in groups:
        # Row-wise mean over the group's columns; iloc[0] picks the single row.
        X.append(rows[list(group)].mean(axis=1).iloc[0])
    return np.array(X)
            
        
    

In [None]:
# Display the feature groups again before building the prediction input.
groups

In [None]:
# Group-mean feature vector for instrument 11, built from the features of the
# first observation returned by env.reset().
x_p = get_target_features(observation.features, groups, 11)
x_p

In [None]:
# Reshape the 1-D feature vector into a single-row 2-D array before predicting.
# NOTE: this rebinds x_p, so re-running only this cell reshapes an already
# reshaped array (harmless here, since reshape(1, -1) yields the same shape).
x_p = x_p.reshape(1,-1)
y_p = net.predict(x_p)
y_p

In [None]:
# The `y` value stored in the full frame for id 11 at timestamp
# (number of distinct timestamps in scaled_train10) + 1, presumably the true
# target to compare with y_p.
# NOTE(review): the timestamp arithmetic assumes contiguous timestamps and
# that count + 1 matches the observation that was predicted -- confirm.
perfect_y = df[ (df["id"]==11) & (df["timestamp"]==(len(scaled_train10.timestamp.unique()) + 1))].y
perfect_y