In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline  

# Open the HDF5 store read-only and pull out its single table, keyed "train".
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory.
with pd.HDFStore("C:\\Users\\4126694\\2sig-kaggle\\input\\train.h5", mode="r") as train:
    df = train["train"]

In [None]:
# Standard deviation of every remaining column, grouped by instrument id
# (the target `y` and `timestamp` are dropped first).
# NOTE: despite its name, `means` holds standard deviations here.
means = (
    df.drop(['y', 'timestamp'], axis=1)
      .groupby('id')
      .agg([np.std])
      .reset_index()
)

In [2]:
%matplotlib inline  

In [None]:
ids = df["id"].unique()

# First and last timestamp of every instrument, computed in one grouped pass
# instead of re-filtering the whole frame once per id.
ts_bounds = df.groupby("id")["timestamp"].agg(["min", "max"])

ids_in = {}   # id -> (first timestamp, last timestamp)
ts_lens = {}  # id -> last timestamp - first timestamp
for x in ids:
    first = ts_bounds.at[x, "min"]
    last = ts_bounds.at[x, "max"]
    # Keep only instruments whose history lies strictly inside (100, 1812).
    # NOTE(review): presumably these bounds mark the edges of the dataset's
    # timestamp range — confirm and lift into named constants.
    if first > 100 and last < 1812:
        ids_in[x] = (first, last)
        # Span of the series; the previous max - max was always zero.
        ts_lens[x] = last - first

# Preview the ten smallest ids that passed the filter.
for k, v in sorted(ids_in.items())[:10]:
    print("id {} in [{},{}]".format(k, v[0], v[1]))

In [None]:
instrument = 52

# Take an explicit copy of the instrument's rows so the `cumprod` column can be
# added below without writing into a view of `df`. This replaces globally
# disabling pandas' chained-assignment warning.
dfi = df[df["id"] == instrument].copy()

# Raw per-timestamp target values.
fig_ret, ax_ret = plt.subplots(figsize=(8, 4))
ax_ret.plot(dfi["timestamp"], dfi["y"], linestyle="none", marker=".")
ax_ret.set_xlabel('timestamp')
ax_ret.set_ylabel('returns')
ax_ret.set_title('returns for id {}'.format(instrument))

# Compounded value of the series: running product of (1 + y).
dfi["cumprod"] = (1 + dfi["y"]).cumprod()

fig_cum, ax_cum = plt.subplots(figsize=(8, 4))
ax_cum.plot(dfi["timestamp"], dfi["cumprod"], linestyle="none", marker=".")
ax_cum.set_xlabel('timestamp')
ax_cum.set_ylabel('value')
ax_cum.set_title('compound returns for id {}'.format(instrument));


In [None]:
# One scatter panel per feature column of the selected instrument, three per row.
cols = [x for x in dfi.columns.values if x not in ["id", "timestamp", "y", "cumprod"]]
l = len(cols)

# squeeze=False keeps `ax` two-dimensional even when there is a single row,
# so ax[row, column] indexing always works.
f, ax = plt.subplots((l + 2) // 3, 3, figsize=(12, int(1.5 * l)), squeeze=False)

# (first, last) timestamp of the instrument, or None when it did not pass the
# range filter that builds `ids_in` (previously a KeyError).
bounds = ids_in.get(instrument)

for cnt, col in enumerate(cols):
    row, column = divmod(cnt, 3)
    panel = ax[row, column]
    panel.plot(dfi["timestamp"], dfi[col], linestyle="none", marker=".")
    panel.set_title("{} for id {}".format(col, instrument))
    panel.set_xlim([0, 2000])
    if bounds is not None:
        # Red guides at the first and last timestamp of the instrument.
        for edge in bounds:
            panel.axvline(x=edge, color="r", linewidth=1)

In [None]:
# Names of the pure feature columns (identifier, time, target and the derived
# cumprod column excluded).
cols = [name for name in dfi.columns.values
        if name not in ("id", "timestamp", "y", "cumprod")]
l = len(cols)

# Target and timestamp series with missing values replaced by 0.
target = dfi["y"].fillna(0)
ts = dfi["timestamp"].fillna(0)

# Model inputs: everything except id / y / cumprod. `timestamp` is NOT dropped,
# so the matrix has one more column than `cols`.
dfj = dfi.drop(["id", "y", "cumprod"], axis=1).fillna(0)
features = dfj.values


In [None]:
def train_test_split(data, test_size=0.1):
    """
    Split `data` by position into a leading training part and a trailing
    validation part.

    `data` is wrapped in a DataFrame first, so both returned arrays are
    two-dimensional even for one-dimensional input. The training part holds
    the first round(len(data) * (1 - test_size)) rows; the rest is validation.

    Returns a (train, val) tuple of numpy arrays.
    """
    frame = pd.DataFrame(data)
    cut = int(round(len(frame) * (1 - test_size)))

    head = frame.iloc[:cut]
    tail = frame.iloc[cut:]

    return (np.array(head), np.array(tail))

# Positional 90/10 split (default test_size) of inputs, target and timestamps;
# the same cut point applies to all three because they have equal length.
xtrain, xval = train_test_split(features)
ytrain, yval = train_test_split(target)
tstrain, tsval = train_test_split(ts)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

from sklearn.metrics import mean_squared_error

nest = 300  # number of trees per forest
md = 10     # maximum tree depth

rng = np.random.RandomState(1)

# Only the two random-forest variants are trained; the decision-tree, AdaBoost
# and gradient-boosting experiments that used to sit here were disabled.
regr_4 = MultiOutputRegressor(RandomForestRegressor(n_estimators=nest, max_depth=md, random_state=0))
regr_5 = RandomForestRegressor(n_estimators=nest, max_depth=md, random_state=rng)

# train_test_split returns the target as an (n, 1) column. The multi-output
# wrapper takes that 2-D shape; the plain forest gets a flat vector.
regr_4.fit(xtrain, ytrain)
regr_5.fit(xtrain, ytrain.ravel())

y_4 = regr_4.predict(xval)
y_5 = regr_5.predict(xval)

mse4 = mean_squared_error(yval, y_4)
mse5 = mean_squared_error(yval, y_5)

print("MSE4: %.6f  MSE5: %.6f" % (mse4, mse5))

# Observed series for the whole instrument, with both models' predictions over
# the validation tail. Labels name the models that were actually fitted.
plt.figure(figsize=(15, 10))
plt.plot(ts, target, c="k", label="observed y")
plt.plot(tsval, y_4, c="g", label="MultiOutput RandomForest ({} trees)".format(nest), linewidth=2)
plt.plot(tsval, y_5, c="r", label="RandomForest ({} trees)".format(nest), linewidth=2)
plt.xlabel('timestamp')
plt.ylabel('returns')
plt.legend(loc="best");


In [None]:
# `regr_2` is only mentioned in commented-out code, so it does not exist at
# this point; use the fitted random forest `regr_5` instead.
feature_importance = regr_5.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
# Ascending order, so the most important feature ends up at the top of the chart.
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

plt.figure(figsize=(15, 15))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, dfj.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()

In [None]:
# compute test set deviance
nest = 300

# `regr_3` is only mentioned in commented-out code, so the boosted model is
# fitted here to make this cell runnable on a fresh kernel. The default loss
# is squared error, so the loss argument is left out.
regr_3 = GradientBoostingRegressor(n_estimators=nest, learning_rate=0.1, max_depth=md, random_state=1)
regr_3.fit(xtrain, ytrain.ravel())

test_score = np.zeros((nest,), dtype=np.float64)

# Score the prediction of each boosting stage (not one fixed prediction) so
# the curve actually evolves with the iteration count. Mean squared error is
# the squared-error deviance the estimator reports in train_score_.
for i, y_pred in enumerate(regr_3.staged_predict(xval)):
    test_score[i] = mean_squared_error(yval, y_pred)

plt.figure(figsize=(12, 6))
plt.title('Deviance')
plt.plot(np.arange(nest) + 1, regr_3.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(np.arange(nest) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance');

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from pandas.tseries.offsets import *
from keras import callbacks

# Streams training progress to a monitoring server expected on localhost:9000.
remote = callbacks.RemoteMonitor(root='http://localhost:9000')

# Small fully connected regressor: 32 -> 10 -> 1 with a linear output.
model = Sequential()
# Input width is taken from the data rather than hard-coded as l + 1.
model.add(Dense(32, input_dim=xtrain.shape[1]))
model.add(Activation('relu'))
model.add(Dense(10))
model.add(Activation('relu'))
model.add(Dense(1))
# Previously this layer was constructed but never added to the model.
model.add(Activation('linear'))

model.compile(optimizer='adam', loss='mse')
# NOTE(review): `nb_epoch` is the Keras 1 spelling, kept to match the Keras
# version the notebook's other imports target — confirm before upgrading.
model.fit(xtrain, ytrain, batch_size=256, nb_epoch=20, validation_split=0.2, callbacks=[remote])

# Prediction vs. actual target for (at most) the first 1200 validation rows.
predicted = model.predict(xval)
dataf = pd.DataFrame(predicted[:1200])
dataf.columns = ["predict"]
# yval is an (n, 1) column; flatten it before storing it as a single column.
dataf["input"] = yval[:1200].ravel()
dataf.plot(figsize=(15, 5))

score = model.evaluate(xval, yval, batch_size=16)



In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation,TimeDistributedDense
from keras.layers import LSTM, Dropout

from pandas.tseries.offsets import *
from keras import callbacks

# Streams training progress to a monitoring server expected on localhost:9000.
remote = callbacks.RemoteMonitor(root='http://localhost:9000')

# The LSTM below is declared with a 3-D input (samples, timesteps, channels),
# but xtrain / xval are 2-D (samples, columns). Each row is therefore reshaped
# into a sequence with one scalar per step, and the sequence length is taken
# from the data instead of the hard-coded 61.
# NOTE(review): treating feature columns as timesteps is an assumption made to
# get a runnable model — confirm the intended windowing.
length_of_sequences = xtrain.shape[1]
in_out_neurons = 1
hidden_neurons = 61
bs = 61

xtrain_seq = xtrain.reshape(xtrain.shape[0], length_of_sequences, in_out_neurons)
xval_seq = xval.reshape(xval.shape[0], length_of_sequences, in_out_neurons)

model = Sequential()
model.add(LSTM(hidden_neurons, batch_input_shape=(None, length_of_sequences, in_out_neurons), return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(in_out_neurons))
model.add(Activation("linear"))
model.compile(loss="mean_squared_error", optimizer="adam")

model.fit(xtrain_seq, ytrain, batch_size=bs, nb_epoch=15, validation_split=0.2, callbacks=[remote])

# Prediction vs. actual target for (at most) the first 1200 validation rows.
predicted = model.predict(xval_seq)
dataf = pd.DataFrame(predicted[:1200])
dataf.columns = ["predict"]
# yval is an (n, 1) column; flatten it before storing it as a single column.
dataf["input"] = yval[:1200].ravel()
dataf.plot(figsize=(15, 5))

score = model.evaluate(xval_seq, yval, batch_size=16)

In [None]:
# Display the array of unique instrument ids.
ids

In [None]:
# Latest exponentially weighted mean (span 5) of the target for every
# instrument, keyed by instrument id.
means = {}

for x in ids:
    ys = df.loc[df["id"] == x, "y"]
    # Series.ewm(...).mean() replaces the removed pd.ewma(); defaults match.
    ma = ys.ewm(span=5).mean()
    # Key by the loop variable (previously an unrelated `i`) and take the last
    # row by position: the Series keeps df's row labels, so ma[-1] would be a
    # label lookup rather than "last element".
    means[x] = ma.iloc[-1]

In [None]:
from scipy.stats import johnsonsu

# Shape parameters passed to the Johnson SU distribution.
a = 2.55439557416
b = 2.24822816797

# Request mean, variance, skew and kurtosis ('mvsk') in one call, then unpack.
moments = johnsonsu.stats(a, b, moments='mvsk')
mean, var, skew, kurt = moments


In [None]:
# Display the kurtosis value unpacked from johnsonsu.stats(...).
kurt

In [None]:
# `norm` is not imported in any visible cell, so bring it into scope here;
# otherwise this cell raises NameError on a fresh kernel.
from scipy.stats import norm

# Default moments ('mv') of the standard normal distribution.
norm.stats()

In [None]:
# Look up the entry stored under key 0 in `means`
# (raises KeyError if 0 is not one of its keys).
means[0]

In [None]:
# Values of `means` in the order of `ids`. The original iterated over the
# builtin `list` type, which is not iterable.
# NOTE(review): `ids` is assumed to be the intended key sequence — confirm.
[means[d] for d in ids]

In [None]:
import random
# Three values drawn without replacement. random.sample needs a sequence, and
# a dict view is not one on Python 3, hence the list() copy.
random.sample(list(means.values()), 3)

In [None]:
# Three values drawn without replacement. random.sample needs a sequence, and
# a dict view is not one on Python 3, hence the list() copy.
randy = random.sample(list(means.values()), 3)

In [None]:
# Display the three sampled values.
randy

In [None]:
def rs(mu, sig, symlist):
    """
    Draw one normally distributed sample per symbol.

    Parameters
    ----------
    mu : mapping
        Symbol -> mean of that symbol's distribution.
    sig : mapping
        Symbol -> standard deviation of that symbol's distribution.
    symlist : iterable
        Symbols to sample, in output order.

    Returns
    -------
    list
        One draw from N(mu[s], sig[s]) for each s in `symlist`.

    Raises
    ------
    KeyError
        If a symbol is missing from `mu` or `sig`.

    Each symbol and its parameters are printed before sampling.
    """
    samp = []

    for i in symlist:
        # A print() call with a single formatted string gives the same
        # "symbol mean std" line on Python 2 and Python 3.
        print("{} {} {}".format(i, mu[i], sig[i]))
        s = np.random.normal(mu[i], sig[i])
        samp.append(s)
    return samp

In [None]:
# Plain mean and standard deviation of the target for every instrument,
# each keyed by instrument id.
means, std = {}, {}
for i in ids:
    ys = df.loc[df["id"] == i, "y"]
    means[i], std[i] = ys.mean(), ys.std()

In [None]:
# Look up the entry stored under key 0 in `means`
# (raises KeyError if 0 is not one of its keys).
means[0]

In [None]:
# One random draw per instrument, using its own mean and standard deviation.
foo = rs(mu=means, sig=std, symlist=ids)

In [None]:
# Single draw from a normal distribution using the entries stored under key 1.
z = np.random.normal(loc=means[1], scale=std[1])

In [None]:
# Display the sampled value.
z

In [None]:
# NOTE(review): `samp` is only assigned as a local variable inside rs(); no
# visible cell defines it at notebook scope, so this raises NameError on a
# fresh kernel. Presumably the list returned by rs() (`foo`) was meant —
# confirm or delete this cell.
samp

In [None]:
# NOTE(review): `samp` is only assigned as a local variable inside rs(); no
# visible cell defines it at notebook scope, so this raises NameError on a
# fresh kernel. Presumably the list returned by rs() (`foo`) was meant —
# confirm or delete this cell.
samp.append(z)

In [None]:
# NOTE(review): `z` is assigned from np.random.normal(means[1], std[1]);
# assuming those two entries are scalars, `z` is a scalar as well and cannot
# be indexed. Looks like leftover scratch work — confirm or delete this cell.
z[0]

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

regression = linear_model.ElasticNetCV()
pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('regression', regression)])

cols = [x for x in df.columns.values if x not in ["id", "timestamp","y","cumprod"]]
l = len(cols)
dfj = dfi.fillna(0)
target = dfj.pop('y')
ts = dfj.pop('timestamp')
dfj = dfi.drop(["id","y","cumprod"],axis=1)
dfj=dfj.fillna(0)
features = dfj.values

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target