In [1]:
import pandas as pd
import numpy as np
import bokeh.plotting as bp
#bp.output_notebook()

In [138]:
#Se lee la señal anteriormente procesada
data_set = pd.read_csv("../data/TrueFX/EUR-USD/EURUSD-2017-07-until-2017-12.csv", parse_dates=True,
                      infer_datetime_format=True, index_col=0)

In [139]:
# %load ../code/build_dataset.py
def build_dataset(df, window, binary_target=False, delete_constant_values=True, PNL=False):
    """
    función para construir un data set
    window: tamaño de la ventana a utilizar para construir el dataset
    df: dataframe, con columna bid y ask.
    binary_target: si desea clasificar, este arroja 2 si el valor se mantiene,
    1 si el valor sube y 0 si este baja.
    delete_constant_values: default: True, elimina los valores que se mantienen
    
    retorna:
    X: dataset, con columna de PNL si así se especifica(default: False)
    y: target
    bt: binary target, default: False
    """

    import pandas as pd
    import numpy as np
    result = []
    binary = [] #para la columna objetivo binaria
    pnl_buy = [] #almacenar el pnl en caso de compra
    pnl_sell = [] #almacenar el pnl en caso de venta
    signal = df.bid
    ask = df.ask
    indx = signal.index[window-1:-1] #se toman los indicen que quedarán al final
    for i in range(len(signal)-window):
        
        if delete_constant_values == True:
            if signal[i+window] != signal[i+window-1]:

                result.append(signal[i: i + window+1])
                if PNL == True: 
                    pnl_sell.append(signal[i+window-1] - ask[i+window]) #calcular pnl en caso de venta-compra
                    pnl_buy.append(signal[i+window] - ask[i+window-1]) #calcular pnl en caso de compra-venta
                
                if binary_target == True:
                    if signal[i+window] < signal[i+window-1]: binary.append(0) # 0 si baja
                    if signal[i+window] > signal[i+window-1]: binary.append(1) # 1 si sube
                        
            else: indx = indx.delete(len(result))
                
        else:

            result.append(signal[i: i + window+1])
            if PNL == True:
                pnl_sell.append(signal[i+window-1] - ask[i+window]) #calcular pnl en caso de venta-compra
                pnl_buy.append(signal[i+window] - ask[i+window-1]) #calcular pnl en caso de compra-venta
            
        if binary_target == True and delete_constant_values == False:
            if signal[i+window] == signal[i+window-1]: binary.append(2) # 2 si se mantiene
            if signal[i+window] < signal[i+window-1]: binary.append(0) # 1 si baja
            if signal[i+window] > signal[i+window-1]: binary.append(1) # 0 si sube
    
    data = pd.DataFrame(np.array(result), index=indx)
    y = np.array(data.iloc[:,window])
    data = data.drop(window,axis=1)
    if PNL == True:
        data['PNL_0'] = pnl_sell
        data['PNL_1'] = pnl_buy 
    if binary_target == True: return data, y, np.array(binary)
    else: return data, y

In [140]:
data_set[:10]

Unnamed: 0_level_0,bid,ask
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-07-03 00:00:10,1.14183,1.14211
2017-07-03 00:00:20,1.14163,1.14211
2017-07-03 00:00:30,1.14164,1.14206
2017-07-03 00:00:50,1.14164,1.14206
2017-07-03 00:01:00,1.14164,1.14208
2017-07-03 00:01:10,1.14164,1.14206
2017-07-03 00:01:20,1.14163,1.14206
2017-07-03 00:01:30,1.14156,1.14206
2017-07-03 00:01:40,1.14164,1.14208
2017-07-03 00:01:50,1.14164,1.14206


In [141]:
window = 3
X,y_reg,bt= build_dataset(data_set,window=window, PNL=True,binary_target=True,delete_constant_values=True)

In [142]:
X.head()

Unnamed: 0_level_0,0,1,2,PNL_0,PNL_1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-07-03 00:01:10,1.14164,1.14164,1.14164,-0.00042,-0.00043
2017-07-03 00:01:20,1.14164,1.14164,1.14163,-0.00043,-0.0005
2017-07-03 00:01:30,1.14164,1.14163,1.14156,-0.00052,-0.00042
2017-07-03 00:02:00,1.14164,1.14164,1.14164,-0.00042,-0.00051
2017-07-03 00:02:10,1.14164,1.14164,1.14156,-0.00052,-0.00042


In [143]:
# %load ../code/PNLEstimatorWrapper.py
import pandas as pd
import numpy as np
class PNLEstimatorWrapper:
    
    def __init__(self, estimator, PNL_column, exclude_PNL_column_from_training=True):
        self.estimator = estimator
        self.PNL_column = PNL_column
        self.exclude_PNL_column_from_training = exclude_PNL_column_from_training
        
    def fit(self, X, y):
        assert 'PNL_1' and 'PNL_0' in X.columns, "column "+self.PNL_column+" not in X dataframe"
        if self.exclude_PNL_column_from_training:
            X = X[[col for col in X.columns if col!=self.PNL_column[0] and col!=self.PNL_column[1]]]
        self.estimator.fit(X,y)
        
    def predict(self, X):
        assert 'PNL_1' and 'PNL_0' in X.columns, "column "+self.PNL_column+" not in X dataframe"
        if self.exclude_PNL_column_from_training:
            X = X[[col for col in X.columns if col!=self.PNL_column[0] and col!=self.PNL_column[1]]]
        return self.estimator.predict(X)
    
    def score(self, X, y):
        pnl_1 = X[self.PNL_column[0]]
        pnl_0 = X[self.PNL_column[1]]
        pre = self.predict(X)
        r = sum((pre==1)*pnl_1 + (pre==0)*pnl_0)
        sell = sum((pre==0)*1)
        buy = sum((pre==1)*1)
        
        count_ones, count_zeros = 0,0
        l_one, l_zero = [],[]
        for i in pre:
            if i==1:
                l_zero.append(count_zeros)
                count_zeros = 0
                count_ones += 1
            else:
                l_one.append(count_ones)
                count_zeros += 1
                count_ones = 0
        l_zero.append(count_zeros)
        l_one.append(count_ones)
        
        #return np.array([r,buy,sell,max(l_one),max(l_zero)])
        return np.array([r,buy,sell,max(l_one),max(l_zero)]), pre
        
    def get_params(self, deep=False):
        return {"PNL_column": self.PNL_column,
                "exclude_PNL_column_from_training": self.exclude_PNL_column_from_training,
                "estimator": self.estimator}

In [144]:
%run ../code/redim.py
%run ../code/step_validation.py
%run ../code/v_split.py

In [145]:
from sklearn.naive_bayes import GaussianNB
    
y = bt
n_bdtrain = 4
n_bdtest = 1
mday = 1
#para indexar el dataframe por las fechas
dates = np.unique(X.index.date)[n_bdtrain:]
est_GNB = PNLEstimatorWrapper(GaussianNB(), PNL_column=['PNL_0','PNL_1'])
result_GNB = step_validation(est_GNB, X, y, v_split(X,n_bdtrain,n_bdtest,mday))
result_GNB, predict_GNB = redim(result_GNB)
print('--------- GaussianNB ----------------')
print('average PNL : ', np.mean(result_GNB[:,0]))
print('average buy: ', np.mean(result_GNB[:,1]))
print('average sell: ', np.mean(result_GNB[:,2]))
print('-------------------------------------')
df_GNB = pd.DataFrame(result_GNB, columns=list(['PNL','buys','sells','longest_buys','longest_sells']),index=dates)
print(df_GNB)
#df_GNB.to_csv('../resultados/TrueFX/GBP-USD/GaussianNB/pnl_GNB_wsize3_2018-01.csv')

--------- GaussianNB ----------------
average PNL :  -0.275397460317
average buy:  1975.34920635
average sell:  4860.3015873
-------------------------------------
                PNL    buys   sells  longest_buys  longest_sells
2017-07-07 -0.23658     0.0  7013.0           0.0         7013.0
2017-07-10 -0.24576     0.0  6535.0           0.0         6535.0
2017-07-11 -0.20856     0.0  6687.0           0.0         6687.0
2017-07-12 -0.23936     0.0  6995.0           0.0         6995.0
2017-07-13 -0.22130  3035.0  3861.0        1685.0         2981.0
2017-07-14 -0.22834  2344.0  4376.0        1714.0         3450.0
2017-07-17 -0.31109     0.0  6631.0           0.0         6631.0
2017-07-18 -0.22088     0.0  7201.0           0.0         7201.0
2017-07-19 -0.21661     0.0  6745.0           0.0         6745.0
2017-07-20 -0.22630     0.0  7096.0           0.0         7096.0
2017-07-21 -0.23226     0.0  7054.0           0.0         7054.0
2017-07-24 -0.32235     0.0  6832.0           0.0        

In [146]:
from sklearn.neighbors import KNeighborsClassifier
    
y = bt
n_bdtrain = 4
n_bdtest = 1
mday = 1
#para indexar el dataframe por las fechas
dates = np.unique(X.index.date)[n_bdtrain:]
est_KNC = PNLEstimatorWrapper(KNeighborsClassifier(), PNL_column=['PNL_1','PNL_0'])
result_KNC = step_validation(est_KNC, X, y, v_split(X,n_bdtrain,n_bdtest,mday))
result_KNC, predict_KNC = redim(result_KNC)
print('--------- KNeighborsClassifier ------------')
print('average PNL : ', np.mean(result_KNC[:,0]))
print('average buy: ', np.mean(result_KNC[:,1]))
print('average sell: ', np.mean(result_KNC[:,2]))
print('-------------------------------------')
df_KNC = pd.DataFrame(result_KNC, columns=list(['PNL','buys','sells','longest_buys','longest_sells']),index=dates)
print(df_KNC)

--------- KNeighborsClassifier ------------
average PNL :  -0.274695079365
average buy:  3077.79365079
average sell:  3757.85714286
-------------------------------------
                PNL    buys   sells  longest_buys  longest_sells
2017-07-07 -0.22736  3351.0  3662.0          21.0           18.0
2017-07-10 -0.24303  3015.0  3520.0          19.0           20.0
2017-07-11 -0.21326  2517.0  4170.0          25.0         1294.0
2017-07-12 -0.24212  3802.0  3193.0         401.0          340.0
2017-07-13 -0.21190  3366.0  3530.0          20.0           48.0
2017-07-14 -0.23198  3219.0  3501.0          20.0           17.0
2017-07-17 -0.30994  3102.0  3529.0          23.0           28.0
2017-07-18 -0.23206   496.0  6705.0           9.0         6145.0
2017-07-19 -0.21132  3775.0  2970.0         173.0           30.0
2017-07-20 -0.24641  3573.0  3523.0        1327.0         2043.0
2017-07-21 -0.23485  1597.0  5457.0          18.0         2100.0
2017-07-24 -0.31508  3125.0  3707.0          15.0 

In [147]:
from sklearn.tree import DecisionTreeClassifier

est_DTC = PNLEstimatorWrapper(DecisionTreeClassifier(), PNL_column=['PNL_1','PNL_0'])
result_DTC = step_validation(est_DTC, X, y, v_split(X,4,1,1))
result_DTC, predict_DTC = redim(result_DTC)
print('------ DecisionTreeClassifier -------')
print('average PNL: ', np.mean(result_DTC[:,0]))
print('average buy: ', np.mean(result_DTC[:,1]))
print('average sell: ', np.mean(result_DTC[:,2]))
print('-------------------------------------')
df_DTC = pd.DataFrame(result_DTC, columns=list(['PNL','buys','sells','longest_buys','longest_sells']),index=dates)
print(df_DTC)

------ DecisionTreeClassifier -------
average PNL:  -0.276058571429
average buy:  3114.52380952
average sell:  3721.12698413
-------------------------------------
                PNL    buys   sells  longest_buys  longest_sells
2017-07-07 -0.22621  3343.0  3670.0          14.0           30.0
2017-07-10 -0.24035  2938.0  3597.0          14.0           24.0
2017-07-11 -0.21308  3740.0  2947.0        1141.0           50.0
2017-07-12 -0.23283  4135.0  2860.0         792.0           29.0
2017-07-13 -0.21347  3300.0  3596.0          18.0           22.0
2017-07-14 -0.23084  3079.0  3641.0          10.0           13.0
2017-07-17 -0.30418  3139.0  3492.0          12.0           12.0
2017-07-18 -0.23313   483.0  6718.0          10.0         6166.0
2017-07-19 -0.22288  3651.0  3094.0          94.0           33.0
2017-07-20 -0.24454  1754.0  5342.0          39.0         2039.0
2017-07-21 -0.23130  1996.0  5058.0          11.0         2010.0
2017-07-24 -0.32012  3354.0  3478.0          27.0        

In [148]:
from sklearn.ensemble import RandomForestClassifier

est_RFC = PNLEstimatorWrapper(RandomForestClassifier(n_estimators=15, n_jobs=3), PNL_column=['PNL_1','PNL_0'])
result_RFC = step_validation(est_RFC, X, y, v_split(X,4,1,1))
result_RFC, predict_RFC = redim(result_RFC)
print('------- RandomForestClassifier ----------')
print('PNL promedio: ', np.mean(result_RFC[:,0]))
print('buy promedio: ', np.mean(result_RFC[:,1]))
print('sell promedio: ', np.mean(result_RFC[:,2]))
print('-------------------------------------')
df_RFC = pd.DataFrame(result_RFC, columns=list(['PNL','buys','sells','longest_buys','longest_sells']),index=dates)
print(df_RFC)

------- RandomForestClassifier ----------
PNL promedio:  -0.275389365079
buy promedio:  3141.5952381
sell promedio:  3694.05555556
-------------------------------------
                PNL    buys   sells  longest_buys  longest_sells
2017-07-07 -0.22614  3260.0  3753.0          13.0           30.0
2017-07-10 -0.23979  2936.0  3599.0          13.0           24.0
2017-07-11 -0.21362  2511.0  4176.0          14.0         1408.0
2017-07-12 -0.23636  3812.0  3183.0         792.0          316.0
2017-07-13 -0.21425  3463.0  3433.0          19.0           23.0
2017-07-14 -0.23520  3215.0  3505.0          14.0           16.0
2017-07-17 -0.30229  3154.0  3477.0          15.0           12.0
2017-07-18 -0.23133   508.0  6693.0          13.0         6161.0
2017-07-19 -0.21690  3671.0  3074.0          94.0           33.0
2017-07-20 -0.24477  2682.0  4414.0         185.0         2043.0
2017-07-21 -0.22806  1672.0  5382.0          12.0         2031.0
2017-07-24 -0.32091  3244.0  3588.0          17.0  