In [1]:
import pandas as pd
import numpy as np
import bokeh.plotting as bp
#bp.output_notebook()

In [2]:
# Load the previously preprocessed signal. The first CSV column becomes the
# index and parse_dates=True asks pandas to parse that index as datetimes.
# infer_datetime_format is no longer passed: it is deprecated since pandas 2.0,
# where format inference is the default behaviour.
data_set = pd.read_csv(
    "../data/TrueFX/GBP-USD/GBPUSD-preprocesado-2018-01.csv",
    index_col=0,
    parse_dates=True,
)

In [3]:
# %load ../code/build_dataset.py
def build_dataset(df, window, binary_target=False, delete_constant_values=True, PNL=False):
    """
    Build a sliding-window supervised dataset from the ``bid`` column of ``df``.

    Every sample holds ``window`` consecutive bid values and its target is the
    bid that immediately follows the window. A sample is labelled with the index
    entry of the last bid inside its window. All look-ups are positional, so the
    function works with any index type (datetime, integer, ...).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``bid`` column (and an ``ask`` column when ``PNL`` is True).
    window : int
        Number of consecutive bid values per sample; must be >= 1.
    binary_target : bool, default False
        When True, also return the direction of the move from the last bid of
        the window to the target: 1 if it rises, 0 if it falls, 2 otherwise
        (the value is unchanged).
    delete_constant_values : bool, default True
        When True, drop the samples whose target equals the last bid of the
        window.
    PNL : bool, default False
        When True, append an ``ask`` column holding the ask quoted at the same
        row as the last bid of each window.

    Returns
    -------
    X : pandas.DataFrame
        One row per kept sample, columns ``0 .. window-1`` (plus ``'ask'``).
    y : numpy.ndarray
        Target bid of every kept sample.
    bt : numpy.ndarray
        Direction labels aligned with ``X``; only returned when
        ``binary_target`` is True.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    import numpy as np
    import pandas as pd

    if window < 1:
        raise ValueError("window must be a positive integer")

    bid = np.asarray(df["bid"])
    # For the sample starting at position i, the last value of its window sits
    # at i + window - 1 and its target at i + window.
    last_in_window = bid[window - 1:-1]
    target = bid[window:]

    if delete_constant_values:
        keep = target != last_in_window
    else:
        keep = np.ones(len(target), dtype=bool)

    # Gather all kept windows at once: row k spans starts[k] .. starts[k] + window - 1.
    starts = np.flatnonzero(keep)
    windows = bid[starts[:, None] + np.arange(window)]

    data = pd.DataFrame(
        windows,
        index=df.index[window - 1:-1][keep],
        columns=pd.RangeIndex(window),
    )
    y = target[keep]

    if PNL:
        data["ask"] = np.asarray(df["ask"])[window - 1:-1][keep]

    if binary_target:
        # Label every transition, then keep the same rows as X so that the
        # labels can never drift out of alignment with the samples.
        direction = np.where(
            target > last_in_window, 1, np.where(target < last_in_window, 0, 2)
        )
        return data, y, direction[keep]
    return data, y

In [4]:
# Preview the first rows of the loaded bid/ask frame.
data_set.head()

Unnamed: 0_level_0,bid,ask
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-02 00:00:40,1.34979,1.35176
2018-01-02 00:01:00,1.34918,1.35208
2018-01-02 00:01:10,1.34915,1.35354
2018-01-02 00:01:20,1.34915,1.35386
2018-01-02 00:01:30,1.34938,1.3544


In [5]:
# Build the windowed dataset: X holds the bid windows plus the 'ask' column,
# y_reg the next bid of every window and bt the up/down label of that move.
window = 3
X, y_reg, bt = build_dataset(
    data_set,
    window=window,
    binary_target=True,
    delete_constant_values=True,
    PNL=True,
)

In [6]:
# Preview the first rows of the windowed feature frame.
X.head()

Unnamed: 0_level_0,0,1,2,ask
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-02 00:01:20,1.34918,1.34915,1.34915,1.35386
2018-01-02 00:01:40,1.34915,1.34938,1.34938,1.35386
2018-01-02 00:01:50,1.34938,1.34938,1.34942,1.353
2018-01-02 00:02:10,1.34938,1.34942,1.34939,1.353
2018-01-02 00:02:40,1.34939,1.34979,1.34979,1.35153


In [13]:
# %load ../code/PNLEstimatorWrapper.py
import pandas as pd
import numpy as np
class PNLEstimatorWrapper:
    """
    Wrap a classifier so that it can be scored with a price-based PNL figure.

    The wrapped estimator is trained and queried on the feature columns of X,
    optionally leaving out ``PNL_column`` (the price column used only for
    scoring).

    Parameters
    ----------
    estimator : object
        Any object exposing ``fit(X, y)`` and ``predict(X)``.
    PNL_column : label
        Name of the column of X that holds the price applied to "buy" (1)
        predictions.
    exclude_PNL_column_from_training : bool, default True
        When True, ``PNL_column`` is removed from X before calling the
        estimator.
    """

    def __init__(self, estimator, PNL_column, exclude_PNL_column_from_training=True):
        self.estimator = estimator
        self.PNL_column = PNL_column
        self.exclude_PNL_column_from_training = exclude_PNL_column_from_training

    def _features(self, X):
        """Return the columns of X that are handed to the wrapped estimator."""
        assert self.PNL_column in X.columns, \
            "column {} not in X dataframe".format(self.PNL_column)
        if self.exclude_PNL_column_from_training:
            return X[[col for col in X.columns if col != self.PNL_column]]
        return X

    @staticmethod
    def _longest_run(mask):
        """Length of the longest run of consecutive True values in ``mask``."""
        padded = np.concatenate(([False], np.asarray(mask, dtype=bool), [False]))
        # Positions where the mask flips; they alternate run start / run end.
        edges = np.flatnonzero(padded[1:] != padded[:-1])
        runs = edges[1::2] - edges[::2]
        return int(runs.max()) if runs.size else 0

    def fit(self, X, y):
        """Fit the wrapped estimator on the feature columns of X; returns self."""
        self.estimator.fit(self._features(X), y)
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for the feature columns of X."""
        return self.estimator.predict(self._features(X))

    def score(self, X, y):
        """
        Summarise the predictions made for X.

        ``y`` is accepted for API compatibility but is not used.

        Returns
        -------
        stats : numpy.ndarray
            ``[PNL, buys, sells, longest_buys, longest_sells]`` where PNL is the
            sum of ``|PNL_column|`` over rows predicted 1 minus the sum of the
            last non-PNL column over rows predicted 0, buys/sells count the
            predictions equal to 1/0, and longest_* are the longest runs of
            consecutive 1/0 predictions.
        pre : array-like
            The predictions themselves.

        NOTE(review): PNL is a signed sum of quoted prices, not a realised
        profit — confirm this is the intended metric.
        """
        ask = np.asarray(X[self.PNL_column])
        pre = self.predict(X)
        labels = np.asarray(pre)
        buys = labels == 1
        sells = labels == 0

        # Price applied to "sell" predictions: the last feature column, i.e. the
        # most recent value of the window whatever the window size is. (A fixed
        # position 2 is only that column when the window has three values.)
        feature_positions = [
            pos for pos, col in enumerate(X.columns) if col != self.PNL_column
        ]
        bid = np.asarray(X.iloc[:, feature_positions[-1]])

        r = np.sum(buys * np.abs(ask)) - np.sum(sells * bid)
        buy = int(buys.sum())
        sell = int(sells.sum())

        # Runs are measured per label, so any other label (e.g. 2 = "hold")
        # breaks both a buy run and a sell run instead of counting as a sell.
        longest_buys = self._longest_run(buys)
        longest_sells = self._longest_run(sells)

        return np.array([r, buy, sell, longest_buys, longest_sells]), pre

    def get_params(self, deep=False):
        """Return the constructor arguments as a dict (``deep`` is ignored)."""
        return {"PNL_column": self.PNL_column,
                "exclude_PNL_column_from_training": self.exclude_PNL_column_from_training,
                "estimator": self.estimator}

In [14]:
# Run the project helper scripts so that the names they define become available
# in this notebook. The cells below call redim, step_validation and v_split,
# which are presumably defined by these scripts — confirm in ../code.
%run ../code/redim.py
%run ../code/step_validation.py
%run ../code/v_split.py

In [15]:
from sklearn.naive_bayes import GaussianNB

# Evaluate a Gaussian naive Bayes classifier on the binary (up/down) target.
y = bt
# Arguments forwarded to v_split (presumably training days, test days and
# step between folds — confirm in ../code/v_split.py).
n_bdtrain, n_bdtest, mday = 4, 1, 1
# Dates used to index the result frame; the first n_bdtrain dates are skipped.
dates = np.unique(X.index.date)[n_bdtrain:]

est_GNB = PNLEstimatorWrapper(GaussianNB(), PNL_column='ask')
result_GNB, predict_GNB = redim(
    step_validation(est_GNB, X, y, v_split(X, n_bdtrain, n_bdtest, mday))
)

print('--------- GaussianNB ----------------')
print('average PNL : ', np.mean(result_GNB[:, 0]))
print('average buy: ', np.mean(result_GNB[:, 1]))
print('average sell: ', np.mean(result_GNB[:, 2]))
print('-------------------------------------')

df_GNB = pd.DataFrame(
    result_GNB,
    index=dates,
    columns=['PNL', 'buys', 'sells', 'longest_buys', 'longest_sells'],
)
print(df_GNB)
# Optional export of the table:
#df_GNB.to_csv('../resultados/TrueFX/GBP-USD/GaussianNB/pnl_GNB_wsize3_2018-01.csv')

--------- GaussianNB ----------------
average PNL :  -3793.98727053
average buy:  2137.52631579
average sell:  4865.57894737
-------------------------------------
                    PNL    buys   sells  longest_buys  longest_sells
2018-01-08  -7503.88987   633.0  6166.0         451.0         2635.0
2018-01-09    443.29014  3585.0  3248.0        2116.0         2825.0
2018-01-10    913.33529  3864.0  3183.0        1772.0         1066.0
2018-01-11   3209.15977  4759.0  2374.0        4759.0         2374.0
2018-01-12  -2861.48698  2545.0  4677.0        2450.0         4643.0
2018-01-15  10022.95726  7279.0     0.0        7279.0            0.0
2018-01-16  -9794.88299     0.0  7108.0           0.0         7108.0
2018-01-17 -10377.89010     0.0  7515.0           0.0         7515.0
2018-01-18 -10350.86476     0.0  7473.0           0.0         7473.0
2018-01-19 -10223.45825     0.0  7361.0           0.0         7361.0
2018-01-22 -10290.47006     0.0  7399.0           0.0         7399.0
2018-01-2

In [16]:
from sklearn.neighbors import KNeighborsClassifier

# Evaluate a k-nearest-neighbours classifier on the binary (up/down) target.
y = bt
# Arguments forwarded to v_split (presumably training days, test days and
# step between folds — confirm in ../code/v_split.py).
n_bdtrain, n_bdtest, mday = 4, 1, 1
# Dates used to index the result frame; the first n_bdtrain dates are skipped.
dates = np.unique(X.index.date)[n_bdtrain:]

est_KNC = PNLEstimatorWrapper(KNeighborsClassifier(), PNL_column='ask')
result_KNC, predict_KNC = redim(
    step_validation(est_KNC, X, y, v_split(X, n_bdtrain, n_bdtest, mday))
)

print('--------- KNeighborsClassifier ------------')
print('average PNL : ', np.mean(result_KNC[:, 0]))
print('average buy: ', np.mean(result_KNC[:, 1]))
print('average sell: ', np.mean(result_KNC[:, 2]))
print('-------------------------------------')

df_KNC = pd.DataFrame(
    result_KNC,
    index=dates,
    columns=['PNL', 'buys', 'sells', 'longest_buys', 'longest_sells'],
)
print(df_KNC)

--------- KNeighborsClassifier ------------
average PNL :  -2365.48942789
average buy:  2653.15789474
average sell:  4349.94736842
-------------------------------------
                   PNL    buys   sells  longest_buys  longest_sells
2018-01-08  -836.92393  3091.0  3708.0          18.0           26.0
2018-01-09  -651.49452  3176.0  3657.0          15.0           16.0
2018-01-10   424.89442  3681.0  3366.0         274.0           12.0
2018-01-11  -585.99185  3349.0  3784.0          48.0          214.0
2018-01-12 -6044.16609  1399.0  5823.0          14.0         4114.0
2018-01-15 -7945.25315   756.0  6523.0          12.0         4021.0
2018-01-16  -352.90183  3426.0  3682.0          22.0           37.0
2018-01-17 -3690.16439  2425.0  5090.0          12.0         1947.0
2018-01-18 -1079.13487  3347.0  4126.0          33.0           39.0
2018-01-19 -2588.77964  2750.0  4611.0          20.0          461.0
2018-01-22 -3257.32933  2532.0  4867.0          14.0         1334.0
2018-01-23 -271

In [18]:
from sklearn.tree import DecisionTreeClassifier

# Evaluate a decision tree on the binary target (y, dates and the split
# parameters come from the earlier evaluation cells).
# random_state is fixed because the tree permutes the features at every split,
# so an unseeded model can give different results on each run.
est_DTC = PNLEstimatorWrapper(DecisionTreeClassifier(random_state=42), PNL_column='ask')
# Reuse n_bdtrain / n_bdtest / mday rather than literal 4, 1, 1 so the folds
# always match `dates`, which is sliced with n_bdtrain.
result_DTC = step_validation(est_DTC, X, y, v_split(X, n_bdtrain, n_bdtest, mday))
result_DTC, predict_DTC = redim(result_DTC)
print('------ DecisionTreeClassifier -------')
print('average PNL: ', np.mean(result_DTC[:, 0]))
print('average buy: ', np.mean(result_DTC[:, 1]))
print('average sell: ', np.mean(result_DTC[:, 2]))
print('-------------------------------------')
df_DTC = pd.DataFrame(
    result_DTC,
    columns=['PNL', 'buys', 'sells', 'longest_buys', 'longest_sells'],
    index=dates,
)
print(df_DTC)

------ DecisionTreeClassifier -------
average PNL:  -1442.62770947
average buy:  2979.15789474
average sell:  4023.94736842
-------------------------------------
                   PNL    buys   sells  longest_buys  longest_sells
2018-01-08  -787.56461  3109.0  3690.0          11.0           17.0
2018-01-09  -519.35827  3225.0  3608.0          12.0           17.0
2018-01-10  -245.78321  3433.0  3614.0         274.0           12.0
2018-01-11 -1954.21510  2842.0  4291.0          16.0          453.0
2018-01-12 -5862.48447  1466.0  5756.0          12.0         4112.0
2018-01-15 -7505.62716   916.0  6363.0          13.0         4021.0
2018-01-16  -209.25725  3478.0  3630.0          13.0           22.0
2018-01-17 -3430.63843  2519.0  4996.0          11.0         1949.0
2018-01-18  -371.78828  3602.0  3871.0          37.0           34.0
2018-01-19 -1673.47027  3079.0  4282.0          15.0           95.0
2018-01-22 -3054.54116  2605.0  4794.0          13.0         1340.0
2018-01-23 -2214.39536

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Evaluate a random forest on the binary target (y, dates and the split
# parameters come from the earlier evaluation cells).
# random_state is fixed because the forest is built from random bootstrap
# samples and random feature subsets; without a seed the table below changes
# on every run.
est_RFC = PNLEstimatorWrapper(
    RandomForestClassifier(n_estimators=15, n_jobs=3, random_state=42),
    PNL_column='ask',
)
# Reuse n_bdtrain / n_bdtest / mday rather than literal 4, 1, 1 so the folds
# always match `dates`, which is sliced with n_bdtrain.
result_RFC = step_validation(est_RFC, X, y, v_split(X, n_bdtrain, n_bdtest, mday))
result_RFC, predict_RFC = redim(result_RFC)
print('------- RandomForestClassifier ----------')
print('PNL promedio: ', np.mean(result_RFC[:, 0]))
print('buy promedio: ', np.mean(result_RFC[:, 1]))
print('sell promedio: ', np.mean(result_RFC[:, 2]))
print('-------------------------------------')
df_RFC = pd.DataFrame(
    result_RFC,
    columns=['PNL', 'buys', 'sells', 'longest_buys', 'longest_sells'],
    index=dates,
)
print(df_RFC)

------- RandomForestClassifier ----------
PNL promedio:  -2248.47560263
buy promedio:  2696.21052632
sell promedio:  4306.89473684
-------------------------------------
                   PNL    buys   sells  longest_buys  longest_sells
2018-01-08  -589.91250  3182.0  3617.0          16.0           14.0
2018-01-09  -752.05723  3139.0  3694.0          12.0           17.0
2018-01-10   221.97253  3606.0  3441.0         277.0           14.0
2018-01-11  1092.26489  3972.0  3161.0         449.0           24.0
2018-01-12 -5887.02097  1457.0  5765.0          10.0         4112.0
2018-01-15 -7961.74986   750.0  6529.0           9.0         4021.0
2018-01-16  -269.65335  3456.0  3652.0          19.0           25.0
2018-01-17 -3640.31636  2443.0  5072.0          12.0         1953.0
2018-01-18 -1392.74204  3234.0  4239.0          20.0           39.0
2018-01-19 -2012.86141  2957.0  4404.0          17.0          169.0
2018-01-22 -3304.80373  2515.0  4884.0          10.0         1334.0
2018-01-23 -272