# IMPORTS

In [123]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

# 0. READING CSV + INFOS

In [219]:
def get_series(TICKER, data_dir="/Users/edouardcuny/Desktop/quant/Carmela/data/"):
    '''
    Load the adjusted close prices of one ticker from its CSV file.

    input :
        TICKER : ticker symbol; the file read is <data_dir>/<TICKER>.csv
        data_dir : directory holding the CSV files (defaults to the
                   location historically hard-coded in this notebook)
    output : a Series of the 'Adj Close' column, indexed by the 'Date'
             column and named after the ticker. Cells containing the
             literal string 'null' are read as NaN.
    '''
    # Local import so the cell stays self-contained.
    import os

    path = os.path.join(data_dir, TICKER + ".csv")
    prices = pd.read_csv(path, index_col='Date', dtype={'Adj Close': np.float64}, na_values='null')
    return prices['Adj Close'].rename(TICKER)

In [221]:
def print_info_stock(df):
    '''
    Print a short summary of a price series.

    input : df, a pandas Series (despite the name) indexed by date
    output : nothing is returned; four lines are printed:
        - the smallest index value
        - the largest index value
        - the number of rows
        - the number of missing values
    '''
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print('min date : ' + str(min(df.index)))
    print('max date : ' + str(max(df.index)))
    print('nb dates : ' + str(len(df)))
    print('null     : ' + str(int(df.isnull().sum())))

In [220]:
# Load the 'Adj Close' series of ticker GLE.PA from its CSV file.
GLE = get_series("GLE.PA")

In [222]:
# Print the index range, the row count and the number of missing values.
print_info_stock(GLE)

min date : 2012-01-02
max date : 2017-12-29
nb dates : 1532
null     : 1


In [223]:
# Selects the rows whose price is missing. The result is not assigned and,
# not being the last expression of the cell, it is not displayed either.
GLE[GLE.isnull()]
GLE = GLE[GLE.notnull()] # we choose to drop the rows where the data was broken (no value)

# 1. FEATURE ENGINEERING
**Objective:** predict the cumulative return over the next 5 trading days.

**FEATURES**
- ~~outside to inside BB~~
- ~~adjusted close/SMA~~
- ~~previous adjusted close/SMA~~
- ~~dérivée du adjusted close / SMA~~
- ~~crossed SMA up~~
- ~~crossed SMA down~~
- ~~momentum 1~~
- ~~momentum 5~~
- ~~momentum 10~~

## 1.1 BOLLINGER BANDS

In [244]:
def df_bollinger_features(stock, window_size):
    '''
    Build Bollinger-band and moving-average features from a price series.

    input :
        stock : Series of adjusted close prices ordered in time
        window_size : number of rows in the rolling window
    output : DataFrame on the same index as `stock`, with columns
        - in_BB : 1.0 if the price is strictly between the lower and the
                  upper band (rolling mean -/+ 2 rolling std), else 0.0
        - pr_in_BB : value of in_BB on the previous row
        - out_to_in_BB : 1.0 if the previous row was outside the band and
                         the current row is inside, else 0.0
        - rolling_mean : rolling mean of the price over window_size rows
        - spike : price / rolling_mean
        - pr_spike : value of spike on the previous row
        - spike_derivative : spike - pr_spike
        - crossed_RM_up : 1.0 if spike went from below 1 to above 1
        - crossed_RM_down : 1.0 if spike went from above 1 to below 1

    The flag columns are floats so they can hold NaN: the first
    window_size rows of in_BB, crossed_RM_up and crossed_RM_down are NaN,
    and the first window_size + 1 rows of out_to_in_BB are NaN.
    '''

    # BOLLINGER BANDS
    rolling_mean = stock.rolling(window=window_size).mean()
    rolling_std = stock.rolling(window=window_size).std()
    upper_bb = rolling_mean + 2*rolling_std
    lower_bb = rolling_mean - 2*rolling_std

    # Visual check, uncomment to plot the first 100 rows with their bands:
    # ax = stock[:100].plot()
    # rolling_mean[:100].plot(ax=ax)
    # upper_bb[:100].plot(ax=ax, color='c')
    # lower_bb[:100].plot(ax=ax, color='c')
    # plt.show()

    # inside BB
    # Cast to float before masking: writing NaN into a boolean Series is an
    # incompatible-dtype assignment. np.nan is used because the np.NaN
    # alias no longer exists in NumPy 2.
    in_BB = ((stock < upper_bb) & (stock > lower_bb)).astype(float)
    in_BB.iloc[:window_size] = np.nan

    # previous inside BB
    pr_in_BB = in_BB.shift(1)

    # outside to inside BB (comparisons against NaN are False)
    out_to_in_BB = ((pr_in_BB == 0) & (in_BB == 1)).astype(float)
    out_to_in_BB.iloc[:window_size + 1] = np.nan

    # Adjusted Close / SMA
    spike = stock/rolling_mean
    pr_spike = spike.shift(1)
    spike_derivative = spike - pr_spike
    crossed_RM_up = ((pr_spike < 1) & (spike > 1)).astype(float)
    crossed_RM_down = ((pr_spike > 1) & (spike < 1)).astype(float)
    crossed_RM_up.iloc[:window_size] = np.nan
    crossed_RM_down.iloc[:window_size] = np.nan

    # Name every feature and assemble them column-wise, in a fixed order.
    named_features = [
        ('in_BB', in_BB),
        ('pr_in_BB', pr_in_BB),
        ('out_to_in_BB', out_to_in_BB),
        ('rolling_mean', rolling_mean),
        ('spike', spike),
        ('pr_spike', pr_spike),
        ('spike_derivative', spike_derivative),
        ('crossed_RM_up', crossed_RM_up),
        ('crossed_RM_down', crossed_RM_down),
    ]
    stock_df = pd.concat([series.rename(name) for name, series in named_features], axis=1)
    return stock_df

## 1.2 MOMENTUM

In [259]:
def df_momentum(stock):
    '''
    Compute trailing percentage returns of a price series.

    input : stock, a Series of adjusted close prices ordered in time
    output : DataFrame with columns
        - mom_1 : % return over the last trading day
        - mom_5 : % return over the last 5 trading days
        - mom_10 : % return over the last 10 trading days
    '''
    # One column per look-back horizon, each named mom_<horizon>.
    momentum_columns = [
        ((stock/stock.shift(lookback) - 1)*100).rename('mom_%d' % lookback)
        for lookback in (1, 5, 10)
    ]
    return pd.concat(momentum_columns, axis=1)
    

## 1.3 Y
We want to predict the cumulative return 5 days from now.

In [268]:
def y(stock):
    '''
    Compute the prediction target: the forward 5-row percentage return.

    input : stock, a Series of adjusted close prices ordered in time
    output : Series named 'y' holding, for each row, the % return between
             that row and the row 5 positions later (NaN on the last 5 rows)
    '''
    horizon = 5
    future_price = stock.shift(-horizon)
    forward_return = (future_price/stock - 1)*100
    return forward_return.rename('y')

In [None]:
# Price, Bollinger/SMA features (10-row window), momentums and target,
# joined column-wise on the date index.
GLE_df = pd.concat(
    [
        GLE,
        df_bollinger_features(GLE, 10),
        df_momentum(GLE),
        y(GLE),
    ],
    axis=1,
)

In [270]:
# Show the last 10 rows of the assembled feature table.
GLE_df.tail(10)

Unnamed: 0_level_0,GLE.PA,in_BB,pr_in_BB,out_to_in_BB,rolling_mean,spike,pr_spike,spike_derivative,crossed_RM_up,crossed_RM_down,mom_1,mom_5,mom_10,y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2017-12-14,44.025002,1.0,1.0,0.0,43.396001,1.014494,1.030434,-0.01594,0.0,0.0,-1.156262,2.502913,4.053418,-0.51108
2017-12-15,43.195,1.0,1.0,0.0,43.519501,0.992544,1.014494,-0.021951,0.0,1.0,-1.885297,-1.639533,2.943282,0.601926
2017-12-18,43.32,1.0,1.0,0.0,43.556501,0.99457,0.992544,0.002027,0.0,0.0,0.289385,-1.422232,0.861464,-0.11542
2017-12-19,43.700001,1.0,1.0,0.0,43.653501,1.001065,0.99457,0.006495,1.0,0.0,0.877195,-1.863908,2.27007,-1.395883
2017-12-20,43.369999,1.0,1.0,0.0,43.749,0.991337,1.001065,-0.009728,0.0,1.0,-0.755153,-2.626857,2.251557,-0.737837
2017-12-21,43.799999,1.0,1.0,0.0,43.834,0.999224,0.991337,0.007887,0.0,0.0,0.991469,-0.51108,1.979041,
2017-12-22,43.455002,1.0,1.0,0.0,43.788,0.992395,0.999224,-0.006829,0.0,0.0,-0.787664,0.601926,-1.047476,
2017-12-27,43.27,1.0,1.0,0.0,43.7205,0.989696,0.992395,-0.002699,0.0,0.0,-0.425732,-0.11542,-1.536011,
2017-12-28,43.09,1.0,1.0,0.0,43.5765,0.988836,0.989696,-0.00086,0.0,0.0,-0.415993,-1.395883,-3.233773,
2017-12-29,43.049999,1.0,1.0,0.0,43.4275,0.991307,0.988836,0.002472,0.0,0.0,-0.092831,-0.737837,-3.345312,


# 2. ML

- ne garder que les bonnes colonnes
- normaliser les colonnes
- faire tourner un petit algo gentil
- dégager les NaN (qui ne sont autres que les premières lignes et dernières lignes à ce stade là)

## 2.1 PREPARING THE DF

In [285]:
df = GLE_df.copy()

# dropping the first column and 'rolling_mean'
df = df.drop([df.columns[0], 'rolling_mean'], axis=1)

# removing every row that contains a NaN
df = df.dropna()

# splitting X and Y: the last column is the target, the others are features.
# X is an explicit copy so it is never a view on df.
X = df.iloc[:, :-1].copy()
Y = df.iloc[:, -1]

# train & test: the first `split` share of the rows is the training set
split = 0.7
n = int(split*df.shape[0])
X_train = X.iloc[:n, :].copy()
X_test = X.iloc[n:, :].copy()
Y_train = Y.iloc[:n]
Y_test = Y.iloc[n:]

# features rescaling
# The scaler is fitted on the training rows only and then applied to both
# sets, so no statistic of the test rows leaks into the features.
# Note: X itself is left unscaled; only X_train and X_test are standardized.
# `scale` is still imported in case a later cell relies on the name.
from sklearn.preprocessing import StandardScaler, scale
continuous_columns = ['spike', 'pr_spike', 'spike_derivative', 'mom_1', 'mom_5', 'mom_10']
scaler = StandardScaler().fit(X_train[continuous_columns])
X_train[continuous_columns] = scaler.transform(X_train[continuous_columns])
X_test[continuous_columns] = scaler.transform(X_test[continuous_columns])

# Shape check, uncomment to print the sizes of the four pieces:
# print(X_train.shape)
# print(X_test.shape)
# print(Y_train.shape)
# print(Y_test.shape)

(1060, 11)
(455, 11)
(1060,)
(455,)


## 2.2 RUNNING THE ALGORITHM

In [287]:
from sklearn.ensemble import RandomForestRegressor

# random_state is fixed so that re-running the cell gives the same forest.
clf = RandomForestRegressor(verbose=10, random_state=0)
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)

# The predictions come back as a bare array. They are given the index of
# Y_test so that concat puts each prediction on the row of its real value;
# with a default 0..n-1 index the two columns would not line up.
comparison = pd.concat([pd.Series(Y_pred, index=Y_test.index), Y_test], axis=1)
comparison.columns = ['pred', 'real']

building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapse

In [289]:
# Display the array of predictions made on X_test.
Y_pred

array([  3.62990044e-01,  -1.54169058e+00,   1.03113203e+00,
        -4.87425077e+00,  -1.08253333e-01,   3.21478679e+00,
        -1.30743183e+00,  -2.37448147e+00,   1.83303432e+00,
        -2.20474015e-01,   3.84686523e-01,  -4.75889363e-01,
         3.74358449e-02,  -4.24248376e+00,  -2.51092112e+00,
         2.34127553e+00,  -1.32577569e+00,  -2.00250791e-01,
        -4.21956111e-01,   1.64117883e+00,  -3.66931451e-01,
        -6.51064555e-01,   9.14518195e-01,   2.91800681e+00,
        -5.91331585e-01,   9.53219437e-01,   6.56858387e-01,
        -2.68672287e+00,  -2.81396249e-01,  -1.13026610e+00,
         1.35131460e+00,  -1.49131487e+00,   3.00424856e+00,
        -1.52675476e+00,   5.97954491e-01,   2.55097883e+00,
         1.99852749e+00,   5.43795082e-01,   8.10302405e-01,
         2.70336061e+00,   1.77988480e+00,   8.98753796e-01,
        -9.04061264e-01,   8.92615708e-01,   1.22700586e+00,
         3.84182946e-01,   2.52051753e+00,   4.25555627e+00,
        -7.00661894e-02,

# résultats

- plot du return vs expected return
- % du temps où le directionnel est bon
- print de 50 résultats 