Este notebook es donde se codifica la estrategia de trading 1 y se comprueba que fiabilidad tiene a la hora de obtener beneficios

In [1]:
import sys
import warnings
import pickle
import os
from datetime import datetime
from dateutil.relativedelta import relativedelta
from multiprocessing import Pool

import pandas as pd
import numpy as np
import numba as nb
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

sns.set()
warnings.filterwarnings("ignore")

app_path = '/Users/esanc147/Documents/business/bsm03/web_app'
if app_path in sys.path:
    from tools.tags import create_tags
else: 
    sys.path.append('/Users/esanc147/Documents/business/bsm03/web_app')
    from tools.tags import create_tags

## Valores por defecto

In [2]:
COLUMNS_TECH = ['symbol', 'date', 'MACD_Signal', 'MACD_Hist', 'MACD', 'SlowK', 'SlowD',
       'Chaikin A/D', 'OBV', 'RSI21', 'ADX21',
       'CCI21', 'Aroon Up21', 'Aroon Down21',
       'RSI28', 'ADX28', 'CCI28', 'Aroon Down28', 'Aroon Up28',
       'Real Lower Band28', 'Real Upper Band28', 'Real Middle Band28',
       'SMA50', 'RSI50', 'ADX50', 'CCI50', 'Aroon Up50',
       'Aroon Down50']
COLUMNS = ['symbol', 'date', 'close', 'volume', 'open', 'high', 'low']
U_COLUMNS = ['close', 'volume', 'MACD_Signal', 'MACD_Hist', 'MACD', 'SlowK', 'SlowD',
             'Chaikin A/D', 'OBV', 'RSI21', 'ADX21', 'CCI21', 'Aroon Up21', 'Aroon Down21',
             'RSI28', 'ADX28', 'CCI28', 'Aroon Down28', 'Aroon Up28', 'Real Lower Band28',
             'Real Upper Band28', 'Real Middle Band28', 'SMA50', 'RSI50', 'ADX50', 'CCI50',
             'Aroon Up50', 'Aroon Down50']
TAG_COLUMNS = ['tag_7', 'tag_14', 'tag_21', 'tag_28']
FULL_PATH = "/Users/esanc147/Documents/business/bsm03/web_app/data"
SYMBOLS = [s.split('.csv')[0] for s in os.listdir(f"{FULL_PATH}/tech/") if '.L' not in s]
PERIOD = [7, 14, 21, 28]

## Carga de los datos

In [3]:
symbols = SYMBOLS

total_dataframes = list()

i = 1
for symbol in symbols:
    if (i % 1000) == 0:
        print("+1000 symbols loaded")
    path_close = f"{FULL_PATH}/close/{symbol}.csv"
    df_close = pd.read_csv(path_close, names=COLUMNS)
    df_close['date'] = pd.to_datetime(df_close['date'])
    df_close['volume'] = df_close['volume'].astype(float)

    path_tech = f"{FULL_PATH}/tech/{symbol}.csv"
    df_tech = pd.read_csv(path_tech, names=COLUMNS_TECH)
    df_tech['date'] = pd.to_datetime(df_tech['date'])

    list_df_tagged = []
    for period in PERIOD:
        df_aux = create_tags(df_close, period)
        df_aux[f"pct_change_{period}"] = df_aux[f"pct_change_{period}"].astype(float)
        df_aux[f"pct_change_{period}"] = df_aux[f"pct_change_{period}"].astype(float)
        list_df_tagged.append(df_aux)
    df_tagged = pd.concat(list_df_tagged, axis=1)
    df_tagged.dropna(inplace=True)

    df_close = df_close.set_index(['symbol', 'date'])
    df_tech = df_tech.set_index(['symbol', 'date'])
    dataframe = pd.concat([df_close, df_tech, df_tagged], join='inner', axis=1)
    total_dataframes.append(dataframe)
    i += 1
tot_dataframe = pd.concat(total_dataframes)

+1000 symbols loaded
+1000 symbols loaded
+1000 symbols loaded
+1000 symbols loaded
+1000 symbols loaded
+1000 symbols loaded


ValueError: cannot set a frame with no defined index and a scalar

In [5]:
tot_dataframe = pd.concat(total_dataframes)

In [6]:
dataframe_reset = tot_dataframe.reset_index()
dataframe = dataframe_reset[(dataframe_reset['date'].dt.year > 2019)] \
                .set_index(['symbol', 'date']) \
                .sort_values(by='date', ascending=False)
df = dataframe_reset[(dataframe_reset['date'].dt.year > 2019) &
                     (dataframe_reset['date'].dt.month <= 6)] \
                .set_index(['symbol', 'date']) \
                .sort_values(by='date', ascending=False)

## Creación de las medias de las características

In [9]:
list_df_agg = list()
for day in PERIOD:
    path_rf = f"{FULL_PATH}/model/rf_{day}.pkl"
    path_rs = f"{FULL_PATH}/model/robust_scaler_{day}.pkl"
#    path_lg = f"{FULL_PATH}/model/lgbm_{day}.pkl"
    path_pca = f"{FULL_PATH}/model/pca_{day}.pkl"
    
#    clf_lg = pickle.load(open(path_lg, 'rb'))
    clf_rf = pickle.load(open(path_rf, 'rb'))
    scl = pickle.load(open(path_rs, 'rb'))
    pca = pickle.load(open(path_pca, 'rb'))
    
    X = df[U_COLUMNS].values
    y = df[f"tag_{day}"].values
    y = np.where(y == 'outlier bear', 'strong bear', y)
    y = np.where(y == 'outlier bull', 'strong bull', y)
    X_scl = scl.transform(X)
    y_pred_rf = clf_rf.predict(X_scl)
#    y_pred_lg = clf_lg.predict(X_scl)
    X_pca = pca.transform(X_scl)
    df_aux = pd.DataFrame(X_pca, columns=[f'PC {elem}' for elem in range(1, 5)], index=df.index)
    df_aux['tag_y_rf'] = f'{day} - rf' + ' - ' + y_pred_rf + ' - ' + y
#    df_aux['tag_y_lg'] = f'{day} - lg' + ' - ' + y_pred_lg + ' - ' + y
    df_mean_rf = df_aux.groupby('tag_y_rf', as_index=False).median()
#    df_mean_lg = df_aux.groupby('tag_y_lg', as_index=False).median()
    df_mean_rf = df_mean_rf.rename(columns={'tag_y_rf': 'tag_y'})
#    df_mean_lg = df_mean_lg.rename(columns={'tag_y_lg': 'tag_y'})
    list_df_agg.append(df_mean_rf)
#    list_df_agg.append(df_mean_lg)
    

df_agg = pd.concat(list_df_agg, ignore_index=True)

In [10]:
df_agg.to_csv(f"{FULL_PATH}/group/2020_1.csv", index=False, header=False)

In [11]:
df_agg.columns

Index(['tag_y', 'PC 1', 'PC 2', 'PC 3', 'PC 4'], dtype='object')

In [12]:
df_agg['tag_y'].value_counts()

21 - rf - bull - bear           1
21 - rf - strong bear - bear    1
14 - rf - strong bear - bull    1
28 - rf - strong bull - bear    1
14 - rf - bull - strong bear    1
                               ..
28 - rf - bull - strong bear    1
14 - rf - keep - strong bull    1
14 - rf - bull - strong bull    1
7 - rf - bull - strong bull     1
21 - rf - bear - keep           1
Name: tag_y, Length: 100, dtype: int64

## Creación del set de datos para la estrategia

In [16]:
list_df_strategy = list()
for day in PERIOD:
    path_rf = f"{FULL_PATH}/model/rf_{day}.pkl"
    path_rs = f"{FULL_PATH}/model/robust_scaler_{day}.pkl"
#    path_lg = f"{FULL_PATH}/model/lgbm_{day}.pkl"
    path_pca = f"{FULL_PATH}/model/pca_{day}.pkl"
    
#    clf_lg = pickle.load(open(path_lg, 'rb'))
    clf_rf = pickle.load(open(path_rf, 'rb'))
    scl = pickle.load(open(path_rs, 'rb'))
    pca = pickle.load(open(path_pca, 'rb'))
    
    X = dataframe[U_COLUMNS].values
    y = dataframe[f"tag_{day}"].values
    y = np.where(y == 'outlier bear', 'strong bear', y)
    y = np.where(y == 'outlier bull', 'strong bull', y)
    X_scl = scl.transform(X)
    y_pred_rf = clf_rf.predict(X_scl)
#    y_pred_lg = clf_lg.predict(X_scl)
    X_pca = pca.transform(X_scl)
    
    df_aux = pd.DataFrame(X_pca, columns=[f'PC {elem}' for elem in range(1, 5)], index=dataframe.index)
    df_aux[f'rf_tag_y_{day}'] = y_pred_rf + ' - ' + y
#    df_aux[f'lg_tag_y_{day}'] = y_pred_lg + ' - ' + y
    list_df_strategy.append(df_aux.reset_index().sort_values(by='date', ascending=True))

In [18]:
dict_rf = dict()
dict_lg = dict()

for idx, day in enumerate(PERIOD):
    df = list_df_strategy[idx].reset_index(drop=True)
    df_rf = df[df[f'rf_tag_y_{day}'].str.startswith('strong')]
#    df_lg = df[df[f'lg_tag_y_{day}'].str.startswith('strong')]
    for idx_df in range(0, df.shape[0]):
        pred_rf = df.loc[idx_df][f'rf_tag_y_{day}'].split(' - ')[0]
        clas_rf = df.loc[idx_df][f'rf_tag_y_{day}'].split(' - ')[1]
#        pred_lg = df.loc[idx_df][f'lg_tag_y_{day}'].split(' - ')[0]
#        clas_lg = df.loc[idx_df][f'lg_tag_y_{day}'].split(' - ')[1]
        df_aux_agg_rf = df_agg[df_agg['tag_y'].str.startswith(f'{day} - rf - {pred_rf}')]
        if df_aux_agg_rf.shape[0] == 0:
            continue
#        df_aux_agg_lg = df_agg[df_agg['tag_y'].str.startswith(f'{day} - lg - {pred_lg}')]
        symbol = df.loc[idx_df]['symbol']
        date = df.loc[idx_df]['date']
        pc = df.loc[idx_df][['PC 1', 'PC 2', 'PC 3', 'PC 4']]
#        cos_sim_lg = cosine_similarity(pc.values.reshape(1, -1), df_aux_agg_lg.select_dtypes(float))
        cos_sim_rf = cosine_similarity(pc.values.reshape(1, -1), df_aux_agg_rf.select_dtypes(float))
        idx_max_rf = np.argmax(abs(cos_sim_rf))
#        idx_max_lg = np.argmax(abs(cos_sim_lg))
        norm_rf = df_aux_agg_rf.reset_index(drop=True).loc[idx_max_rf]['tag_y'].rsplit(' - ', 1)[-1]
#        norm_lg = df_aux_agg_lg.reset_index(drop=True).loc[idx_max_lg]['tag_y'].rsplit(' - ', 1)[-1]
        if pred_rf.startswith('strong'):
            if pred_rf == norm_rf:
                if day not in dict_rf.keys():
                    dict_rf[day] = list()
                    dict_rf[day].append({'symbol': symbol,
                                         'date': date,
                                         'count': 1,
                                         'success': 1 if pred_rf == clas_rf else 0,
                                         'pred': pred_rf,
                                         'class': clas_rf})
                else:
                    dict_rf[day].append({'symbol': symbol,
                                         'date': date,
                                         'count': 1,
                                         'success': 1 if pred_rf == clas_rf else 0,
                                         'pred': pred_rf,
                                         'class': clas_rf})

#        if pred_lg.startswith('strong'):
#            if pred_lg == norm_lg:
#                if day not in dict_lg.keys():
#                    dict_lg[day] = list()
#                    dict_lg[day].append({'symbol': symbol,
#                                         'date': date,
#                                         'count': 1,
#                                         'success': 1 if pred_lg == clas_lg else 0})
#                else:
#                    dict_lg[day].append({'symbol': symbol,
#                                         'date': date,
#                                         'count': 1,
#                                         'success': 1 if pred_lg == clas_lg else 0})


In [46]:
day = 21

In [47]:
(18281 + 3711) / (2527+3711+1583+17445+18281)

0.5050175672262154

In [54]:
pd.DataFrame(dict_rf[day]).groupby(['pred', 'class'], as_index=False).sum()

Unnamed: 0,pred,class,count,success
0,strong bear,bear,566,0
1,strong bear,bull,1211,0
2,strong bear,keep,426,0
3,strong bear,strong bear,2674,2674
4,strong bear,strong bull,1569,0
5,strong bull,bear,5416,0
6,strong bull,bull,6145,0
7,strong bull,keep,3557,0
8,strong bull,strong bear,16393,0
9,strong bull,strong bull,29281,29281


In [49]:
pd.DataFrame(dict_rf[day]).groupby(['pred'], as_index=False).sum()

Unnamed: 0,pred,count,success
0,strong bear,6446,2674
1,strong bull,60792,29281


In [60]:
(2674+29281+566+6145)/(60792+6446)

0.5750617210505964

In [58]:
(29281+6145)/(5416 + 6145 + 3557 + 16393 + 29281)

0.5827411501513357

In [50]:
round((pd.DataFrame(dict_rf[day])['success'].sum() / pd.DataFrame(dict_rf[day]).shape[0]) * 100, 2)

47.53

In [34]:
# round((pd.DataFrame(dict_lg[day])['success'].sum() / pd.DataFrame(dict_lg[day]).shape[0]) * 100, 2)

In [65]:
aux = pd.DataFrame(dict_rf[day])
aux_a = aux.groupby(by=[aux['date'].dt.month, 'pred', 'class'], as_index=False)[['count', 'success']].sum()
print(aux_a)
round((aux_a['success'] / aux_a['count']) * 100, 2)

           pred        class  count  success
0   strong bear         bear    213        0
1   strong bear         bull    234        0
2   strong bear         keep    102        0
3   strong bear  strong bear    744      744
4   strong bear  strong bull    391        0
5   strong bull         bear    357        0
6   strong bull         bull    273        0
7   strong bull         keep    187        0
8   strong bull  strong bear   1202        0
9   strong bull  strong bull    368      368
10  strong bear         bear     46        0
11  strong bear         bull     34        0
12  strong bear         keep     20        0
13  strong bear  strong bear   1424     1424
14  strong bear  strong bull     52        0
15  strong bull         bear    351        0
16  strong bull         bull    117        0
17  strong bull         keep    126        0
18  strong bull  strong bear   5341        0
19  strong bull  strong bull    189      189
20  strong bear         bear     71        0
21  strong

0       0.0
1       0.0
2       0.0
3     100.0
4       0.0
5       0.0
6       0.0
7       0.0
8       0.0
9     100.0
10      0.0
11      0.0
12      0.0
13    100.0
14      0.0
15      0.0
16      0.0
17      0.0
18      0.0
19    100.0
20      0.0
21      0.0
22      0.0
23    100.0
24      0.0
25      0.0
26      0.0
27      0.0
28      0.0
29    100.0
30      0.0
31      0.0
32      0.0
33    100.0
34      0.0
35      0.0
36      0.0
37      0.0
38      0.0
39    100.0
40      0.0
41      0.0
42      0.0
43    100.0
44      0.0
45      0.0
46      0.0
47      0.0
48      0.0
49    100.0
50      0.0
51      0.0
52      0.0
53    100.0
54      0.0
55      0.0
56      0.0
57      0.0
58      0.0
59    100.0
dtype: float64

In [66]:
aux = pd.DataFrame(dict_rf[day])
aux_a = aux.groupby(by=[aux['date'].dt.month], as_index=False)[['count', 'success']].sum()
print(aux_a)
round((aux_a['success'] / aux_a['count']) * 100, 2)

   count  success
0   4071     1112
1   7700     1613
2  40895    21826
3   9437     5106
4   5037     2290
5     98        8


0    27.32
1    20.95
2    53.37
3    54.11
4    45.46
5     8.16
dtype: float64

In [25]:
aux = pd.DataFrame(dict_lg[day])
aux_a = aux.groupby(by=aux['date'].dt.month)[['count', 'success']].sum()
print(aux_a)
round((aux_a['success'] / aux_a['count']) * 100, 2)

KeyError: 7