# Silver Datasets

## Importando as bibliotecas

In [17]:
import os
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt
import seaborn           as sns

from datetime import datetime

pd.set_option("display.max_columns", None)

## Definição de caminhos e diretórios

In [18]:
raw_path    = "data/raw"
bronze_path = "data/bronze"
silver_path = "data/silver"

## Helper Functions

## Criando os datasets em arquivos .csv

In [19]:
# Grava dataframe em arquivo .csv
def df_to_csv( df, path, mode ):
    with open( path, mode ) as csv_file:
        df.to_csv( csv_file, index = False )

## Calculando os indicadores

### SMA (Simple Moving Average)

In [20]:
# Simple Moving Average 
def SMA(data, ndays, _name): 
    SMA = pd.Series(data['Close'].rolling(ndays).mean(), name = _name) 
    data = data.join(SMA) 
    return data

### EWMA (Exponentially-weighted Moving Average)

In [21]:
# Calculando a EWMA
def EMA(data, col, ndays,_name): 
    EMA = pd.Series(data[col].ewm(span = ndays, min_periods = ndays - 1).mean(), name = _name) 
    data = data.join(EMA) 
    return data

### Bollinger Bands

In [22]:
# Calculando as bandas
def BBANDS(data, window):
    MA = data.Close.rolling(window).mean()
    SD = data.Close.rolling(window).std()
    data['UpperBand'] = MA + (2 * SD) 
    data['LowerBand'] = MA - (2 * SD)
    return data

### RSI (Relative Strength Index)

In [23]:
# Calculando o RSI
def rsi(close, periods = 14):
    
    close_delta = close.diff()

    # Make two series: one for lower closes and one for higher closes
    up = close_delta.clip(lower=0)
    down = -1 * close_delta.clip(upper=0)
    
    ma_up = up.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()
    ma_down = down.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()

    rsi = ma_up / ma_down
    rsi = 100 - (100/(1 + rsi))
    return rsi

### MFI (Money Flow Index)

In [24]:
def gain(x):
    return ((x > 0) * x).sum()


def loss(x):
    return ((x < 0) * x).sum()


# Calculate money flow index
def mfi(high, low, close, volume, n=14):
    typical_price = (high + low + close)/3
    money_flow = typical_price * volume
    mf_sign = np.where(typical_price > typical_price.shift(1), 1, -1)
    signed_mf = money_flow * mf_sign
    mf_avg_gain = signed_mf.rolling(n).apply(gain, raw=True)
    mf_avg_loss = signed_mf.rolling(n).apply(loss, raw=True)
    return (100 - (100 / (1 + (mf_avg_gain / abs(mf_avg_loss))))).to_numpy()


### ATR (Average True Range)

In [25]:
def atr(high, low, close, n=14):
    tr = np.amax(np.vstack(((high - low).to_numpy(), (abs(high - close)).to_numpy(), (abs(low - close)).to_numpy())).T, axis=1)
    return pd.Series(tr).rolling(n).mean().to_numpy()

### FI (Force Index)

In [26]:
def ForceIndex(data, ndays): 
    FI = pd.Series(data['Close'].diff(ndays) * data['Volume'], name = 'ForceIndex') 
    data = data.join(FI) 
    return data

### Ease of Movement

In [27]:
# Ease of Movement
def EMV(data, ndays): 
    dm = ((data['High'] + data['Low'])/2) - ((data['High'].shift(1) + data['Low'].shift(1))/2)
    br = (data['Volume'] / 100000000) / ((data['High'] - data['Low']))
    EMV = dm / br 
    EMV_MA = pd.Series(EMV.rolling(ndays).mean(), name = 'EMV') 
    data = data.join(EMV_MA) 
    return data 

## Carregando os dados para o dataframe df_silver

In [37]:
def silver_datasets_generation():  
    file_list = []
        
    if os.path.exists( bronze_path ):
        file_list = os.listdir( bronze_path )

        for file in file_list[0:1]:
            if '.csv' in file:
                print( f'Processando o arquivo {file}', os.path.getsize(f"{bronze_path}/{file}")/(1024**2), 'Mb')
                try:
                    # loadin the bronze dataset into df_silver
                    df_silver = pd.read_csv( f"{bronze_path}/{file}" )
                    
                    # sorting the df_silver by the open time
                    df_silver.sort_values( "Open_time" )
                                        
                    # converting Open_time and Close_time columns to datetime
                    df_silver[ 'Open_time' ] = df_silver[ 'Open_time' ].apply( lambda x: datetime.fromtimestamp( np.round( x/1000, 0 ) ) )
                    df_silver[ 'Close_time' ] = df_silver[ 'Close_time' ].apply( lambda x: datetime.fromtimestamp( np.round( x/1000, 0 ) ) )
                    
                    # Adding the mean price column
                    df_silver[ "Mean"] = df_silver[["Open","High","Low","Close"]].sum( axis = 1 ) / 4
                    df_silver = df_silver [["Open_time","Open","High","Low","Close","Mean","Volume","Close_time","Quote_asset_volume","Number_of_trades","Taker_buy_base_asset_volume","Taker_buy_quote_asset_volume"]]
                    
                    ########################################################################################################
                    # Including the Technical Indicators
                    ########################################################################################################

                    # Long term SMA (Simple Moving Average)
                    df_silver = SMA(df_silver, 50, 'LT_SMA')

                    # Short term SMA (Simple Moving Average)
                    df_silver = SMA(df_silver, 10, 'ST_SMA')

                    # Long term EWMA (Exponentially Moving Average)
                    df_silver = EMA(df_silver, 'Close', 26, 'LT_EMA')
                    
                    # Short term EWMA (Exponentially Moving Average)
                    df_silver = EMA(df_silver, 'Close', 12, 'ST_EMA')

                    # Subtract the 26-day EMA from the 12-Day EMA to get the MACD
                    df_silver["MACD"] = df_silver["LT_EMA"] - df_silver["ST_EMA"]  

                    # Get the 9-Day EMA of the MACD for the Trigger line
                    df_silver = EMA(df_silver, 'MACD', 9, 'Trigger_Line')                  

                    # Calculate the difference between the MACD - Trigger for the Convergence/Divergence value
                    df_silver["MADC_H"] = df_silver["MACD"] - df_silver["Trigger_Line"]

                    # Boiler bands
                    df_silver = BBANDS(df_silver, 50)

                    # RSI (Relative Strength Index)
                    df_silver["RSI"] = rsi(df_silver["Close"])

                    # MFI (Money Flow index)
                    df_silver["MFI"] = mfi(df_silver["High"], df_silver["Low"], df_silver["Close"], df_silver["Volume"], 14)

                    # ATR (Average True Range)
                    df_silver['ATR'] = atr(df_silver['High'], df_silver['Low'], df_silver['Close'], 14)

                    # Force Index
                    df_silver = ForceIndex(df_silver, 1)

                    # EMV (Ease Movement)
                    df_silver = EMV(df_silver, 14)

                    # Dropando as linhas vazias
                    df_silver.dropna( inplace = True )

                    # reset index 
                    df_silver.reset_index( inplace = True, drop = True )
                                                          

                    # # Grava dados no silver dataset
                    # try:
                    #     if os.path.exists( silver_path ):
                    #         # pass
                    #         df_to_csv( df_silver, f"{silver_path}/{file}", 'a' )
                        
                    #     else:
                    #         os.mkdir( silver_path )
                    #         df_to_csv( df_silver, f"{df_silver}/{file}", 'a' )

                    # except Exception as ex:
                    #     print( f"ERROR: {ex}" )
                    #     pass

                except Exception as ex:
                    print( f"ERROR: {ex}" )                
                
    return df_silver
    
df_silver = silver_datasets_generation()     
df_silver.head(100)

Processando o arquivo ETCUSDT-5m.csv 18.37930393218994 Mb


Unnamed: 0,Open_time,Open,High,Low,Close,Mean,Volume,Close_time,Quote_asset_volume,Number_of_trades,Taker_buy_base_asset_volume,Taker_buy_quote_asset_volume,LT_SMA,ST_SMA,LT_EMA,ST_EMA,MACD,Trigger_Line,MADC_H,UpperBand,LowerBand,RSI,MFI,ATR,ForceIndex,EMV
0,2021-07-01 01:05:00,54.898,55.868,54.884,55.503,55.28825,25583.025,2021-07-01 01:10:00,1.418277e+06,4598,14562.194,8.072054e+05,56.73894,55.3211,55.931792,55.439707,0.492085,0.463672,0.028413,58.744364,54.733516,42.667472,36.282857,0.491143,15503.313150,-157.765445
1,2021-07-01 01:10:00,55.512,55.568,55.278,55.375,55.43325,16363.444,2021-07-01 01:15:00,9.069693e+05,4001,8450.609,4.684709e+05,56.71156,55.2669,55.889717,55.429750,0.459967,0.462929,-0.002962,58.753747,54.669373,41.039831,29.798491,0.476143,-2094.520832,-145.400457
2,2021-07-01 01:15:00,55.376,56.083,55.325,55.865,55.66225,53125.379,2021-07-01 01:20:00,2.967874e+06,4999,30658.298,1.711572e+06,56.70368,55.2802,55.887852,55.496723,0.391130,0.448542,-0.057412,58.756010,54.651350,49.052154,41.179076,0.518214,26031.435710,-134.220162
3,2021-07-01 01:20:00,55.860,56.060,55.770,55.964,55.91350,10279.163,2021-07-01 01:25:00,5.747761e+05,3779,5462.055,3.054853e+05,56.69536,55.3198,55.893590,55.568622,0.324968,0.423789,-0.098820,58.756400,54.634320,50.515323,46.558928,0.485643,1017.637137,-48.177286
4,2021-07-01 01:25:00,55.964,56.000,55.534,55.730,55.80700,12893.999,2021-07-01 01:30:00,7.191106e+05,4425,5504.167,3.072108e+05,56.68058,55.3398,55.881279,55.593452,0.287827,0.396563,-0.108736,58.758773,54.602387,47.074077,42.870843,0.495643,-3017.195766,-74.537082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2021-07-01 09:00:00,55.000,55.088,54.422,54.629,54.78475,33519.362,2021-07-01 09:05:00,1.832364e+06,1801,20189.753,1.103151e+06,53.64264,54.7403,54.176884,54.549466,-0.372583,-0.317140,-0.055442,55.027383,52.257897,57.402381,62.201696,0.470714,-12435.683302,265.489681
96,2021-07-01 09:05:00,54.645,54.813,54.284,54.482,54.55600,11227.706,2021-07-01 09:10:00,6.114436e+05,802,4048.388,2.206234e+05,53.66228,54.7114,54.199485,54.539087,-0.339602,-0.321633,-0.017969,55.066484,52.258076,54.844446,61.578059,0.482000,-1650.472782,222.676394
97,2021-07-01 09:10:00,54.482,54.482,53.868,53.905,54.18425,17609.213,2021-07-01 09:15:00,9.521679e+05,1258,7640.735,4.131042e+05,53.66106,54.6354,54.177671,54.441535,-0.263864,-0.310079,0.046215,55.064293,52.257827,46.151135,55.176869,0.479571,-10160.515901,58.823815
98,2021-07-01 09:15:00,53.887,54.066,53.838,54.033,53.95600,11434.045,2021-07-01 09:20:00,6.167085e+05,916,5893.636,3.179072e+05,53.66330,54.5773,54.166955,54.378684,-0.211729,-0.290409,0.078680,55.068582,52.258018,48.115878,50.585150,0.474857,1463.557760,-2.164206


# EDA

### Relação entre preço de fefachemnto e indicadores

### Relação entre preço de fefachemnto e indicadores




### Relação entre preço médio e indicadores



### Relação entre inversão de comportamento do preço e os indicadores dos a dois. Ocorre cruzamento?



### Estimar quais indicadores melhor indicam inversão de comportamento.

# Featrue Engineering


### Diferença entre LT_SMA e ST_SMA 

### Diferença entre LT_EWMA e ST_EWMA

### sinal da inclinação para o preço e para todos os indicadores.

### Seno e Cosseno dos osciladores

#### gerar três diferentes grupos de dados:
* -5 (compra)   
* 0 (aguarda)   
* 5 (vende) 


# Treinamento do modelo   

### Três estratégias:   
1 - Classificar os dados em Compra, vende ou espera

2 - Prever se o preço sobe ou desce

3 - Prever o preço em análise de curto espaço de tempo.

Para os três casos rodar o Biruta e registrar experimentos no ML Flow.


### Estratégias 1: Classificadores e agrupadores



#### 1 - Kmeans 



#### 2 - DB Scan



#### 3 - KNearst Neighboor



#### 4 - Decision Tree



#### 5 - Random Forest



#### 6 - XGBoost Classifier



### Estratégia 2 Classificadores binários



#### Logístic Regressivo 



#### KNearst Neighboor



#### Decision Tree 



#### Random Forest



#### XGBoost Classifier



### Estratégia 3 Regressão.



#### Regressão linear



#### Regressão Linear Lasso ou Ride



#### Randon Forest



#### XGBosst Regressor

# Avaliação da Performance do modelo



# Finetuning (Hyper parâmetros)



### Random Search



### Ridige Search



### Zielzera Search




# Reavaliação dos modelos.


                          
# RELATÓRIO TOP.