In [None]:
# Ideas :
    # Do not delete data from the end / beginning of the dataset because of missing data (ex : 30 days EMA only available at row 30). 
        # => Do it at the end if extrapolation is not applicable
    # Import old prices for crypto (even if it's only close_price, to calculate indicators) ? => Avoids deleting data
    
    # When data import is OK, do data visualization to improve feature choice / creation
    
    # Do interpolation / extrapolation and ensure that all data are coherent

In [None]:
# TODO : To finish the data processing phase
    # 1 / Price history => Merge df + recalculate (ongoing - resample KO) : OK
    # 2 / Extrapolate missing data in both directions (some columns could have benefited from 1d data (price 24h, etc.))
    # 3 / Reformat code etc. => No TODO left, clean up, export functions to utils_csa
    # ============> Dataset OK
    # 4 / TESTING PHASE :
        # 4.1 / Check describe() on different columns and see if it is OK
        # 4.2 / Plot prices, volumes, indicators, etc. All columns, to check that data are coherent

# Then : 
    # Machine learning : 
        # Data visualization, correlations, etc. 
        # What to look for, etc.

In [1]:
from utils_csa import show_model_accuracy, remove_outliers

import numpy as np
import pandas.io.sql as psql
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

from sqlalchemy import create_engine

from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

from pytz import timezone
from datetime import datetime, timedelta, date

from sklearn.model_selection import train_test_split

#import talib # https://github.com/mrjbq7/ta-lib    -    https://mrjbq7.github.io/ta-lib/
from talib.abstract import *

from scipy import stats

# Price columns passed to the outlier-removal step of the OHLCV cleaning functions
# (callers append 'volume_aggregated_1h' to this list when the volume must be cleaned too).
ohlcv_columns_to_be_cleaned = ['close_price', 'open_price', 'low_price', 'high_price']

In [37]:
def do_timestamp_tasks(df_ts):
    """Deduplicate on `timestamp`, convert it to UTC datetimes and use it as index.

    Parameters
    ----------
    df_ts : DataFrame with a `timestamp` column. It is not modified.

    Returns
    -------
    A new DataFrame keeping the first row of each distinct timestamp, indexed
    by the UTC-aware `timestamp` (the column itself is moved into the index).
    """
    first_occurrences = df_ts.drop_duplicates(subset='timestamp', keep='first')
    utc_timestamps = pd.to_datetime(first_occurrences['timestamp'], utc=True)
    return first_occurrences.assign(timestamp=utc_timestamps).set_index('timestamp')

def get_dataset_ohlcv(connection, id_cryptocompare):
    """Load the OHLCV rows of one crypto from `histo_ohlcv`, oldest first.

    Parameters
    ----------
    connection : connection / engine handed to `psql.read_sql_query`.
    id_cryptocompare : str, id concatenated as-is into the `where` clause.

    Returns
    -------
    Whatever `psql.read_sql_query` returns for the query; the selected columns
    are open_price, high_price, low_price, close_price, volume_aggregated_1h
    (alias of `volume_aggregated`) and timestamp.
    """
    query_lines = [
        "select oh.open_price, oh.high_price, oh.low_price, oh.close_price, oh.volume_aggregated as volume_aggregated_1h, oh.timestamp",
        'from histo_ohlcv oh',
        'where oh.id_cryptocompare = ' + id_cryptocompare,
        'order by oh.timestamp asc',
    ]
    # Every line, including the last one, is terminated by a newline.
    return psql.read_sql_query('\n'.join(query_lines) + '\n', connection)

def get_dataset_reddit(connection, id_cryptocompare):
    """Load the reddit subscriber history of one crypto, oldest first.

    The query truncates each timestamp to the day (`date_trunc('day', ...)`).

    Parameters
    ----------
    connection : connection / engine handed to `psql.read_sql_query`.
    id_cryptocompare : str, id concatenated as-is into the `where` clause.

    Returns
    -------
    Whatever `psql.read_sql_query` returns; selected columns are
    reddit_subscribers and timestamp.
    """
    query_lines = [
        "select re.reddit_subscribers, date_trunc('day', re.timestamp) + '00:00:00' as timestamp",
        'from social_stats_reddit_histo re',
        'where re.id_cryptocompare = ' + id_cryptocompare,
        'order by re.timestamp asc',
    ]
    return psql.read_sql_query('\n'.join(query_lines) + '\n', connection)

def get_dataset_all_cryptos(connection):
    """Load per-timestamp aggregates computed over every crypto in `histo_ohlcv`.

    Parameters
    ----------
    connection : connection / engine handed to `psql.read_sql_query`.

    Returns
    -------
    Whatever `psql.read_sql_query` returns; selected columns are
    global_volume_usd_1h (sum of close_price * volume_aggregated),
    global_market_cap_usd (sum of close_price * available_supply) and
    timestamp, grouped and ordered by timestamp.
    """
    query_lines = (
        'select sum(hi.close_price * hi.volume_aggregated) as global_volume_usd_1h, sum(hi.close_price * pr.available_supply) as global_market_cap_usd, hi.timestamp',
        'from histo_ohlcv hi',
        'inner join coins co on (hi.id_cryptocompare = co.id_cryptocompare)',
        'left outer join prices pr on (pr.id_cryptocompare = hi.id_cryptocompare)',
        'group by timestamp',
        'order by timestamp',
    )
    # No trailing newline after the last clause.
    return psql.read_sql_query('\n'.join(query_lines), connection)

def get_dataset_google_trend(connection, id_cryptocompare, period):
    """Load the Google Trends values of one crypto, ordered by timestamp.

    Parameters
    ----------
    connection : connection / engine handed to `psql.read_sql_query`.
    id_cryptocompare : str, id concatenated as-is into the `where` clause.
    period : str, suffix appended to the table name `social_google_trend`
        (the notebook passes '_1m' or '').

    Returns
    -------
    Whatever `psql.read_sql_query` returns; selected columns are
    value_standalone, value_compared_to_standard and timestamp.
    """
    table_name = 'social_google_trend' + period
    query_lines = (
        'select value_standalone, value_compared_to_standard, timestamp',
        'from ' + table_name,
        'where id_cryptocompare = ' + id_cryptocompare,
        'order by timestamp',
    )
    # No trailing newline after the last clause.
    return psql.read_sql_query('\n'.join(query_lines), connection)

def get_dataset_ohlcv_old(connection, id_cryptocompare, before_date):
    """Load the 60 most recent `histo_ohlcv_old` rows strictly before a date.

    Parameters
    ----------
    connection : connection / engine handed to `psql.read_sql_query`.
    id_cryptocompare : str, id concatenated as-is into the `where` clause.
    before_date : upper bound (exclusive); `str(before_date)` is embedded
        between single quotes in the query.

    Returns
    -------
    Whatever `psql.read_sql_query` returns; rows come newest first (the query
    orders by timestamp descending) and `volume_usd` is exposed as
    volume_aggregated_1h.
    """
    query_lines = [
        "select oh.open_price, oh.high_price, oh.low_price, oh.close_price, oh.volume_usd as volume_aggregated_1h, oh.timestamp",
        'from histo_ohlcv_old oh',
        'where oh.id_cryptocompare = ' + id_cryptocompare,
        "and oh.timestamp < '" + str(before_date) + "'",
        'order by oh.timestamp desc',
        'limit 60',
    ]
    return psql.read_sql_query('\n'.join(query_lines) + '\n', connection)

def get_ohlcv_1d_plus_missing_infos(df_ohlcv_p, id_cryptocompare):
    """Build a daily OHLCV frame extended backwards with older daily prices.

    The hourly frame is resampled to 1D, the older rows that precede its first
    timestamp are fetched from `histo_ohlcv_old`, and both are concatenated
    (older rows first).

    Parameters
    ----------
    df_ohlcv_p : hourly OHLCV frame indexed by timestamp.
    id_cryptocompare : str, id of the crypto whose older prices are loaded.

    Returns
    -------
    Daily DataFrame; when an index value appears in both parts, the first
    occurrence (the older dataset) is kept.

    Notes
    -----
    Reads the module-level `connection` instead of taking it as a parameter.
    """
    daily_aggregation = {'open_price': 'first', 'high_price': 'max', 'low_price': 'min',
                         'close_price': 'last', 'volume_aggregated_1h': 'sum'}
    columns_to_clean = ohlcv_columns_to_be_cleaned + ['volume_aggregated_1h']

    # older prices, cleaned then aggregated on a daily grid
    history_daily = get_dataset_ohlcv_old(connection, id_cryptocompare, df_ohlcv_p.index.min())
    history_daily = clean_dataset_ohlcv_std(history_daily, columns_to_clean, resample='1D')
    history_daily = history_daily.resample("1D").agg(daily_aggregation)

    # hourly prices aggregated on the same daily grid
    recent_daily = df_ohlcv_p.resample("1D").agg(daily_aggregation)

    # quick & dirty way to get coherent volumes between both datasets:
    # rescale the older volumes by the ratio of the mean volumes around the junction
    volume_ratio = (history_daily.tail(5).volume_aggregated_1h.mean()
                    / recent_daily.head(5).volume_aggregated_1h.mean())
    history_daily.volume_aggregated_1h = history_daily.volume_aggregated_1h / volume_ratio

    combined = pd.concat([history_daily, recent_daily])
    return combined[~combined.index.duplicated()]

def clean_dataset_google_trend(df_google_trend_p):
    """Put Google Trends values on an hourly grid and cast them to integers.

    Parameters
    ----------
    df_google_trend_p : frame with `timestamp`, `value_standalone` and
        `value_compared_to_standard` columns.

    Returns
    -------
    DataFrame indexed by timestamp, resampled to 1H with interpolated gaps,
    both value columns cast with `astype(int)`.
    """
    hourly = do_timestamp_tasks(df_google_trend_p).resample('1H').interpolate()
    return hourly.astype({'value_standalone': int, 'value_compared_to_standard': int})

def clean_dataset_ohlcv_spe(df_ohlcv_p):
    """Drop rows with a zero OHLCV value, then apply the standard cleaning.

    Parameters
    ----------
    df_ohlcv_p : raw OHLCV frame with open_price, high_price, low_price,
        close_price, volume_aggregated_1h and timestamp columns.

    Returns
    -------
    The result of `clean_dataset_ohlcv_std` applied to the remaining rows,
    with the price columns and the volume passed as columns to clean.
    """
    # a 0.0 in any of these columns marks a row with missing values (OHLCV)
    checked_columns = ['open_price', 'high_price', 'low_price', 'close_price', 'volume_aggregated_1h']
    row_is_complete = (df_ohlcv_p[checked_columns] != 0.0).all(axis=1)
    complete_rows = df_ohlcv_p.loc[row_is_complete]

    # TODO : KO, not only close_price...
    return clean_dataset_ohlcv_std(complete_rows, ohlcv_columns_to_be_cleaned + ['volume_aggregated_1h'])

def clean_dataset_ohlcv_std(df_ohlcv_p, columns_name, do_ts_tasks=True, resample='1H'):
    """Standard cleaning: timestamp index, outlier removal, regular time grid.

    Parameters
    ----------
    df_ohlcv_p : frame to clean.
    columns_name : columns handed to `remove_outliers`.
    do_ts_tasks : when truthy, `do_timestamp_tasks` is applied first
        (deduplicated UTC timestamp index); otherwise the frame is used as is.
    resample : frequency string of the target grid (default '1H').

    Returns
    -------
    The frame after outlier removal, resampled to `resample` and interpolated.
    """
    indexed = do_timestamp_tasks(df_ohlcv_p) if do_ts_tasks else df_ohlcv_p
    without_outliers = remove_outliers(indexed, columns_name)
    return without_outliers.resample(resample).interpolate()

def feature_engineering_ohlcv(df_ohlcv_p):
    """Add rolling volume / price features to a copy of an OHLCV frame.

    Window sizes are expressed in rows (24 rows per "day", 30 * 24 rows for
    "30d"), so the labels are only meaningful on an hourly grid.

    Parameters
    ----------
    df_ohlcv_p : frame with open_price, high_price, low_price, close_price
        and volume_aggregated_1h columns. It is not modified.

    Returns
    -------
    A copy of the input with, in this order: volume_aggregated_24h, the
    close_price_variance_* columns, last_period_high_low_price_var_pct, the
    mean_volume_*_30d ratios and the close_price_pct_change_vs_*_low / _high
    columns. Rows without a full window are NaN.
    """
    features = df_ohlcv_p.copy()
    close = features['close_price']
    volume = features['volume_aggregated_1h']

    # volume over the last 24 rows
    features['volume_aggregated_24h'] = volume.rolling(24).sum()

    # close price variance on different scales
    variance_windows = (('3h', 3), ('12h', 12), ('24h', 24),
                        ('7d', 24 * 7), ('15d', 24 * 15), ('30d', 24 * 30))
    for label, rows in variance_windows:
        features['close_price_variance_' + label] = close.rolling(rows).var()

    # high / low spread of the row, relative to its close price
    features['last_period_high_low_price_var_pct'] = (features['low_price'] - features['high_price']).abs() / close

    # volume kpis: mean volume on 1h, 3h, 6h, 12h, 24h, 3d, 7d, 15d relative to the 30d mean
    volume_mean_30d = volume.rolling(30 * 24).mean()
    features['mean_volume_1h_30d'] = volume / volume_mean_30d
    volume_windows = (('3h', 3), ('6h', 6), ('12h', 12), ('24h', 24),
                      ('3d', 3 * 24), ('7d', 7 * 24), ('15d', 15 * 24))
    for label, rows in volume_windows:
        features['mean_volume_' + label + '_30d'] = volume.rolling(rows).mean() / volume_mean_30d

    # close price change vs the rolling low / high of the last 3d, 7d, 15d, 30d
    extremum_windows = (('3d', 3 * 24), ('7d', 7 * 24), ('15d', 15 * 24), ('30d', 30 * 24))

    # lows
    for label, rows in extremum_windows:
        rolling_low = close.rolling(rows).min()
        features['close_price_pct_change_vs_' + label + '_low'] = (close - rolling_low) / rolling_low

    # highs
    for label, rows in extremum_windows:
        rolling_high = close.rolling(rows).max()
        features['close_price_pct_change_vs_' + label + '_high'] = (close - rolling_high) / rolling_high

    return features

def feature_engineering_ohlcv_all_cryptos(df_ohlcv_all_p):
    """Add the 24-row rolling sum of the global hourly volume.

    Parameters
    ----------
    df_ohlcv_all_p : frame with a `global_volume_usd_1h` column. The new
        column is written into this frame (in-place).

    Returns
    -------
    The same frame object, with `global_volume_usd_24h` added.
    """
    last_24_rows = df_ohlcv_all_p['global_volume_usd_1h'].rolling(window=24)
    df_ohlcv_all_p['global_volume_usd_24h'] = last_24_rows.sum()
    return df_ohlcv_all_p

def feature_engineering_reddit(df_reddit_p):
    """Add percentage changes of the subscriber count over several horizons.

    The horizons are 1d, 3d, 7d, 15d and 30d, expressed in rows as
    `days * 24`, so they are only meaningful on an hourly grid.

    Parameters
    ----------
    df_reddit_p : frame with a `reddit_subscribers` column. The new columns
        are written into this frame (in-place).

    Returns
    -------
    The same frame object with the `reddit_subscribers_pct_change_<label>`
    columns added.
    """
    # The features are computed on the frame received as argument; the previous
    # version read and wrote the notebook-level `df_reddit` global instead.
    for label, days in (('1d', 1), ('3d', 3), ('7d', 7), ('15d', 15), ('30d', 30)):
        df_reddit_p['reddit_subscribers_pct_change_' + label] = df_reddit_p.reddit_subscribers.pct_change(periods=days * 24)

    return df_reddit_p

def feature_engineering_google_trend(df_google_trend_p, period):
    """Add percentage changes of both Google Trends values over several horizons.

    Parameters
    ----------
    df_google_trend_p : frame with `value_standalone` and
        `value_compared_to_standard` columns. The new columns are written
        into this frame (in-place).
    period : 'y' selects the 2m / 3m / 6m / 1y horizons (months counted as
        24 * 30 rows); any other value selects 1d / 3d / 7d / 15d / 30d
        (days counted as 24 rows).

    Returns
    -------
    The same frame object with, for each horizon, a
    `gg_trend_value_standalone_pct_change_<label>` and a
    `gg_trend_value_compared_pct_change_<label>` column.
    """
    if period == 'y':
        # period = year: pct_change for periods 2m, 3m, 6m, 1y
        rows_per_unit = 24 * 30
        horizons = (('2m', 2), ('3m', 3), ('6m', 6), ('1y', 12))
    else:
        # period = month: pct_change for periods 1d, 3d, 7d, 15d, 30d
        rows_per_unit = 24
        horizons = (('1d', 1), ('3d', 3), ('7d', 7), ('15d', 15), ('30d', 30))

    for label, units in horizons:
        lag = units * rows_per_unit
        df_google_trend_p['gg_trend_value_standalone_pct_change_' + label] = df_google_trend_p.value_standalone.pct_change(periods=lag)
        df_google_trend_p['gg_trend_value_compared_pct_change_' + label] = df_google_trend_p.value_compared_to_standard.pct_change(periods=lag)
    return df_google_trend_p

def feature_engineering_technical_analysis(df_ohlcv_p, df_ohlcv_1d_p):
    """Compute daily technical indicators and derived signals on the hourly grid.

    Indicators (EMA, MA, Bollinger Bands, RSI, Stochastic, MACD, OBV) are
    computed with the TA-Lib abstract API on the daily frame, resampled to 1H
    with interpolation, and joined to the hourly frame. Boolean / difference
    "interpretation" features are then derived from the joined columns.

    Parameters
    ----------
    df_ohlcv_p : hourly OHLCV frame (open_price, high_price, low_price,
        close_price, volume_aggregated_1h). It is not modified.
    df_ohlcv_1d_p : daily OHLCV frame with the same columns. It is copied, so
        no indicator column is written into the caller's frame.

    Returns
    -------
    DataFrame indexed like `df_ohlcv_p` holding only the indicator and
    interpretation columns (the five OHLCV columns are dropped).

    Notes
    -----
    The `> 0`, `> 70`, `< 30`... comparisons evaluate to False on NaN, so the
    boolean flags are False wherever the underlying indicator is missing.
    """
    ohlcv_columns = ['open_price', 'high_price', 'low_price', 'close_price', 'volume_aggregated_1h']

    df_ohlcv_tmp = df_ohlcv_p
    # Work on a copy: the EMA / MA assignments below would otherwise add
    # columns to the caller's daily frame as a side effect.
    df_ohlcv_1d = df_ohlcv_1d_p.copy()

    # ========== INDICATORS CALCULATION ==========

    # [Overlap Studies] EMA 30 / 15 / 7 days
    for days in (30, 15, 7):
        df_ohlcv_1d['Indic_EMA_' + str(days) + 'd'] = EMA(df_ohlcv_1d, price='close_price', timeperiod=days)

    # [Overlap Studies] MA 30 / 15 / 7 days
    for days in (30, 15, 7):
        df_ohlcv_1d['Indic_MA_' + str(days) + 'd'] = MA(df_ohlcv_1d, price='close_price', timeperiod=days, matype=0)

    # [Overlap Studies] BBands - TODO : 20 days ?
    bands = BBANDS(df_ohlcv_1d, price='close_price', timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
    bands.columns = ['Indic_Bbands_20d_upperband', 'Indic_Bbands_20d_middleband', 'Indic_Bbands_20d_lowerband']
    df_ohlcv_1d = df_ohlcv_1d.join(bands)

    # [Momentum Indicator] RSI 14 days
    df_ohlcv_1d['Indic_RSI_14d'] = RSI(df_ohlcv_1d, price='close_price', timeperiod=14)

    # [Momentum Indicators] STOCH
    # ta-lib abstract API KO with dataframe : use workaround (dict of arrays)
    dataset = {'high': df_ohlcv_1d.high_price.values, 'low': df_ohlcv_1d.low_price.values, 'close': df_ohlcv_1d.close_price.values}
    kd = STOCH(dataset, fastk_period=14, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
    df_ohlcv_1d['Indic_Stoch_14_3_3_k'] = kd[0]
    df_ohlcv_1d['Indic_Stoch_14_3_3_d'] = kd[1]

    # [Momentum Indicators] MACD
    macd = MACD(df_ohlcv_1d, price='close_price', fastperiod=12, slowperiod=26, signalperiod=9)
    macd.columns = ['Indic_Macd_12_26_9_macd', 'Indic_Macd_12_26_9_macdsignal', 'Indic_Macd_12_26_9_macdhist']
    df_ohlcv_1d = df_ohlcv_1d.join(macd)

    # TODO / Ideas : ?

    # [Volume Indicators] OBV (same dict-of-arrays workaround as STOCH)
    dataset = {'close': df_ohlcv_1d.close_price.values, 'volume': df_ohlcv_1d.volume_aggregated_1h.values}
    df_ohlcv_1d['Indic_OBV'] = OBV(dataset)

    # join to have all indicators in one df: keep only the indicator columns of
    # the daily frame, bring them on the hourly grid, then attach them
    df_ohlcv_1d = df_ohlcv_1d.drop(ohlcv_columns, axis=1)
    # TODO : Check if auto interpolate is ok for each indicator - not sure....
    df_ohlcv_tmp = df_ohlcv_tmp.join(df_ohlcv_1d.resample('1H').interpolate())

    # ========== ADD FEATURES FOR INTERPRETATION ==========

    # [Interpretation] EMA / MA 30, 15, 7 days in uptrend : True / downtrend : False
    for indicator in ('Indic_EMA_30d', 'Indic_EMA_15d', 'Indic_EMA_7d',
                      'Indic_MA_30d', 'Indic_MA_15d', 'Indic_MA_7d'):
        df_ohlcv_tmp[indicator + '_uptrend'] = df_ohlcv_tmp[indicator].pct_change(periods=1) > 0

    # [Interpretation] BBands close_price - Indic_Bbands_20d_upperband
    df_ohlcv_tmp['Indic_Bbands_20d_diff_close_upperband'] = df_ohlcv_tmp.close_price - df_ohlcv_tmp.Indic_Bbands_20d_upperband

    # [Interpretation] BBands close_price - Indic_Bbands_20d_middleband
    # (stored in its own column: it used to overwrite the upperband difference)
    df_ohlcv_tmp['Indic_Bbands_20d_diff_close_middleband'] = df_ohlcv_tmp.close_price - df_ohlcv_tmp.Indic_Bbands_20d_middleband

    # [Interpretation] BBands close_price - Indic_Bbands_20d_lowerband
    df_ohlcv_tmp['Indic_Bbands_20d_diff_close_lowerband'] = df_ohlcv_tmp.close_price - df_ohlcv_tmp.Indic_Bbands_20d_lowerband

    # TODO / Ideas : Boolean > 0 < 0 ?

    # [Interpretation] RSI 14 days in uptrend : True / downtrend : False
    df_ohlcv_tmp['Indic_RSI_14d_uptrend'] = df_ohlcv_tmp.Indic_RSI_14d.pct_change(periods=1) > 0

    # [Interpretation] RSI 14 days > value 70
    df_ohlcv_tmp['Indic_RSI_sup_70'] = df_ohlcv_tmp.Indic_RSI_14d > 70

    # [Interpretation] RSI 14 days < value 30
    df_ohlcv_tmp['Indic_RSI_inf_30'] = df_ohlcv_tmp.Indic_RSI_14d < 30

    # [Interpretation] STOCH > value 80 (both %K and %D)
    df_ohlcv_tmp['Indic_Stoch_14_3_3_sup_80'] = (df_ohlcv_tmp.Indic_Stoch_14_3_3_k > 80) & (df_ohlcv_tmp.Indic_Stoch_14_3_3_d > 80)

    # [Interpretation] STOCH < value 20 (both %K and %D)
    df_ohlcv_tmp['Indic_Stoch_14_3_3_inf_20'] = (df_ohlcv_tmp.Indic_Stoch_14_3_3_k < 20) & (df_ohlcv_tmp.Indic_Stoch_14_3_3_d < 20)

    # [Interpretation] STOCH diff
    df_ohlcv_tmp['Indic_Stoch_14_3_3_diff'] = df_ohlcv_tmp.Indic_Stoch_14_3_3_k - df_ohlcv_tmp.Indic_Stoch_14_3_3_d

    # [Interpretation] OBV in uptrend on last 3d / 7d / 15d / 30d : True / downtrend : False
    for days in (3, 7, 15, 30):
        df_ohlcv_tmp['Indic_OBV_uptrend_' + str(days) + 'd'] = df_ohlcv_tmp.Indic_OBV.pct_change(periods=days * 24) > 0

    return df_ohlcv_tmp.drop(ohlcv_columns, axis=1)

In [38]:
# ------------------ PRE-PROCESSING : Data retrieving & cleaning ------------------ #
import os  # local to this cell: only used to read the optional DB URL override below

# The connection URL (which embeds credentials) can be supplied through the
# ALGOCRYPTOS_DB_URL environment variable so it does not have to live in the
# notebook; when the variable is not set, the historical local URL is used.
# TODO : Replace with info from config file
connection = create_engine(os.environ.get('ALGOCRYPTOS_DB_URL', 'postgresql://dbuser:algocryptos@localhost:9091/algocryptos'))

# Crypto ids (kept as strings: they are concatenated into the SQL queries)
id_cryptocompare_crypto = "7605"
id_cryptocompare_tether = "171986"
id_cryptocompare_bitcoin = "1182"

# --------------------------------
# OHLCV
# --------------------------------
df_ohlcv = get_dataset_ohlcv(connection, id_cryptocompare_crypto)
df_ohlcv = clean_dataset_ohlcv_spe(df_ohlcv)

df_ohlcv_tether = get_dataset_ohlcv(connection, id_cryptocompare_tether)
df_ohlcv_tether = clean_dataset_ohlcv_spe(df_ohlcv_tether)

df_ohlcv_bitcoin = get_dataset_ohlcv(connection, id_cryptocompare_bitcoin)
df_ohlcv_bitcoin = clean_dataset_ohlcv_spe(df_ohlcv_bitcoin)

# daily OHLCV of the studied crypto, extended backwards with older prices
df_ohlcv_1d = get_ohlcv_1d_plus_missing_infos(df_ohlcv, id_cryptocompare_crypto)

# --------------------------------
# REDDIT SUBSCRIBERS
# --------------------------------
# /!\ TODO : Bad extrapolation of the number of subscribers over the last hours => write a function that extrapolates n columns
df_reddit = get_dataset_reddit(connection, id_cryptocompare_crypto)
df_reddit = df_reddit[df_reddit.reddit_subscribers.notnull()]
df_reddit = do_timestamp_tasks(df_reddit)
df_reddit = df_reddit.resample('1H').interpolate()
df_reddit['reddit_subscribers'] = df_reddit['reddit_subscribers'].astype(int)


# --------------------------------
# ALL CRYPTOS
# --------------------------------
df_all_cryptos = get_dataset_all_cryptos(connection)
df_all_cryptos = clean_dataset_ohlcv_std(df_all_cryptos, columns_name=['global_volume_usd_1h', 'global_market_cap_usd'])
#df_all_cryptos = clean_dataset_ohlcv_std(df_all_cryptos, columns_name=['global_market_cap_usd'], do_ts_tasks=False)


# --------------------------------
# GOOGLE TREND
# --------------------------------
# crypto - last month
df_google_trend_crypto_1m = get_dataset_google_trend(connection, id_cryptocompare_crypto, '_1m')
df_google_trend_crypto_1m = clean_dataset_google_trend(df_google_trend_crypto_1m)

# crypto - 5 years
df_google_trend_crypto_5y = get_dataset_google_trend(connection, id_cryptocompare_crypto, '')
df_google_trend_crypto_5y = clean_dataset_google_trend(df_google_trend_crypto_5y)

# bitcoin - last month
df_google_trend_bitcoin_1m = get_dataset_google_trend(connection, id_cryptocompare_bitcoin, '_1m')
df_google_trend_bitcoin_1m = clean_dataset_google_trend(df_google_trend_bitcoin_1m)

# bitcoin - 5 years
df_google_trend_bitcoin_5y = get_dataset_google_trend(connection, id_cryptocompare_bitcoin, '')
df_google_trend_bitcoin_5y = clean_dataset_google_trend(df_google_trend_bitcoin_5y)

In [52]:
# figure
# Creates an empty 15 x 15 inch figure; the plot calls below are commented out,
# so nothing is drawn on it.
fig1 = plt.figure(figsize=(15,15))
#df.close_price.plot()
#df.volume_aggregated_1h.plot()
#df2.volume_aggregated.plot(secondary_y=True)

<matplotlib.figure.Figure at 0x1288d772358>

In [49]:
# ------------------ PRE-PROCESSING : Feature engineering ------------------ #
# NOTE: df_reddit, df_all_cryptos and the df_google_trend_* frames are enriched
# in place by their feature-engineering functions and rebound to the same names.
df_reddit = feature_engineering_reddit(df_reddit)
df_ohlcv_fe = feature_engineering_ohlcv(df_ohlcv)
df_ohlcv_tether_fe = feature_engineering_ohlcv(df_ohlcv_tether)
df_ohlcv_bitcoin_fe = feature_engineering_ohlcv(df_ohlcv_bitcoin)
df_technical_analysis = feature_engineering_technical_analysis(df_ohlcv, df_ohlcv_1d)
df_all_cryptos = feature_engineering_ohlcv_all_cryptos(df_all_cryptos)
df_google_trend_crypto_1m = feature_engineering_google_trend(df_google_trend_crypto_1m, 'm')
df_google_trend_bitcoin_1m = feature_engineering_google_trend(df_google_trend_bitcoin_1m, 'm')
df_google_trend_crypto_5y = feature_engineering_google_trend(df_google_trend_crypto_5y, 'y')
df_google_trend_bitcoin_5y = feature_engineering_google_trend(df_google_trend_bitcoin_5y, 'y')

# Join dfs (left joins on the hourly index of the studied crypto)
#df_ohlcv = df_ohlcv.join(df_ohlcv_tether[['close_price','volume_aggregated_1h']], rsuffix='_tether') => Subset only
df_ohlcv_fe = df_ohlcv_fe.join(df_ohlcv_tether_fe, rsuffix='_tether')
df_ohlcv_fe = df_ohlcv_fe.join(df_ohlcv_bitcoin_fe, rsuffix='_bitcoin')

df_global = df_ohlcv_fe.join(df_technical_analysis)
df_global = df_global.join(df_reddit)
df_global = df_global.join(df_all_cryptos)
# NOTE(review): `rsuffix` is only applied to column names that already exist in
# df_global, so the first frame joined with a given column name keeps it without
# suffix (e.g. the crypto_1m Google Trends columns) - confirm this is intended.
df_global = df_global.join(df_google_trend_crypto_1m, rsuffix='_crypto_1m')
df_global = df_global.join(df_google_trend_bitcoin_1m, rsuffix='_bitcoin_1m')
df_global = df_global.join(df_google_trend_crypto_5y, rsuffix='_crypto_5y')
df_global = df_global.join(df_google_trend_bitcoin_5y, rsuffix='_bitcoin_5y')
# resample() / interpolate() return a new frame: the result has to be assigned
# back, otherwise the statement has no effect on df_global.
df_global = df_global.resample('1H').interpolate()
df_global.reddit_subscribers = df_global.reddit_subscribers.interpolate(method='linear', limit_area='outside')

# remove data added only to be able to calculate indicators, etc.
df_global = df_global[df_ohlcv.index.min():df_global.index.max()]

In [50]:
# Display the joined feature frame (last expression of the cell => rich display).
df_global

Unnamed: 0_level_0,open_price,high_price,low_price,close_price,volume_aggregated_1h,volume_aggregated_24h,close_price_variance_3h,close_price_variance_12h,close_price_variance_24h,close_price_variance_7d,...,value_standalone_bitcoin_5y,value_compared_to_standard_bitcoin_5y,gg_trend_value_standalone_pct_change_2m_bitcoin_5y,gg_trend_value_compared_pct_change_2m_bitcoin_5y,gg_trend_value_standalone_pct_change_3m_bitcoin_5y,gg_trend_value_compared_pct_change_3m_bitcoin_5y,gg_trend_value_standalone_pct_change_6m_bitcoin_5y,gg_trend_value_compared_pct_change_6m_bitcoin_5y,gg_trend_value_standalone_pct_change_1y_bitcoin_5y,gg_trend_value_compared_pct_change_1y_bitcoin_5y
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-12-05 12:00:00+00:00,460.950000,460.960000,457.190,457.340000,185626.410000,,,,,,...,89.0,89.0,4.562500,4.562500,3.684211,3.684211,6.416667,6.416667,28.666667,28.666667
2017-12-05 13:00:00+00:00,457.460000,460.120000,456.600,457.090000,189306.047200,,,,,,...,89.0,89.0,4.562500,4.562500,3.684211,3.684211,6.416667,6.416667,28.666667,28.666667
2017-12-05 14:00:00+00:00,457.160000,460.230000,456.870,458.330000,172247.460000,,0.430033,,,,...,89.0,89.0,4.562500,4.562500,3.684211,3.684211,6.416667,6.416667,28.666667,28.666667
2017-12-05 15:00:00+00:00,458.330000,459.200000,457.550,458.550000,115921.102622,,0.619600,,,,...,89.0,89.0,4.562500,4.562500,3.684211,3.684211,6.416667,6.416667,28.666667,28.666667
2017-12-05 16:00:00+00:00,458.550000,458.560000,451.470,454.240000,164242.051200,,5.892100,,,,...,88.0,88.0,4.500000,4.500000,3.631579,3.631579,6.333333,6.333333,28.333333,28.333333
2017-12-05 17:00:00+00:00,454.060000,457.310000,453.710,457.000000,149751.686400,,4.766033,,,,...,88.0,88.0,4.500000,4.500000,3.631579,3.631579,6.333333,6.333333,28.333333,28.333333
2017-12-05 18:00:00+00:00,457.020000,461.960000,456.820,461.390000,143597.300000,,13.002033,,,,...,88.0,88.0,4.500000,4.500000,3.631579,3.631579,6.333333,6.333333,28.333333,28.333333
2017-12-05 19:00:00+00:00,456.795000,461.305000,456.560,460.930000,156635.875000,,5.821433,,,,...,88.0,88.0,4.500000,4.500000,3.631579,3.631579,6.333333,6.333333,28.333333,28.333333
2017-12-05 20:00:00+00:00,456.570000,460.650000,456.300,460.470000,169674.450000,,0.211600,,,,...,88.0,88.0,4.176471,4.176471,3.631579,3.631579,6.333333,6.333333,28.333333,28.333333
2017-12-05 21:00:00+00:00,460.420000,462.860000,458.520,462.740000,127854.272250,,1.440100,,,,...,88.0,88.0,4.176471,4.176471,3.631579,3.631579,6.333333,6.333333,28.333333,28.333333


In [45]:
# Inspect the index of the joined frame (timezone, frequency, length).
df_global.index

DatetimeIndex(['2017-12-05 12:00:00+00:00', '2017-12-05 13:00:00+00:00',
               '2017-12-05 14:00:00+00:00', '2017-12-05 15:00:00+00:00',
               '2017-12-05 16:00:00+00:00', '2017-12-05 17:00:00+00:00',
               '2017-12-05 18:00:00+00:00', '2017-12-05 19:00:00+00:00',
               '2017-12-05 20:00:00+00:00', '2017-12-05 21:00:00+00:00',
               ...
               '2018-09-23 23:00:00+00:00', '2018-09-24 00:00:00+00:00',
               '2018-09-24 01:00:00+00:00', '2018-09-24 02:00:00+00:00',
               '2018-09-24 03:00:00+00:00', '2018-09-24 04:00:00+00:00',
               '2018-09-24 05:00:00+00:00', '2018-09-24 06:00:00+00:00',
               '2018-09-24 07:00:00+00:00', '2018-09-24 08:00:00+00:00'],
              dtype='datetime64[ns, UTC]', name='timestamp', length=7029, freq='H')

In [None]:
# figure
# Creates an empty 15 x 15 inch figure; every plot call below is commented out.
fig1 = plt.figure(figsize=(15,15))
#df_global.Indic_MA_30d.plot(label='close_price', color='black')
#df_global.Indic_MA_30d.plot(secondary_y=True, color='red')

#df_ohlcv_1d.Indic_Macd_12_26_9_macdsignal.plot(secondary_y=True, color='blue')
#df_ohlcv_1d.Indic_Macd_12_26_9_macdhist.plot(secondary_y=True, color='red')

#df_ohlcv_1d.close_price.plot()
#df_ohlcv_1d.Indic_OBV.plot(secondary_y=True)

In [None]:
# Display the joined feature frame again (last expression of the cell => rich display).
df_global