<a href="https://colab.research.google.com/github/diogosaucedo/Praticas-de-IA/blob/main/Feature_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install talib-binary

Collecting talib-binary
[?25l  Downloading https://files.pythonhosted.org/packages/00/61/a68a9276a3c166df8717927780d994496ee4cb5299903a409f93689a2b4e/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl (2.4MB)
[K     |████████████████████████████████| 2.4MB 28.6MB/s 
Installing collected packages: talib-binary
Successfully installed talib-binary-0.4.19


In [None]:
import numpy as np
from talib.abstract import *
import pickle
import pandas as pd

In [None]:
minutes = 5 # forecast horizon in minutes
test_size= 0.2 # 20% of the dataframe for testing

# load dataframe
with open("EURUSD", "rb") as f:
    inputs = pickle.load(f) # inputs is a dict containing open, high, low, closed, volume
dataframe = pd.DataFrame.from_dict(inputs)

In [None]:
dataframe['EMAC'] = EMA(dataframe['close'], timeperiod=50)
dataframe['EMAO'] = EMA(dataframe['open'], timeperiod=50)
dataframe['EMAH'] = EMA(dataframe['high'], timeperiod=50)
dataframe['EMAL'] = EMA(dataframe['low'], timeperiod=50)
dataframe.fillna(method="ffill", inplace= True) # ffill: propagate last valid observation forward to next valid
dataframe.fillna(method="bfill",inplace= True) # bfill: use next valid observation to fill gap.
dataframe['BB_UP'], dataframe['BB_MID'], dataframe['BB_LOW'] = BBANDS(dataframe['EMAC'], timeperiod=20, nbdevup=2.5, nbdevdn=2.5, matype=0)
dataframe['EMA'] = EMA(dataframe['close'], timeperiod=100)
dataframe['RSI'] = RSI(dataframe['EMAC'], timeperiod=14)
dataframe['CCI'] = CCI(dataframe['EMAH'], dataframe['EMAL'], dataframe['EMAC'], timeperiod=14)
dataframe['WILLR'] = WILLR(dataframe['EMAC'], dataframe['EMAL'], dataframe['EMAC'], timeperiod=14)
dataframe['macd'], dataframe['macdsignal'], dataframe['macdhist'] = MACD(dataframe['EMAC'], fastperiod=14, slowperiod=24, signalperiod=14)
dataframe['OBV'] = OBV(dataframe['EMAC'], dataframe['volume']) # On Balance Volume
dataframe['ROC'] = ROC(dataframe['EMAC'], timeperiod=14)
dataframe['fastk'], dataframe['fastd'] = STOCHRSI(dataframe['EMAC'], timeperiod=14, fastk_period=5, fastd_period=3, fastd_matype=0)
dataframe.fillna(method="ffill", inplace= True) # ffill: propagate last valid observation forward to next valid
dataframe.fillna(method="bfill",inplace= True) # bfill: use next valid observation to fill gap.

In [None]:
dataframe['tendencia'] = dataframe.close - dataframe.EMA

In [None]:
temp_df = dataframe['close'].shift(-minutes) # Shift index by desired number of periods
temp_df.fillna(method="ffill", inplace= True)
temp_df.fillna(method="bfill", inplace= True)
# 1 = price increase, 0 = price decrease
classes = np.where(temp_df > dataframe.close, 1, 0)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer

In [None]:
# Random division
X_train,X_test,y_train,y_test=train_test_split(dataframe, classes, test_size= test_size, random_state=101)
# Exponential smoothing
scaler=QuantileTransformer()
scaler.fit(X_train)
X_train = pd.DataFrame(data=scaler.transform(X_train),columns = X_train.columns, index=X_train.index)
X_test = pd.DataFrame(data=scaler.transform(X_test),columns = X_test.columns,index=X_test.index)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectFromModel

In [None]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [None]:
thresholds = [thres for thres in sorted(rf.feature_importances_) if thres != 0]  # Use feat. with >0 importance
feature_names = [i for i in list(X_train.columns)]
roc_scores = {}
for thresh in thresholds:
  selection = SelectFromModel(rf, threshold=thresh, prefit=True)
  select_X_train = selection.transform(X_train)
  selection_model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
  selection_model.fit(select_X_train, y_train)
  select_X_test = selection.transform(X_test)  # eval model
  y_pred = selection_model.predict(select_X_test)
  score = selection_model.score(select_X_test, y_test)
  roc = roc_auc_score(y_test, y_pred)
  roc_scores[selection.threshold] = [roc, score]


In [None]:
for i in roc_scores:
  print(i, roc_scores[i])


0.008981146338858093 [0.7859101606766326, 0.7863611111111111]
0.03150802408214881 [0.7867423189372287, 0.7872222222222223]
0.03337831980821178 [0.7877039756398417, 0.7881666666666667]
0.03438941186409928 [0.7868482199951572, 0.7872777777777777]
0.034641747868890245 [0.7883117765681842, 0.7887777777777778]
0.03516435360373201 [0.7888947820248924, 0.7893611111111111]
0.0355902499287253 [0.7873728835289442, 0.7878333333333334]
0.0363086763842318 [0.7833798800637788, 0.7838888888888889]
0.036445387071040496 [0.7807239546513242, 0.7812222222222223]
0.03651589739711096 [0.7814468769711319, 0.7819166666666667]
0.03674350538973208 [0.7781288723239356, 0.7786666666666666]
0.03739495254154087 [0.7716842976571188, 0.7722777777777777]
0.03745304164842909 [0.7600922201332956, 0.7607222222222222]
0.038548269640931374 [0.7491043492751769, 0.7497777777777778]
0.039989674558664146 [0.7393564063914931, 0.7400277777777777]
0.04620810034825805 [0.6877646445699139, 0.6885833333333333]
0.050993308157449777 

In [None]:
best_thresh = 0.03651589739711096
fs = SelectFromModel(rf, threshold=best_thresh, prefit=True)
X_train_trans_ = fs.transform(X_train)
X_test_trans_ = fs.transform(X_test)
selected_feat= X_train.columns[(fs.get_support())]
print(selected_feat)
print(len(selected_feat))

Index(['close', 'volume', 'EMAL', 'BB_UP', 'BB_LOW', 'EMA', 'RSI', 'CCI',
       'macd', 'macdsignal', 'macdhist', 'OBV', 'ROC', 'fastd', 'tendencia'],
      dtype='object')
15


In [None]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf.fit(X_train_trans_, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [None]:
predictions = rf.predict(X_test_trans_)
score = rf.score(X_test_trans_, y_test)
print('Score: ', round(score,2))

Score:  0.77
