# Random Forest Classifier

## Get dataset

In [None]:
from finlab.ml import fundamental_features

# Load the raw feature table from finlab.
dataset = fundamental_features()

# First pass: keep only the columns that hold a non-missing value in at
# least half of the rows.
min_valid_rows = int(len(dataset) * 0.5)
dataset = dataset.dropna(axis=1, thresh=min_valid_rows)

# Second pass: drop every row that still contains a missing value.
dataset = dataset.dropna(how='any')
dataset.head()

## Add labels

In [None]:
import finlab.ml as ml

# Remember the column labels *before* the label is attached, so that
# `features` names only the model inputs.
features = dataset.columns

# NOTE(review): presumably this adds the 'return' column that later cells
# read from `dataset` -- confirm against the finlab documentation.
ml.add_profit_prediction(dataset)

# Discard rows that contain any missing value after the label was added.
dataset = dataset.dropna(axis=0, how='any')
dataset.head()

## Split the dataset into train and test sets

In [None]:
# Rows whose 'date' index level is before 2017 form the training set;
# every other row goes to the test set.
sample_dates = dataset.index.get_level_values('date')
select = sample_dates < '2017'
dataset_train, dataset_test = dataset[select], dataset[~select]

## Training

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Inputs are the feature columns; the target is True when 'return' exceeds 1.
# NOTE(review): 'return' looks like a gross return (it is compounded with
# cumprod in later cells), so "> 1" would mean a gain -- confirm.
train_inputs = dataset_train[features]
train_labels = dataset_train['return'] > 1

# Forest of 200 trees.
cf = RandomForestClassifier(n_estimators=200)
cf.fit(train_inputs, train_labels)

## Prediction

In [None]:
import pandas as pd

# Classify every test-set row using the same feature columns as in training.
test_inputs = dataset_test[features]
prediction = cf.predict(test_inputs)

## Backtest

In [None]:
import matplotlib.pyplot as plt
# Apply matplotlib's "ggplot" style sheet to the figures drawn below.
plt.style.use("ggplot")

# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

# Compare the two groups the classifier produced on the test set. For each
# group: average 'return' per date, compound it with cumprod, and plot it.
test_returns = dataset_test['return']

# Rows predicted True. `prediction` is the array returned by cf.predict,
# so `==` here is an element-wise comparison that yields a boolean mask.
returns1 = test_returns[prediction == True]
dates = returns1.index.get_level_values('date')
mean_return_picked = returns1.groupby(dates).mean()
mean_return_picked.cumprod().plot()

# Rows predicted False.
returns2 = test_returns[prediction == False]
dates = returns2.index.get_level_values('date')
mean_return_rejected = returns2.groupby(dates).mean()
mean_return_rejected.cumprod().plot()

## Feature Importance

In [None]:
# Label each fitted importance with its feature name, then order the
# features from most to least important.
importance = pd.Series(cf.feature_importances_, index=features)
importance = importance.sort_values(ascending=False)
importance

## New Strategy using feature importance

In [None]:
from finlab.data import Data

data = Data()
close = data.get('收盤價')  # '收盤價' means "closing price"

# Rolling mean over a 60-observation window; a value is produced as soon as
# 10 observations are available.
sma = close.rolling(window=60, min_periods=10).mean()

# Ratio of the close to its rolling mean: above 1 means the close is above
# the average.
bias = close / sma

ml.add_feature(dataset, 'bias', bias)
dataset.head()

In [None]:
# Names of the 20 highest-ranked features (importance is sorted descending).
items = importance.index[:20].tolist()

def select(df):
    """Return the mean 'return' of the rows picked by the ranking rule.

    Each row is scored by summing its percentile ranks over the columns named
    in the module-level ``items`` list. A row is picked when its score is
    strictly above the 0.9 quantile of all scores in ``df`` and its 'bias'
    value is greater than 1.

    Args:
        df: DataFrame containing the ``items`` columns plus 'return' and
            'bias'.

    Returns:
        The mean of 'return' over the picked rows, or NaN when no row
        qualifies.
    """
    score = df[items].rank(pct=True).sum(axis=1)
    in_top_decile = score > score.quantile(0.9)
    above_average = df['bias'] > 1
    # Combine both conditions into one mask that shares df's index. Applying
    # the full-length 'bias' mask to an already filtered Series relies on
    # implicit reindexing, which breaks when index labels repeat.
    return df.loc[in_top_decile & above_average, 'return'].mean()

dates = dataset.index.get_level_values('date')

# Strategy curve: apply `select` to each date's rows and compound the result.
strategy_returns = dataset.groupby(dates).apply(select)
strategy_returns.cumprod().plot()

# Benchmark curve: compound the plain per-date mean of 'return'.
benchmark_returns = dataset['return'].groupby(dates).mean()
benchmark_returns.cumprod().plot()