### Imports

In [1]:
import pandas as pd
import string
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from feature_engine.selection import DropDuplicateFeatures, DropConstantFeatures

### Reading Data from Excel daily_dataframe_main

In [2]:
# Takes 1 min 30 secs to run
# Open the workbook once and parse both sheets from the same handle, instead of
# letting each read_excel call re-open and re-parse the file from disk.
with pd.ExcelFile('daily_dataframe_main.xlsx') as workbook:
    # header=[0, 1]: the first two rows of the 'Variables' sheet form the column header.
    Variables = pd.read_excel(workbook, sheet_name='Variables', header=[0, 1], index_col=0)
    Response = pd.read_excel(workbook, sheet_name='Response', index_col=0)

# Just using Unadjusted data for now
Response = Response[['LB_Close','Close_ret','Close_Up_Down']]

### Imputing Values
<div style="color: DarkBlue; font-size:22px;" class="alert alert-block alert-warning"> 
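<b>Check:</b> Does it make sense to back-fill and then forward-fill the Variables DataFrame, and to replace zeros with the column median? (Only the back-fill is currently active; the forward-fill and zero replacement are commented out.)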
</div>


In [3]:
# Back-fill: each missing value takes the next valid observation below it in the
# same column. DataFrame.bfill() replaces the deprecated fillna(method='bfill'),
# and rebinding the name avoids mutating the frame in place.
# NOTE(review): if the index is in ascending date order, back-filling copies
# later values into earlier rows (look-ahead) - confirm this is intended.
Variables = Variables.bfill()

# Still under evaluation (disabled):
# Variables = Variables.ffill()
# Replacing 0 with Median Values - not implemented yet; the earlier attempt passed
# Variables.median() as replace(method=...), which is not a valid argument.

<div style="color: DarkBlue; font-size:22px;" class="alert alert-block alert-warning"> 
<b>To Do:</b> A custom imputer is used for now; eventually replace it with Feature-engine (or a similar library). Check which imputation method makes more sense.
</div>

In [4]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    """Impute missing values column by column.

    Columns of dtype object are imputed with the most frequent value
    in the column.
    Columns of other types are imputed with the mean of the column.
    A column with no non-missing values has nothing to learn from; its fill
    value is NaN, so ``transform`` leaves it unchanged.
    """

    def __init__(self):
        """Create the imputer; fill values are learned later by ``fit``."""

    @staticmethod
    def _fill_value(column):
        """Return the value used to fill the gaps of a single column."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # value_counts() skips missing values by default, so an all-missing
            # column gives an empty result and counts.index[0] would raise
            # IndexError. Fall back to NaN (i.e. no fill) in that case.
            return counts.index[0] if len(counts) else np.nan
        return column.mean()

    def fit(self, X, y=None):
        """Learn one fill value per column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns are inspected.
        y : ignored
            Accepted only for scikit-learn API compatibility.

        Returns
        -------
        DataFrameImputer
            ``self``, with the learned values stored in ``self.fill``.
        """
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
                              index=X.columns)

        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing values replaced by the learned fill values.

        ``X`` itself is not modified; ``fillna`` returns a new frame.
        """
        return X.fillna(self.fill)


# Learn the per-column fill values from Variables, then apply them to the same frame.
Variables = DataFrameImputer().fit(Variables).transform(Variables)


### Replacing punctuation strings in column names

In [5]:
# Build the punctuation-removal table once and reuse it for every label.
_punctuation_table = str.maketrans('', '', string.punctuation)

# Each label is stringified first, stripped of punctuation characters, and then
# trimmed of surrounding whitespace; the cleaned names replace the column index.
cols = [
    str(label).translate(_punctuation_table).strip()
    for label in Variables.columns
]
Variables.columns = cols

### Concatenating Variables & Response
Choose the response variable here: `LB_Close`, `Close_ret`, `Close_Up_Down`, or the adjusted data from above.

In [6]:
# Choose Target Variable here - LB_Close	Close_ret	Close_Up_Down OR Adjusted Data from above
Target_choice = 'Close_Up_Down'

# Align predictors and the chosen response on the index and place them side by
# side; the response is passed last, so it becomes the final column of `data`.
data = pd.concat(
    objs=[Variables, Response[Target_choice]],
    axis='columns',
)

### Remove Variables with all NaNs
<div style="color: Green; font-size:22px;" class="alert alert-block alert-warning"> 
<b>To Do:</b> Check why these variables have NaNs
</div>


In [7]:
# Drop every column that holds no data at all. One dropna(how='all') is enough:
# isnull() is just an alias of isna(), so running both drops repeated the same work.
# Rebinding instead of inplace=True keeps the step easy to follow on re-runs.
data = data.dropna(axis='columns', how='all')

# Count how many of the remaining columns there are per dtype.
data.dtypes.value_counts()

float64    659
int64        2
dtype: int64

### Splitting the Dataset into Three Sets - Training, Testing & Validation

In [8]:
Validation_date_start = '2022-01-01'

# Rows on/after the cutoff form the validation set; earlier rows stay in `data`.
# NOTE: `data` is rebound here, so re-running this cell without re-running the
# cells that build `data` leaves Validation empty.
Validation = data.loc[data.index >= Validation_date_start]
data = data.loc[data.index < Validation_date_start]

# last column is the response variable
X_val = Validation.iloc[:, :-1]
y_val = Validation.iloc[:, -1]

# shuffle=False: the first 80% of rows become the training set and the
# remaining 20% the test set, in their existing order.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=data.columns[-1]),  # everything except the target
    data[Target_choice],                  # just the target
    test_size=0.20,
    shuffle=False,
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape, X_val.shape, y_val.shape

((15268, 660), (3817, 660), (15268,), (3817,), (168, 660), (168,))

### Random Forest

In [9]:
# Train on every predictor except the first 15 columns.
# NOTE(review): the reason for skipping the first 15 columns is not recorded - confirm.
X_train_sample = X_train.iloc[:, 15:].copy(deep=True)
# X_train_sample = X_train.copy(deep= True)

features = list(X_train_sample.columns)

# random_state pins bootstrap sampling and feature sub-sampling, so the fitted
# forest (and hence the importances and accuracy) is the same on every re-run.
rf = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42)
# Any remaining missing predictor values are replaced with 0 before fitting.
rf.fit(X_train_sample[features].fillna(0), y_train)

# One row per feature, pairing its name with the forest's importance score.
importance = pd.DataFrame({
    'feature': features,
    'importance': rf.feature_importances_,
})

In [10]:
# Score the test rows on the same feature subset, with the same zero-fill used for training.
y_pred = rf.predict(X_test[features].fillna(0))

# Missing test labels are filled with 0 before being compared to the predictions.
accuracy_score(y_true=y_test.fillna(0), y_pred=y_pred)

0.33822373591826044

In [11]:
# Rank features by importance (highest first) and keep those scoring above 0.005.
RF_importance_df = (
    importance
    .sort_values(by='importance', ascending=False)
    .loc[lambda ranked: ranked['importance'] > 0.005]
)
RF_importance_df.head()

Unnamed: 0,feature,importance
144,FREDDaily Close CBOE Volatility Index VIX,0.01434
75,TreasuryTerms ACMTP02,0.011076
105,Yahoo IFPTO,0.010578
106,Yahoo WY,0.010478
78,TreasuryTerms ACMTP05,0.009865
