In [4]:
import pandas as pd 
import numpy as np 

from itertools import product 
import matplotlib.pyplot as plt 

from sklearn.datasets import make_classification

In [5]:
def get_test_data(n_features=40, n_informative=10, n_redundant=10, n_samples=10000):
    """Generate a synthetic classification dataset indexed by business days.

    Parameters
    ----------
    n_features : int
        Total number of feature columns.
    n_informative : int
        Number of informative features (labelled ``I_*``).
    n_redundant : int
        Number of redundant features (labelled ``R_*``).
    n_samples : int
        Number of observations (rows).

    Returns
    -------
    X : pd.DataFrame
        Feature matrix of shape ``(n_samples, n_features)``. Columns are named
        ``I_*``, ``R_*`` and ``N_*`` (everything that is left) in that order.
    cont : pd.DataFrame
        Frame sharing X's index with columns ``bin`` (class label), ``w``
        (uniform weight ``1 / n_samples``) and ``t1`` (a copy of the index).
    """
    # shuffle=False keeps the generated columns in a fixed order, which is what
    # allows them to be labelled by position below (see make_classification docs).
    X, cont = make_classification(n_samples=n_samples, n_features=n_features,
                                  n_informative=n_informative, n_redundant=n_redundant,
                                  random_state=0, shuffle=False)

    # pd.DatetimeIndex(periods=..., freq=..., end=...) and pd.datetime are no
    # longer available in current pandas; date_range / Timestamp are the
    # supported equivalents and produce the same business-day index.
    time_idx = pd.date_range(end=pd.Timestamp.today(), periods=n_samples,
                             freq=pd.tseries.offsets.BDay())
    X = pd.DataFrame(X, index=time_idx)
    cont = pd.Series(cont, index=time_idx).to_frame('bin')
    # Create name of columns
    columns = ['I_' + str(i) for i in range(n_informative)]
    columns += ['R_' + str(i) for i in range(n_redundant)]
    columns += ['N_' + str(i) for i in range(n_features - len(columns))]
    X.columns = columns
    # Equal weight for every observation
    cont['w'] = 1. / cont.shape[0]
    # 't1' simply mirrors each row's own timestamp
    cont['t1'] = pd.Series(cont.index, index=cont.index)
    return X, cont

In [8]:
# Build the synthetic dataset with the default sizes: X holds the features,
# cont holds the label ('bin'), weight ('w') and 't1' columns.
X, cont = get_test_data()

  


In [11]:
X.head()

Unnamed: 0,I_0,I_1,I_2,I_3,I_4,I_5,I_6,I_7,I_8,I_9,...,N_10,N_11,N_12,N_13,N_14,N_15,N_16,N_17,N_18,N_19
1981-02-12 14:04:36.062179,2.84374,0.456554,0.171107,-4.511382,0.27899,-3.474726,2.95555,2.698865,1.54244,2.198168,...,-0.330515,-0.845502,-1.477466,1.217536,0.304644,1.557365,0.202843,0.16011,0.933805,-0.132272
1981-02-13 14:04:36.062179,3.561541,-1.566097,3.342813,-1.938909,2.075749,-3.486711,0.494908,0.309615,1.059439,-0.792433,...,-0.020384,-0.751467,0.212077,0.285038,0.125461,0.203534,-0.376495,-0.93878,-0.142879,0.533263
1981-02-16 14:04:36.062179,7.699248,-3.030124,-0.859302,-0.033351,1.113719,-0.877844,2.344033,4.089113,2.287786,0.611413,...,0.744056,0.914181,1.586483,0.692802,-0.953431,0.67936,0.565153,0.219302,-1.110504,-1.086061
1981-02-17 14:04:36.062179,-0.149801,-3.182187,2.695894,1.359997,2.992416,-0.417971,-1.214058,1.268313,-3.720913,-2.580578,...,-1.960632,-2.064914,1.258648,-1.031856,0.645146,-0.0639,0.305844,0.371489,3.218969,0.867178
1981-02-18 14:04:36.062179,-2.157903,0.04638,0.697217,-1.012036,1.856002,-2.311465,2.715493,0.444433,-1.92179,-2.472372,...,-0.841121,0.081347,-2.587682,-0.416436,-1.077859,-0.428086,-0.183735,-0.434254,-2.124955,-0.709056


In [12]:
cont.head()

Unnamed: 0,bin,w,t1
1981-02-12 14:04:36.062179,0,0.0001,1981-02-12 14:04:36.062179
1981-02-13 14:04:36.062179,0,0.0001,1981-02-13 14:04:36.062179
1981-02-16 14:04:36.062179,0,0.0001,1981-02-16 14:04:36.062179
1981-02-17 14:04:36.062179,0,0.0001,1981-02-17 14:04:36.062179
1981-02-18 14:04:36.062179,0,0.0001,1981-02-18 14:04:36.062179


In [19]:
def get_e_vec(dot, var_thres):
    """Eigen-decompose a symmetric matrix and keep the leading components.

    Parameters
    ----------
    dot : pd.DataFrame
        Square symmetric matrix; its index labels the rows of the returned
        eigenvector frame.
    var_thres : float
        Cumulative share of the (positive) eigenvalue sum at which to stop
        adding components.

    Returns
    -------
    e_val : pd.Series
        Retained eigenvalues in descending order, labelled ``PC_1``, ``PC_2``, ...
    e_vec : pd.DataFrame
        Matching eigenvectors, one column per retained component.
    """
    # Hand eigh a plain ndarray so the positional indexing below does not
    # depend on how NumPy wraps results for DataFrame inputs.
    e_val, e_vec = np.linalg.eigh(np.asarray(dot))
    # Descending order
    idx = e_val.argsort()[::-1]
    e_val = e_val[idx]
    e_vec = e_vec[:, idx]
    e_val = pd.Series(e_val, index=['PC_' + str(i + 1) for i in range(e_val.shape[0])])
    e_vec = pd.DataFrame(e_vec, index=dot.index, columns=e_val.index)
    # Use only positive ones
    positive = e_val > 0
    e_vec = e_vec.loc[:, positive]
    e_val = e_val.loc[positive]
    # Reduce dimension with threshold
    cum_var = e_val.cumsum() / e_val.sum()
    # Position of the first component whose cumulative share reaches var_thres;
    # the +1 below keeps that component as well.
    dim = cum_var.values.searchsorted(var_thres)
    e_val = e_val.iloc[:dim+1]
    e_vec = e_vec.iloc[:, :dim+1]
    return e_val, e_vec


def orth_feats(dfX, var_thres=.95):
    """Project standardized features onto their leading principal components.

    Each column of ``dfX`` is centred on its mean and scaled by its standard
    deviation, the resulting Z'Z matrix is decomposed with ``get_e_vec``, and
    the standardized data is multiplied by the retained eigenvectors.

    Parameters
    ----------
    dfX : pd.DataFrame
        Feature matrix, one column per feature.
    var_thres : float, default .95
        Threshold forwarded to ``get_e_vec``.

    Returns
    -------
    pd.DataFrame
        Same index as ``dfX``; one column per retained component.
    """
    # Column-wise z-score
    standardized = (dfX - dfX.mean()) / dfX.std()
    gram = pd.DataFrame(
        np.dot(standardized.T, standardized),
        index=dfX.columns,
        columns=dfX.columns,
    )
    # Only the eigenvectors are needed for the projection
    _, loadings = get_e_vec(gram, var_thres)
    projected = np.dot(standardized, loadings)
    return pd.DataFrame(projected, index=standardized.index, columns=loadings.columns)

In [20]:
# Project X onto its principal components using the default var_thres of .95
dfP = orth_feats(X)

In [21]:
dfP.shape

(10000, 28)

In [27]:
dfP.columns

Index(['PC_1', 'PC_2', 'PC_3', 'PC_4', 'PC_5', 'PC_6', 'PC_7', 'PC_8', 'PC_9',
       'PC_10', 'PC_11', 'PC_12', 'PC_13', 'PC_14', 'PC_15', 'PC_16', 'PC_17',
       'PC_18', 'PC_19', 'PC_20', 'PC_21', 'PC_22', 'PC_23', 'PC_24', 'PC_25',
       'PC_26', 'PC_27', 'PC_28'],
      dtype='object')

In [22]:
dfP.head()

Unnamed: 0,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,PC_10,...,PC_19,PC_20,PC_21,PC_22,PC_23,PC_24,PC_25,PC_26,PC_27,PC_28
1981-02-12 14:04:36.062179,-2.247346,3.289565,-1.367301,2.863059,1.412666,0.799908,-0.213012,-0.331433,-0.8663,-0.032951,...,-1.944196,0.481867,-0.060209,0.277116,0.534618,-0.986281,0.384909,0.801349,-0.828607,1.12123
1981-02-13 14:04:36.062179,-0.956982,-0.370077,-0.070016,3.012735,-0.317126,-0.332048,0.77146,0.385234,-0.124513,-0.067744,...,0.769808,1.209847,0.564018,0.825132,-0.753131,0.379044,0.456504,-1.176804,0.593204,-0.354478
1981-02-16 14:04:36.062179,-4.43208,2.690585,1.475541,1.15708,-0.586217,-0.56181,-0.509762,1.904502,-1.081916,-1.208494,...,0.449401,-0.567609,0.040427,0.911582,-0.27104,0.410972,-2.323971,-0.226183,-2.398908,0.385578
1981-02-17 14:04:36.062179,2.366804,-1.419525,0.356341,-0.679244,-2.952952,-0.414377,-0.889363,-2.16338,-0.4547,-1.256352,...,-2.320637,-0.228787,-1.007314,1.230794,-0.694026,-1.360371,0.97062,-1.304913,1.543305,0.018166
1981-02-18 14:04:36.062179,1.104342,-0.693122,-0.555143,1.28374,-0.570558,0.30902,2.019225,0.153222,-0.867766,0.339598,...,2.428816,-0.200491,1.53322,0.077449,-1.194174,0.406989,1.561206,0.957596,0.192124,2.333219


In [28]:
from sklearn.metrics import log_loss, accuracy_score

from finance_ml.model_selection import PurgedKFold
from finance_ml.model_selection import cv_score


ModuleNotFoundError: No module named 'finance_ml'