In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.offline as pof
import scipy.stats as sps
# Apply seaborn's default theme first; the overrides below are layered on top of it.
sns.set()

# Use the same large size (30) for every matplotlib text element.
plt.rcParams.update({
    'font.size': 30,
    'axes.titlesize': 30,
    'axes.labelsize': 30,
    'xtick.labelsize': 30,
    'ytick.labelsize': 30,
    'legend.fontsize': 30,
    'figure.titlesize': 30,
})

from sklearn.base import BaseEstimator
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.covariance import MinCovDet
import cvxpy as cvx
import yfinance as yf
from tqdm import tqdm

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from warnings import filterwarnings
# Suppress every Python warning from here on (applies to all categories and modules).
# NOTE(review): this also hides deprecation/convergence warnings — confirm that is intended.
filterwarnings('ignore')

In [16]:
from rpy2.robjects.packages import importr
# Import R's "base" package.
base = importr('base')

# Import R's "utils" package.
utils = importr('utils')

# Import rpy2's package module (gives access to isinstalled / importr).
import rpy2.robjects.packages as rpackages

# Import R's "utils" package again through the module alias.
# NOTE(review): this rebinds `utils` to the same package imported above; one of the two is redundant.
utils = rpackages.importr('utils')

# Select a mirror for downloading R packages.
utils.chooseCRANmirror(ind=1) # select the first mirror in the list

# Names of the R packages this notebook wants available.
packnames = ('NMF', 'nsprcomp', 'BiocManager')

# StrVector wraps a Python sequence of strings as an R character vector.
from rpy2.robjects.vectors import StrVector

# Install only the packages that are not already installed.
# Nothing is downloaded when every package is present.
names_to_install = [x for x in packnames if not rpackages.isinstalled(x)]
if len(names_to_install) > 0:
    utils.install_packages(StrVector(names_to_install))

# Handle to the R package `nsprcomp`; make_dim_red calls nsprcomp.nsprcomp on it.
nsprcomp = importr('nsprcomp')

# Turn on rpy2's pandas conversion for the rest of the session.
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
pandas2ri.activate()

In [17]:
def decrease_risk(mu, Sigma, R=2e-3):
    """Find long-only portfolio weights of minimum variance for a target return.

    Solves the quadratic program

        minimize    1/2 * w' Sigma w
        subject to  mu' w == R,  sum(w) == 1,  w >= 0

    with the SCS solver.

    Parameters
    ----------
    mu : array-like of length p
        Expected return of each asset.
    Sigma : array-like of shape (p, p)
        Covariance matrix of the asset returns, in the same asset order as
        ``mu``. A pandas DataFrame is accepted and converted to an array.
    R : float, default 2e-3
        Required expected return of the portfolio.

    Returns
    -------
    numpy.ndarray or None
        Weights in the same order as ``mu``; ``None`` when the solver did not
        produce a solution (``w.value`` is returned as-is).
    """
    # Plain float arrays keep the cvxpy expressions independent of pandas labels.
    mu = np.asarray(mu, dtype=float).ravel()
    Sigma = np.asarray(Sigma, dtype=float)

    p = len(mu)
    w = cvx.Variable(p)
    objective = cvx.Minimize(1/2 * cvx.quad_form(w, Sigma))

    constraints = [
        mu @ w == R,       # hit the target expected return
        cvx.sum(w) == 1,   # fully invested
        w >= 0,            # no short positions (one vector constraint instead of p scalar ones)
    ]

    problem = cvx.Problem(objective, constraints=constraints)
    problem.solve(verbose=False, solver="SCS")

    return w.value

def count_portfolio_return(columns, weights, test):
    """Weighted sum of the cumulative growth of each selected column.

    For every ``(weight, column)`` pair the per-period returns in
    ``test[column]`` are compounded with ``(1 + r).cumprod()`` and scaled by
    the weight; the scaled paths are then added together.

    Parameters
    ----------
    columns : iterable
        Column labels of ``test``, paired positionally with ``weights``.
    weights : iterable of float
        Weight applied to each column's compounded path.
    test : pandas.DataFrame
        Per-period returns, one column per asset.

    Returns
    -------
    pandas.Series, or the integer 0 when no pairs are given.
    """
    weighted_paths = (
        weight * (test[label] + 1).cumprod()
        for weight, label in zip(weights, columns)
    )
    # Start from 0 so an empty input still yields 0, as the accumulator loop did.
    return sum(weighted_paths, 0)

def return_weights(df, R=2e-3):
    """Estimate mean/covariance from ``df`` and return optimal portfolio weights.

    The weights are returned in the column order of ``df`` so they can be
    paired positionally with ``df.columns`` (or with another frame that has
    the same column order). Previously the columns were re-ordered by
    descending mean before optimising, which left the weights in that sorted
    order and misaligned them with callers that zip them against the
    original columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-period returns, one column per asset.
    R : float, default 2e-3
        Target portfolio return forwarded to ``decrease_risk``.

    Returns
    -------
    Whatever ``decrease_risk`` returns: one weight per column of ``df``.
    """
    # Mean vector and covariance matrix share df's column order.
    mu = df.mean().to_numpy()
    Sigma = df.cov()

    return decrease_risk(mu, Sigma, R)

def get_mu(df):
    """Return the column means of ``df`` as an array sorted from largest to smallest.

    Note that the result follows descending-mean order, not the column order
    of ``df``.
    """
    column_means = df[df.columns].mean()
    ranked_means = column_means.sort_values(ascending=False)
    return ranked_means.to_numpy()

def get_Sigma(df):
    """Return the covariance matrix of the columns of ``df``.

    The result is a DataFrame whose rows and columns follow the column order
    of ``df``.
    """
    all_columns = df[df.columns]
    return all_columns.cov()

def plot_returns(names, returns, title, save=False):
    """Draw one plotly line per return series and optionally save the figure.

    Parameters
    ----------
    names : iterable of str
        Legend label for each series, paired positionally with ``returns``.
    returns : iterable
        Series-like objects; ``.index`` is used for x and the values for y.
    title : str
        Figure title; also used as the HTML file name when ``save`` is true.
    save : bool, default False
        When true, write the figure to ``'<title>.html'`` without opening it.
    """
    fig = go.Figure()

    for label, series in zip(names, returns):
        trace = go.Scatter(x=series.index, y=series, name=label)
        fig.add_trace(trace)

    fig.update_layout(title=title, xaxis_title='date', yaxis_title='return')
    fig.show()

    if not save:
        return
    pof.plot(fig, filename=f'{title}.html', auto_open=False)

In [18]:
def make_dim_red(df, n_comp = 2, window_size = 10):
    """Compress ``df`` window by window with R's nsprcomp and return the resulting returns.

    The rows of ``df`` are cut into consecutive windows of ``window_size``
    rows (the last window may be shorter). Each transposed window is passed
    to ``nsprcomp.nsprcomp`` with ``ncomp=n_comp, center=False, scale=True,
    nneg=True``; one element of the result is combined into a single row by
    ``SVP``. The stacked rows are treated as a price table with the columns
    of ``df`` and converted to percentage changes.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table, one column per asset.
    n_comp : int, default 2
        Number of components requested from nsprcomp.
    window_size : int, default 10
        Number of rows per window.

    Returns
    -------
    pandas.DataFrame
        Percentage changes between consecutive windows, with the first
        (all-zero) row removed.
    """
    # Position of the element taken from the nsprcomp result.
    # NOTE(review): presumably the matrix of components — confirm against the nsprcomp docs.
    components_ind = 6

    def _pct_change_filled(column):
        # The first change is undefined, so it is replaced with 0.
        return column.pct_change(1).fillna(0)

    window_rows = []
    for start in range(0, len(df.index), window_size):
        window = df[start : start + window_size]
        fit = nsprcomp.nsprcomp(window.T, ncomp=n_comp, center=False, scale=True, nneg=True)
        window_rows.append(SVP(fit[components_ind].T))

    stacked = np.vstack(window_rows)
    df_new_prices = pd.DataFrame(data=stacked, index=np.arange(0, len(stacked)), columns=df.columns)

    new_returns = df_new_prices.apply(_pct_change_filled, axis=0)
    # Drop the first row, which only holds the filled-in zeros.
    return new_returns.drop(new_returns.index[0])

def SVP(components):
    """Combine several components into one vector, weighting each by its share of spread.

    For every row ``i`` of ``components`` a spread score is computed as the
    sum of squared deviations from the mean of the *whole* array, divided by
    ``n_columns - 1``. The scores are normalised to sum to one and used as
    weights in a linear combination of the rows.

    Parameters
    ----------
    components : array-like of shape (r, n)
        One component per row.

    Returns
    -------
    numpy.ndarray of shape (n,)
        ``components.T @ c`` where ``c`` holds the normalised scores.
    """
    components = np.asarray(components, dtype=float)

    # Deviations are taken from the grand mean of all components, computed once.
    # NOTE(review): a per-row mean would give the usual sample variance — confirm the intent.
    deviations = components - components.mean()
    s = (deviations ** 2).sum(axis=1) / (components.shape[1] - 1)

    # Normalise so the weights sum to one.
    c = s / s.sum()

    return components.T @ c

def NMF_dim_red(df_prices, n_comp=1, window_size=5):
    """Compress ``df_prices`` window by window with NMF and return the resulting returns.

    The rows of ``df_prices`` are cut into consecutive windows of
    ``window_size`` rows (the last window may be shorter). Each transposed
    window is factorised with ``NMF(n_components=n_comp)``; the transformed
    matrix ``W`` is combined into a single row by ``SVP``. The stacked rows
    are converted to percentage changes.

    ``n_comp`` is now passed to NMF. It used to be ignored in favour of a
    hard-coded 2, so a search over ``n_comp`` had no effect.

    Parameters
    ----------
    df_prices : pandas.DataFrame
        Non-negative input table, one column per asset.
    n_comp : int, default 1
        Number of NMF components fitted per window.
    window_size : int, default 5
        Number of rows per window.

    Returns
    -------
    pandas.DataFrame
        Percentage changes between consecutive windows with default integer
        row and column labels. The first row is all zeros (the undefined
        first change is filled with 0).
    """
    new_prices = []

    for start in range(0, len(df_prices.index), window_size):
        window = df_prices[start : start + window_size]
        model = NMF(n_components=n_comp)
        # Rows of the transposed window are assets, so W has one row per asset.
        W = model.fit_transform(window.T)
        new_prices.append(SVP(W.T))

    new_prices = np.vstack(new_prices)
    df_new_prices = pd.DataFrame(data=new_prices)
    new_returns = df_new_prices.apply(lambda x: x.pct_change(1).fillna(0), axis=0)
    return new_returns

In [19]:
def sharpe_ratio(returns):
    """Mean divided by standard deviation of ``returns - 1``.

    ``returns`` is shifted down by one before both statistics are taken with
    ``np.mean`` and ``np.std``.
    """
    shifted = returns - 1
    average = np.mean(shifted)
    spread = np.std(shifted)
    return average / spread

def VAR_ratio(returns, quantile=0.05):
    """Return the empirical ``quantile`` of ``returns`` via ``np.quantile``.

    Parameters
    ----------
    returns : array-like
        Sample values.
    quantile : float, default 0.05
        Probability level in [0, 1].
    """
    level = quantile
    return np.quantile(returns, q=level)

def sortino_ratio(returns, N=255,rf=0.01):
    """Annualised mean change over annualised deviation of the negative changes.

    ``returns`` is wrapped in a DataFrame and converted to period-over-period
    percentage changes. The numerator is ``mean * N - rf``; the denominator
    is the standard deviation of the negative changes only, times
    ``sqrt(N)``. The value for the first column is returned.

    Parameters
    ----------
    returns : Series-like
        Level series from which percentage changes are derived.
    N : int, default 255
        Annualisation factor (periods per year).
    rf : float, default 0.01
        Amount subtracted from the annualised mean.
    """
    changes = pd.DataFrame(returns).pct_change()
    first_column = changes.columns[0]

    annual_excess = changes.mean() * N - rf
    # Masking keeps only the negative changes; everything else becomes NaN and is skipped by std().
    downside = changes[changes < 0].std() * np.sqrt(N)

    ratio = annual_excess / downside
    return ratio[first_column]

In [20]:
def NPCA_param_search(prices_df, n_comp_list, window_size_list, cv, metric=sortino_ratio, num_iteration=2):
    """Grid-search ``n_comp`` and ``window_size`` for the nsprcomp-based reduction.

    For every admissible ``(n_comp, window_size)`` pair (pairs with
    ``n_comp > window_size`` are skipped) the data is split with
    ``cv.split(prices_df)``. On each split the training part is reduced with
    ``make_dim_red``, weights are obtained with ``return_weights``, the
    portfolio path on the test part is built with ``count_portfolio_return``
    and scored with ``metric``. The pair with the highest mean score wins.

    The function now uses its own ``prices_df`` and ``cv`` arguments. It used
    to read the notebook globals ``prices`` and ``tscv`` instead, silently
    ignoring what the caller passed.

    Parameters
    ----------
    prices_df : pandas.DataFrame
        Price table, one column per asset.
    n_comp_list : iterable of int
        Candidate numbers of components.
    window_size_list : iterable of int
        Candidate window sizes.
    cv : object with a ``split(X)`` method
        Splitter yielding ``(train_index, test_index)`` positional indices.
    metric : callable, default ``sortino_ratio``
        Maps a portfolio path to a score; higher is better.
    num_iteration : int, default 2
        How many times the whole set of splits is evaluated per pair.

    Returns
    -------
    best_params : list
        ``[n_comp, window_size]`` of the best pair; ``[0, 0]`` if none was scored.
    best_weights : list
        Weight vectors obtained for the best pair, one per successful split.
    best_ratio_list : list
        Scores obtained for the best pair, one per successful split.
    """
    # Start from -inf so the best pair is kept even when every score is below -1.
    best_ratio = -np.inf
    best_params = [0, 0]
    best_ratio_list = list()
    best_weights = list()

    for n_comp in tqdm(n_comp_list):
        for window_size in tqdm(window_size_list):
            if n_comp > window_size:
                continue

            ratio_list = list()
            weights_list = list()

            for _ in range(num_iteration):
                for train_index, test_index in cv.split(prices_df):
                    # The splitter yields positions, so select rows positionally.
                    train = prices_df.iloc[train_index]
                    test = prices_df.iloc[test_index]
                    test_returns = test.apply(lambda x: x.pct_change(1).fillna(0), axis=0)

                    try:
                        returns = make_dim_red(train, n_comp, window_size)
                        npca_w = return_weights(returns)
                        weights_list.append(npca_w)

                        npca_portfolio_return = count_portfolio_return(test_returns.columns.to_numpy(), npca_w, test_returns)
                        ratio_list.append(metric(npca_portfolio_return))
                    except Exception as e:
                        # Best effort: report the failing split and move on to the next one.
                        print('Error message: ', e)
                        continue

            if not ratio_list:
                # Every split failed for this pair; there is nothing to average.
                continue

            cur_ratio = np.mean(ratio_list)
            if cur_ratio > best_ratio:
                best_ratio = cur_ratio
                best_params[0] = n_comp
                best_params[1] = window_size
                best_ratio_list = ratio_list
                best_weights = weights_list

    return best_params, best_weights, best_ratio_list

In [21]:
def NMF_param_search(prices_df, n_comp_list, window_size_list, cv, metric=sortino_ratio, num_iteration=2):
    """Grid-search ``n_comp`` and ``window_size`` for the NMF-based reduction.

    For every admissible ``(n_comp, window_size)`` pair (pairs with
    ``n_comp > window_size`` are skipped) the data is split with
    ``cv.split(prices_df)``. On each split the training part is reduced with
    ``NMF_dim_red``, weights are obtained with ``return_weights``, the
    portfolio path on the test part is built with ``count_portfolio_return``
    and scored with ``metric``. The pair with the highest mean score wins.

    The function now uses its own ``prices_df`` and ``cv`` arguments. It used
    to read the notebook globals ``prices`` and ``tscv`` instead, silently
    ignoring what the caller passed.

    Parameters
    ----------
    prices_df : pandas.DataFrame
        Price table, one column per asset.
    n_comp_list : iterable of int
        Candidate numbers of components.
    window_size_list : iterable of int
        Candidate window sizes.
    cv : object with a ``split(X)`` method
        Splitter yielding ``(train_index, test_index)`` positional indices.
    metric : callable, default ``sortino_ratio``
        Maps a portfolio path to a score; higher is better.
    num_iteration : int, default 2
        How many times the whole set of splits is evaluated per pair.

    Returns
    -------
    best_params : list
        ``[n_comp, window_size]`` of the best pair; ``[0, 0]`` if none was scored.
    best_weights : list
        Weight vectors obtained for the best pair, one per successful split.
    best_ratio_list : list
        Scores obtained for the best pair, one per successful split.
    """
    # Start from -inf so the best pair is kept even when every score is below -1.
    best_ratio = -np.inf
    best_params = [0, 0]
    best_ratio_list = list()
    best_weights = list()

    for n_comp in tqdm(n_comp_list):
        for window_size in tqdm(window_size_list):
            if n_comp > window_size:
                continue

            ratio_list = list()
            weights_list = list()

            for _ in range(num_iteration):
                for train_index, test_index in cv.split(prices_df):
                    # The splitter yields positions, so select rows positionally.
                    train = prices_df.iloc[train_index]
                    test = prices_df.iloc[test_index]
                    test_returns = test.apply(lambda x: x.pct_change(1).fillna(0), axis=0)

                    try:
                        returns = NMF_dim_red(train, n_comp, window_size)
                        nmf_w = return_weights(returns)
                        weights_list.append(nmf_w)

                        nmf_portfolio_return = count_portfolio_return(test_returns.columns.to_numpy(), nmf_w, test_returns)
                        ratio_list.append(metric(nmf_portfolio_return))
                    except Exception as e:
                        # Best effort: report the failing split and move on to the next one.
                        print('Error message: ', e)
                        continue

            if not ratio_list:
                # Every split failed for this pair; there is nothing to average.
                continue

            cur_ratio = np.mean(ratio_list)
            if cur_ratio > best_ratio:
                best_ratio = cur_ratio
                best_params[0] = n_comp
                best_params[1] = window_size
                best_ratio_list = ratio_list
                best_weights = weights_list

    return best_params, best_weights, best_ratio_list