In [3]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import datetime

print(torch.cuda.is_available())

True


In [23]:
data_path = "./deep_learning/datasets/CharAll_na_rm_huge_train_variableall4_sentiment_full_new.npz"
split_lists = np.load('./deep_learning/sampling_folds/random_sampling_folds.npy', allow_pickle = True)
dataset = np.load(data_path)
data = dataset['data']

In [144]:
subset2col = {
	'flow+fund_mom+sentiment': list(range(56, 60))+[47],
	'fund_ex_mom_flow': [59]+ [x for x in range(46, 58) if x not in (list(range(54, 58))+[47])],
	'stock': range(46),
	'fund': range(46, 59),
	'fund+sentiment': range(46, 60),
	'stock+fund': range(59),
	'F_r12_2+sentiment': [58, 59],
	'stock+sentiment': [59]+list(range(0, 46)),
	'stock+fund+sentiment': range(60),
	'F_r12_2+flow+sentiment': [47, 58, 59]
}

def squeeze_data(data, UNK = -99.99):
	T, N, M = data.shape
	lists_considered = []    
	returns = data[:,:,0]    
	for i in range(N):      
		returns_i = returns[:,i]             
		if np.sum(returns_i!=UNK)>0:        
			lists_considered.append(i)         
	return data[:, lists_considered, :], lists_considered

class FirmChar:
	def __init__(self):
		self._category = ['Fund mom','Fund char', 'Fund Family', 'Sentiment']
		self._category2variables = {
			'Fund mom': ['F_ST_Rev', 'F_r2_1', 'F_r12_2'],
			'Fund char': ['ages', 'flow', 'exp_ratio', 'tna', 'turnover'],
			'Fund Family': ['Family_TNA', 'fund_no', 'Family_r12_2', 'Family_flow', 'Family_age'], 
			'Sentiment': ['sentiment', 'RecCFNAI', 'sentiment_lsq', 'sentiment_lad', 'CFNAI_orth', 'leading'], 
		}
		self._variable2category = {}
		for category in self._category:
			for var in self._category2variables[category]:
				self._variable2category[var] = category
		self._category2color = {
			'Fund mom': 'blue',
			'Fund char': 'plum',
			'Fund Family':'lime',
			'Sentiment':'darkgreen'
		}
		self._color2category = {value:key for key, value in self._category2color.items()}

	def getColorLabelMap(self):       
		return {var: self._category2color[self._variable2category[var]] for var in self._variable2category}


def get_data(data_path, split_list, subset):
	dataset = np.load(data_path)
	data = dataset['data']
	column_considered = [0]+[x+1 for x in subset2col[subset]]        
	data = data[:,:,column_considered]       
	data, list_considered = squeeze_data(data[split_list])
	return data, list_considered

def get_tensors(data, UNK = -99.99):
	ret = torch.tensor(data[:,:,0])
	individualFeature = torch.tensor(data[:,:,1:] )    
	macroFeature = torch.empty((data.shape[0], 0))
	mask = (ret != UNK)
	
	input_macro_tile = macroFeature.unsqueeze(1).repeat(1, ret.shape[1], 1)
	input_macro_masked = input_macro_tile[mask]
	input_masked = individualFeature[mask]
	input_concat = torch.concat([input_masked, input_macro_masked], dim=1)
	return_masked = ret[mask]
	
	return input_concat, return_masked, mask

def get_dataset(data_path, split_list, subset):
	datasets = []
	masks = []
	for split in split_list:
		data, list_considered = get_data(data_path, split, subset)
		input_concat, return_masked, mask = get_tensors(data)
		datasets.append(torch.utils.data.TensorDataset(input_concat, return_masked))
		masks.append(mask)
	return datasets, masks

def get_dataloader(datasets, batch_size, num_workers=4, shuffle=False):
	dataloaders = []
	for dataset in datasets:
		dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle)
		dataloaders.append(dataloader)
	return dataloaders

def get_crossval_dataloaders(data_path, split_lists, subset, batch_size, num_workers=4, shuffle=False):
	crossval_loaders = []
	masks = []
	for split_list in split_lists:	
		datasets, mask = get_dataset(data_path, split_list, subset)
		dataloaders = get_dataloader(datasets, batch_size, num_workers, shuffle)
		dict = {'datasets': datasets,
				'dataloaders': dataloaders}
		masks.append(mask)
		crossval_loaders.append(dict)
	return crossval_loaders, masks

def evaluate_sharpe(r_pred, r_masked, mask):
	portfolio = construct_long_short_portfolio(r_pred, r_masked, mask, low=0.1, high=0.1) # equally weighted
	return sharpe(portfolio)

def sharpe(r):
	return torch.mean(r / r.std())

def construct_decile_portfolios(w, R, mask, value=None, decile=10):
    N_i = torch.sum(mask.int(), dim=1)
    
    # Splitting the tensors based on cumulative sums
    w_split = torch.split(w, N_i.tolist())
    R_split = torch.split(R, N_i.tolist())

    value_weighted = False
    if value is not None:
        value_weighted = True
        value = value[mask]
        value_split = torch.split(value, N_i.tolist())

    portfolio_returns = []

    for j in range(mask.size(0)):  # Iterate over rows
        R_j = R_split[j]
        w_j = w_split[j]
        if value_weighted:
            value_j = value_split[j]
            R_w_j = [(R_j[k].item(), w_j[k].item(), value_j[k].item()) for k in range(N_i[j])]
        else:
            R_w_j = [(R_j[k].item(), w_j[k].item(), 1.0) for k in range(N_i[j])]

        # Sort by weights
        R_w_j_sorted = sorted(R_w_j, key=lambda t: t[1])

        n_decile = N_i[j] // decile
        R_decile = []
        for i in range(decile):
            R_decile_i = 0.0
            value_sum_i = 0.0
            for k in range(n_decile):
                R_decile_i += R_w_j_sorted[i * n_decile + k][0] * R_w_j_sorted[i * n_decile + k][2]
                value_sum_i += R_w_j_sorted[i * n_decile + k][2]
            R_decile.append(R_decile_i / value_sum_i)
        portfolio_returns.append(R_decile)

    return torch.tensor(portfolio_returns)


def construct_long_short_portfolio(w, R, mask, value=None, low=0.1, high=0.1, normalize=True):
	N_i = torch.sum(mask.int(), dim=1)

	# Splitting the tensors based on cumulative sums
	w_split = torch.split(w, N_i.tolist())
	R_split = torch.split(R, N_i.tolist())
	
	value_weighted = False
	if value is not None:
		value_weighted = True
		value_split = torch.split(value[mask], N_i.tolist())
	
	portfolio_returns = []
	
	for j in range(mask.size(0)):  # Iterate over rows
		R_j = R_split[j]
		w_j = w_split[j]
		if value_weighted:
			value_j = value_split[j]
			R_w_j = [(R_j[k].item(), w_j[k].item(), value_j[k].item()) for k in range(N_i[j])]
		else:
			R_w_j = [(R_j[k].item(), w_j[k].item(), 1.) for k in range(N_i[j])]
	
		# Sort by weights
		R_w_j_sorted = sorted(R_w_j, key=lambda t: t[1])
	
		# Calculate low and high portfolio returns
		n_low = int(low * N_i[j])
		n_high = int(high * N_i[j])
	
		portfolio_return_high = 0.0
		value_sum_high = 0.0
		if n_high > 0:
			for k in range(n_high):
				portfolio_return_high += R_w_j_sorted[-k - 1][0] * R_w_j_sorted[-k - 1][2]
				value_sum_high += R_w_j_sorted[-k - 1][2]
			if normalize:
				portfolio_return_high /= value_sum_high
	
		portfolio_return_low = 0.0
		value_sum_low = 0.0
		if n_low > 0:
			for k in range(n_low):
				portfolio_return_low += R_w_j_sorted[k][0] * R_w_j_sorted[k][2]
				value_sum_low += R_w_j_sorted[k][2]
			if normalize:
				portfolio_return_low /= value_sum_low
	
		portfolio_returns.append(portfolio_return_high - portfolio_return_low)
		
	return torch.tensor(portfolio_returns)

In [162]:
class Deep_Network(torch.nn.Module):
	'''
	The module class performs building network according to config
    '''
	def __init__(self, config):
		super(Deep_Network, self).__init__()
		# parses parameters of network from configuration
		self.dropout = config['dropout']
		self.num_layers = config['num_layers']
		self.hidden_dim = config['hidden_dim']
		self.input_dim = config['input_dim']
		
        # builds network
		self.hidden_layers = torch.nn.ModuleList()
		for i in range(self.num_layers):
			input_dim = self.input_dim if i == 0 else self.hidden_dim[i-1]
			self.hidden_layers.append(torch.nn.Linear(input_dim, self.hidden_dim[i]))
		
		self.dropout_layer = torch.nn.Dropout(self.dropout)
		self.output_layer = torch.nn.Linear(self.hidden_dim[-1], 1)
	
	def forward(self, X):
		for layer in self.hidden_layers:
			X = layer(X)
			X = torch.nn.functional.relu(X)
			X = self.dropout_layer(X)
		return self.output_layer(X).squeeze(-1)

In [163]:
config = {
	'data_path': "./deep_learning/datasets/CharAll_na_rm_huge_train_variableall4_sentiment_full_new.npz",
	'split_lists_path': './deep_learning/sampling_folds/random_sampling_folds.npy',
	'subset': 'fund+sentiment',
	'num_layers': 1,
	'hidden_dim': [2**6],
	'dropout': 0.95,
	'learning_rate': 0.001,
	'epochs': 512,
	'weighted_loss': False,
	'reg_l1': 0.0,
	'reg_l2': 0.01,
	'batch_size': 2048,
	'criteria': 'Factor_sharpe',
	'random_seed': 15,
	'device': 'cuda' if torch.cuda.is_available() else 'cpu',
	'num_workers': 4
}

config['split_lists'] = np.load(config['split_lists_path'], allow_pickle = True)
config['input_dim'] = len(subset2col[config['subset']])

In [164]:
crossval_loaders, masks = get_crossval_dataloaders(config['data_path'], config['split_lists'], 'fund+sentiment', batch_size=config['batch_size'])

for i in range(len(crossval_loaders)):
	print('Cross-validation fold {}'.format(i+1))
	for j in crossval_loaders[i]['datasets']:
		print(len(j))
	print('')

Cross-validation fold 1
208161
68181
130816

Cross-validation fold 2
198258
68181
140719

Cross-validation fold 3
208191
63344
135623



In [172]:
fold = 0
train, val, test = crossval_loaders[fold]['datasets']
train_loader, val_loader, test_loader = crossval_loaders[fold]['dataloaders']
train_mask, val_mask, test_mask = masks[fold]

In [173]:
model = Deep_Network(config).to(device=config['device'])
print(model)

Deep_Network(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=14, out_features=64, bias=True)
  )
  (dropout_layer): Dropout(p=0.95, inplace=False)
  (output_layer): Linear(in_features=64, out_features=1, bias=True)
)


In [None]:
# Training function
torch.manual_seed(config['random_seed'])
torch.cuda.manual_seed(config['random_seed'])
torch.cuda.manual_seed_all(config['random_seed'])
np.random.seed(config['random_seed'])

optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=config['reg_l2'])
criterion = torch.nn.MSELoss()

best_model_state = model.state_dict()
best_val_loss = torch.tensor(float('inf'))
best_val_sharpe = torch.tensor(-float('inf'))
sharpe_train = []
sharpe_val = []
loss_train = []
loss_val = []

time_start = time.time()
for epoch in range(config['epochs']):
	epoch_loss = torch.tensor(0.0)
	y_train = []
	y_pred_train = []
	for i, (X, y) in enumerate(train_loader):
		X, y = X.float().to(config['device']), y.float().to(config['device'])
		optimizer.zero_grad()
		y_pred = model(X)
		
		loss = criterion(y_pred, y)	
		loss.backward()
		optimizer.step()	
		epoch_loss += loss.detach().cpu().item()
		
		y_train.append(y.detach().cpu())
		y_pred_train.append(y_pred.detach().cpu())
		
	epoch_loss /= i + 1
	y_train = torch.cat(y_train, dim=0)
	y_pred_train = torch.cat(y_pred_train, dim=0)
	
	sharpe_train_value = evaluate_sharpe(y_pred_train, y_train, train_mask)
	epoch_sharpe = sharpe_train_value.detach().cpu().item()
	sharpe_train.append(epoch_sharpe)
	loss_train.append(epoch_loss)
	
	# Validation
	with torch.no_grad():
		epoch_val_loss = torch.tensor(0.0)
		y_val = []
		y_pred_val = []
		for i, (X, y) in enumerate(val_loader):
			X, y = X.float().to(config['device']), y.float().to(config['device'])
			y_pred = model(X)
			loss = criterion(y_pred, y)
			epoch_val_loss += loss.detach().cpu().item()
			
			y_val.append(y.detach().cpu())
			y_pred_val.append(y_pred.detach().cpu())
			
	epoch_val_loss /= i + 1
	y_val = torch.cat(y_val, dim=0)
	y_pred_val = torch.cat(y_pred_val, dim=0)
	
	sharpe_val_value = evaluate_sharpe(y_pred_val, y_val, val_mask)
	epoch_val_sharpe = sharpe_val_value.detach().cpu().item()
	sharpe_val.append(epoch_val_sharpe)
	loss_val.append(epoch_val_loss)
	
	if epoch <= 50 or epoch % 10 == 0:
		print('Epoch {} - Training Loss: {:.8f}, Val Loss: {:.8f}, Train Sharpe: {:.8f}, Validation Sharpe: {:.8f}'.format(epoch+1, epoch_loss, epoch_val_loss, epoch_sharpe, epoch_val_sharpe))
	
	if config['criteria'] == 'Factor_sharpe':
		if epoch_val_sharpe > best_val_sharpe:
			best_val_sharpe = epoch_val_sharpe
			best_model_state = model.state_dict()
			print("Best model updated")
	
	elif epoch_val_loss < best_val_loss:
		best_val_loss = epoch_val_loss
		best_model_state = model.state_dict()
		print("Best model updated")

exp_path = './Experiments/'
exp_subset_path = os.path.join(exp_path, config['subset'])
if not os.path.exists(exp_subset_path):
	os.makedirs(exp_subset_path)

model.load_state_dict(best_model_state)
time_stamp = datetime.datetime.today().strftime('%Y%m%d_%H%M')
model_save_path = os.path.join(exp_subset_path, 'model' + str(fold+1) + '_' + config['subset'] + '_' + time_stamp + '.pth')
torch.save(model, model_save_path)

duration = time.time() - time_start
print('Training completed in {:.0f}m {:.0f}s'.format(duration // 60, duration % 60))

Epoch 1 - Training Loss: 0.06101711, Val Loss: 0.01324732, Train Sharpe: -0.10112385, Validation Sharpe: 0.05833856
Best model updated
Epoch 2 - Training Loss: 0.00481178, Val Loss: 0.00257534, Train Sharpe: 0.07646226, Validation Sharpe: 0.11010505
Best model updated
Epoch 3 - Training Loss: 0.00124002, Val Loss: 0.00100351, Train Sharpe: -0.03259716, Validation Sharpe: 0.10403872
Epoch 4 - Training Loss: 0.00063398, Val Loss: 0.00061113, Train Sharpe: 0.03984211, Validation Sharpe: 0.03449043


In [None]:
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(loss_train, label='Training Loss')
plt.plot(loss_val, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(sharpe_train, label='Training Sharpe')
plt.plot(sharpe_val, label='Validation Sharpe')
plt.xlabel('Epochs')
plt.ylabel('Sharpe Ratio')
plt.legend()
plt.show()

In [None]:
def train_one_epoch(model, train_loader, mask, optimizer, criterion, config):
	epoch_loss = torch.tensor(0.0)
	y_train = []
	y_pred_train = []
	
	for i, (X, y) in enumerate(train_loader):
		X, y = X.float().to(config['device']), y.float().to(config['device'])
		optimizer.zero_grad()
		y_pred = model(X)
		
		loss = criterion(y_pred, y)	
		loss.backward()
		optimizer.step()	
		epoch_loss += loss.detach().cpu().item()
		
		y_train.append(y.detach().cpu())
		y_pred_train.append(y_pred.detach().cpu())
	
	epoch_loss /= i + 1
	y_train = torch.cat(y_train, dim=0)
	y_pred_train = torch.cat(y_pred_train, dim=0)
	epoch_sharpe = evaluate_sharpe(y_pred_train, y_train, mask).detach().cpu().item()
	
	return epoch_loss, epoch_sharpe

def validate(model, val_loader, mask, criterion, config):
	# Validation
	with torch.no_grad():
		epoch_val_loss = torch.tensor(0.0)
		y_val = []
		y_pred_val = []
		for i, (X, y) in enumerate(val_loader):
			X, y = X.float().to(config['device']), y.float().to(config['device'])
			y_pred = model(X)
			
			loss = criterion(y_pred, y)
			epoch_val_loss += loss.detach().cpu().item()
			y_val.append(y.detach().cpu())
			y_pred_val.append(y_pred.detach().cpu())
			
	epoch_val_loss /= i + 1
	y_val = torch.cat(y_val, dim=0)
	y_pred_val = torch.cat(y_pred_val, dim=0)
	epoch_val_sharpe = evaluate_sharpe(y_pred_val, y_val, mask).detach().cpu().item()
	
	return epoch_val_loss, epoch_val_sharpe

def training(model, train_loader, val_loader, train_mask, val_mask, config, fold):
	torch.manual_seed(config['random_seed'])
	torch.cuda.manual_seed(config['random_seed'])
	torch.cuda.manual_seed_all(config['random_seed'])
	np.random.seed(config['random_seed'])
	
	optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=config['reg_l2'])
	criterion = torch.nn.MSELoss()
	best_model_state = model.state_dict()
	best_val_loss = torch.tensor(float('inf'))
	best_val_sharpe = torch.tensor(-float('inf'))
	
	sharpe_train = []
	sharpe_val = []
	loss_train = []
	loss_val = []
	
	time_start = time.time()
	for epoch in range(config['epochs']):
		epoch_loss, epoch_sharpe = train_one_epoch(model, train_loader, train_mask, optimizer, criterion, config)
		loss_train.append(epoch_loss)
		sharpe_train.append(epoch_sharpe)
		
		# Validation
		epoch_val_loss, epoch_val_sharpe = validate(model, val_loader, val_mask, criterion, config)
		loss_val.append(epoch_val_loss)
		sharpe_val.append(epoch_val_sharpe)
		
		if epoch <= 50 or epoch % 10 == 0:
			print('Epoch {} - Training Loss: {:.8f}, Val Loss: {:.8f}, Train Sharpe: {:.8f}, Validation Sharpe: {:.8f}'.format(epoch+1,
																															   epoch_loss,
																															   epoch_val_loss,
																															   epoch_sharpe,
																															   epoch_val_sharpe))

		if epoch_val_sharpe > best_val_sharpe and config['criteria'] == 'Factor_sharpe':
			best_val_sharpe = epoch_val_sharpe
			best_model_state = model.state_dict()
			print("Best model updated")
		
		elif epoch_val_loss < best_val_loss:
			best_val_loss = epoch_val_loss
			best_model_state = model.state_dict()
			print("Best model updated")
	
	exp_path = './Experiments/'
	exp_subset_path = os.path.join(exp_path, config['subset'])
	if not os.path.exists(exp_subset_path):
		os.makedirs(exp_subset_path)
	
	model.load_state_dict(best_model_state)
	time_stamp = datetime.datetime.today().strftime('%Y%m%d_%H%M')
	model_save_path = os.path.join(exp_subset_path, 'model' + str(fold+1) + '_' + config['subset'] + '_' + time_stamp + '.pth')
	torch.save(model, model_save_path)
	
	duration = time.time() - time_start
	print('Training completed in {:.0f}m {:.0f}s'.format(duration // 60, duration % 60))
	return model, loss_train, loss_val, sharpe_train, sharpe_val

In [None]:
crossval_loaders = get_crossval_dataloaders(config['data_path'], config['split_lists'], config['subset'], batch_size=config['batch_size'])

avg_loss = {'train': 0.0, 'val': 0.0, 'test': 0.0}
avg_sharpe = {'train': 0.0, 'val': 0.0, 'test': 0.0}

for fold in range(len(crossval_loaders)):
	train, val, test = crossval_loaders[fold]['datasets']
	train_loader, val_loader, test_loader = crossval_loaders[fold]['dataloaders']
	model = Deep_Network(config).to(device=config['device'])
	model, loss_train, loss_val, sharpe_train, sharpe_val = training(model, train_loader, val_loader, config, fold)
	test_loss, test_sharpe = validate(model, test_loader, torch.nn.MSELoss(), config)
	
	avg_loss['train'] += loss_train[-1]
	avg_loss['val'] += loss_val[-1]
	avg_loss['test'] += test_loss
	
	avg_sharpe['train'] += sharpe_train[-1]
	avg_sharpe['val'] += sharpe_val[-1]
	avg_sharpe['test'] += test_sharpe

avg_loss = {key: value / (fold+1) for key, value in avg_loss.items()}
avg_sharpe = {key: value / (fold+1) for key, value in avg_sharpe.items()}

In [None]:
def run_one_subset(config):
	print('Running subset: {}'.format(config['subset']))
	crossval_loaders = get_crossval_dataloaders(config['data_path'], config['split_lists'], config['subset'], batch_size=config['batch_size'])

	avg_loss = {'train': 0.0, 'val': 0.0, 'test': 0.0}
	avg_sharpe = {'train': 0.0, 'val': 0.0, 'test': 0.0}
	
	for fold in range(len(crossval_loaders)):
		print('Running fold no. {}'.format(fold+1))
		train_loader, val_loader, test_loader = crossval_loaders[fold]['dataloaders']
		train_mask, val_mask, test_mask = masks[fold]
		model = Deep_Network(config).to(device=config['device'])
		model, loss_train, loss_val, sharpe_train, sharpe_val = training(model, train_loader, val_loader, train_mask, val_mask, config, fold)
		test_loss, test_sharpe = validate(model, test_loader, test_mask, torch.nn.MSELoss(), config)
		
		avg_loss['train'] += loss_train[-1]
		avg_loss['val'] += loss_val[-1]
		avg_loss['test'] += test_loss
		
		avg_sharpe['train'] += sharpe_train[-1]
		avg_sharpe['val'] += sharpe_val[-1]
		avg_sharpe['test'] += test_sharpe
	
	avg_loss = {key: value / (fold+1) for key, value in avg_loss.items()}
	avg_sharpe = {key: value / (fold+1) for key, value in avg_sharpe.items()}
	return avg_loss, avg_sharpe

def run_all_subsets(config):
	losses = {}
	sharpes = {}
	for subset in subset2col.keys():
		config['subset'] = subset
		config['input_dim'] = len(subset2col[config['subset']])
		losses[subset], sharpes[subset] = run_one_subset(config)
	return losses, sharpes

In [None]:
if __name__ == "__main__":
	losses, sharpes = run_all_subsets(config)
	print(losses)
	print(sharpes)