# Day One Notebook

In [3]:
import numpy as np
import pandas as pd

### Data Loading and Processing

In [5]:
df = pd.read_csv('../data/train.csv')
df.info()
df.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8990 entries, 0 to 8989
Data columns (total 98 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   date_id                        8990 non-null   int64  
 1   D1                             8990 non-null   int64  
 2   D2                             8990 non-null   int64  
 3   D3                             8990 non-null   int64  
 4   D4                             8990 non-null   int64  
 5   D5                             8990 non-null   int64  
 6   D6                             8990 non-null   int64  
 7   D7                             8990 non-null   int64  
 8   D8                             8990 non-null   int64  
 9   D9                             8990 non-null   int64  
 10  E1                             7206 non-null   float64
 11  E10                            7984 non-null   float64
 12  E11                            7984 non-null   f

Unnamed: 0,date_id,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,V3,V4,V5,V6,V7,V8,V9,forward_returns,risk_free_rate,market_forward_excess_returns
8985,8985,0,0,0,0,0,0,0,0,0,...,0.469577,0.837963,1.226772,0.822751,-0.707361,0.142857,-0.649616,0.002457,0.000155,0.00199
8986,8986,0,0,0,0,0,0,0,0,0,...,0.671958,0.837963,0.785877,0.805556,-0.715692,0.196098,-0.668289,0.002312,0.000156,0.001845
8987,8987,0,0,1,0,0,0,0,0,0,...,0.481481,0.787698,0.834898,0.823413,-0.723949,0.133929,-0.670946,0.002891,0.000156,0.002424
8988,8988,0,0,0,0,0,0,0,0,0,...,0.655423,0.78373,0.994026,0.851852,-0.684937,0.101852,-0.646265,0.00831,0.000156,0.007843
8989,8989,0,0,0,0,0,0,0,0,0,...,0.066799,0.78373,1.068037,0.87963,-0.764806,0.079034,-0.705662,9.9e-05,0.000156,-0.000368


In [6]:
# Create lagged features
df['forward_returns_lag1'] = df['forward_returns'].shift(1)
df['risk_free_rate_lag1'] = df['risk_free_rate'].shift(1)
df['market_forward_excess_returns_lag1'] = df['market_forward_excess_returns'].shift(1)
df = df.drop(['risk_free_rate', 'market_forward_excess_returns'], axis=1)

df.tail()

Unnamed: 0,date_id,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,V4,V5,V6,V7,V8,V9,forward_returns,forward_returns_lag1,risk_free_rate_lag1,market_forward_excess_returns_lag1
8985,8985,0,0,0,0,0,0,0,0,0,...,0.837963,1.226772,0.822751,-0.707361,0.142857,-0.649616,0.002457,-0.002896,0.000159,-0.003365
8986,8986,0,0,0,0,0,0,0,0,0,...,0.837963,0.785877,0.805556,-0.715692,0.196098,-0.668289,0.002312,0.002457,0.000155,0.00199
8987,8987,0,0,1,0,0,0,0,0,0,...,0.787698,0.834898,0.823413,-0.723949,0.133929,-0.670946,0.002891,0.002312,0.000156,0.001845
8988,8988,0,0,0,0,0,0,0,0,0,...,0.78373,0.994026,0.851852,-0.684937,0.101852,-0.646265,0.00831,0.002891,0.000156,0.002424
8989,8989,0,0,0,0,0,0,0,0,0,...,0.78373,1.068037,0.87963,-0.764806,0.079034,-0.705662,9.9e-05,0.00831,0.000156,0.007843


In [7]:
# Create feature tensor X and target tensor y
import torch

# Define features (exclude target and identifier columns)
feature_cols = [col for col in df.columns if col not in ['forward_returns', 'date_id']]
target_col = 'forward_returns'

# Create tensors
X = torch.tensor(df[feature_cols].values, dtype=torch.float32)
y = torch.tensor(df[target_col].values, dtype=torch.float32)

print(f"Feature tensor X shape: {X.shape}")
print(f"Target tensor y shape: {y.shape}")
print(f"Number of features: {len(feature_cols)}")
print(f"Number of samples: {len(df)}")

print(f"\nFirst 5 feature names:")
print(feature_cols[:5])
print(f"Last 5 feature names:")
print(feature_cols[-5:])

Feature tensor X shape: torch.Size([8990, 97])
Target tensor y shape: torch.Size([8990])
Number of features: 97
Number of samples: 8990

First 5 feature names:
['D1', 'D2', 'D3', 'D4', 'D5']
Last 5 feature names:
['V8', 'V9', 'forward_returns_lag1', 'risk_free_rate_lag1', 'market_forward_excess_returns_lag1']


In [12]:
# Import the custom loss function
import sys
sys.path.append('../')
from loss_fnqs.loss_functions import vol_penalized_sharpe
loss = vol_penalized_sharpe

### Normal equations and ARIMA for baseline

In [9]:
# Normal Equations Implementation
# Convert tensors to numpy for normal equations
X_np = X.numpy()
y_np = y.numpy()

# Handle missing values by using only complete cases
mask = ~np.isnan(X_np).any(axis=1) & ~np.isnan(y_np)
X_clean = X_np[mask]
y_clean = y_np[mask]

print(f"Original samples: {len(X_np)}")
print(f"Clean samples (no NaN): {len(X_clean)}")

# Add bias term (intercept)
X_with_bias = np.column_stack([np.ones(len(X_clean)), X_clean])



# Normal equations: β = (X^T X)^(-1) X^T y
XTX = X_with_bias.T @ X_with_bias
XTy = X_with_bias.T @ y_clean

# Check if matrix is invertible
print(f"Matrix condition number: {np.linalg.cond(XTX):.2e}")

# Use pseudo-inverse for numerical stability
beta = np.linalg.pinv(XTX) @ XTy

print(f"Coefficients shape: {beta.shape}")
print(f"Intercept: {beta[0]:.6f}")
print(f"First 5 feature coefficients: {beta[1:6]}")

# Calculate R-squared
y_pred = X_with_bias @ beta
ss_res = np.sum((y_clean - y_pred) ** 2)
ss_tot = np.sum((y_clean - np.mean(y_clean)) ** 2)
r_squared = 1 - (ss_res / ss_tot)

print(f"R-squared: {r_squared:.6f}")
print(f"RMSE: {np.sqrt(ss_res / len(y_clean)):.6f}")

Original samples: 8990
Clean samples (no NaN): 2021
Matrix condition number: 4.03e+17
Coefficients shape: (98,)
Intercept: 0.037848
First 5 feature coefficients: [ 3.60388201e-04  3.59137735e-04  1.35124485e-04 -6.80661350e-05
 -1.38620007e-03]
R-squared: 0.094985
RMSE: 0.010272
