# Value at risk - market factors

In this notebook, we show how to
- train a linear regression model for each instrument in the portfolio, using historical market factor data
- check that our market factors are not correlated and that their returns are normally distributed
- register all models with MLflow
- explore the use of pyfunc format

This notebook requires the following dependencies
- `mlflow`

# `STEP0` Configuration

In [None]:
%matplotlib inline

import pandas as pd
import math
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import mlflow
import mlflow.sklearn
import pandas as pd
import matplotlib.pyplot as plt

from numpy import savetxt

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes

from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType
from datetime import datetime, timedelta

In [None]:
# names of the tables used throughout this notebook
portfolio_table = "var_portfolio"
stock_table = "var_stock"
stock_return_table = "var_stock_return"
market_table = "var_market"
market_return_table = "var_market_return"
trial_table = "var_monte_carlo"

# training cut-off: only returns dated on or before this day are used
today_str = "2019-06-01"
today = F.to_date(F.lit(today_str)).cast(TimestampType())
mlflow.log_param("to_date", today_str)

# market indicators used as predictive features
feature_names = ["SP500", "NYSE", "OIL", "TREASURY", "DOWJONES"]

# `STEP1` Access data

In [None]:
def _history_versions(table_name):
  """Return the `version` column of the DESCRIBE HISTORY output for `table_name`."""
  return sql("DESCRIBE HISTORY " + table_name).select("version")

# highest version number recorded for the market returns table
versions_m_df = _history_versions(market_return_table)
delta_m_version = versions_m_df.toPandas()['version'].max()

# highest version number recorded for the stock returns table
versions_s_df = _history_versions(stock_return_table)
delta_s_version = versions_s_df.toPandas()['version'].max()

In [None]:
# keep only the returns dated on or before the training cut-off
f_ret = spark.table(market_return_table).where(F.col('date') <= today)
s_ret = spark.table(stock_return_table).where(F.col('date') <= today)

# market factors easily fit in memory and are required to build normal distribution of returns
# collect them to pandas and index the rows by date
f_ret_pdf = f_ret.toPandas().set_index('date')
mlflow.log_metric('x_size', f_ret_pdf.size)
f_ret_pdf.head(10)



# `STEP2` Evaluate market factors

In [None]:
# Spearman correlation between market factors, computed with pandas (market factors fit in memory)
# we assume market factors are not correlated (NASDAQ and SP500 are, so are OIL and TREASURY BONDS)
f_cor_pdf = f_ret_pdf.corr(min_periods=12, method='spearman')
sns.set(rc={'figure.figsize': (11, 8)})
sns.heatmap(data=f_cor_pdf, annot=True)

# persist the chart to disk before rendering it inline
plt.savefig('/tmp/factor_correlation.png')
plt.show()



# `STEP3` Train a model for each instrument

In [None]:
# pack the market factors into a single `features` array column,
# then join them with the instrument returns on `date`
# in real life, we should obviously split set into training / testing
x_train = (
  f_ret
    .withColumn("features", F.array(feature_names))
    .dropna()
    .select('date', 'features')
    .join(s_ret, 'date')
)

display(x_train)



In [None]:
# expand each market factor with a few non linear terms, as a simple example of non linear returns
def featurize(xs):
  """Return a flat list with four terms per input value, in input order.

  For every `x` in `xs` the terms are: `x`, the signed square
  `sign(x) * x**2`, the cube `x**3` and the signed square root
  `sign(x) * sqrt(|x|)`.
  """
  return [
    term
    for x in xs
    for term in (
      x,
      np.sign(x) * x**2,
      x**3,
      np.sign(x) * np.sqrt(abs(x)),
    )
  ]

# use pandas UDF to train multiple models (one for each instrument) in parallel
# the resulting dataframe will be the linear regression weights for each instrument
schema = StructType([StructField('ticker', StringType(), True), StructField('weights', ArrayType(FloatType()), True)])
@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def train_model(pdf):
  """Fit an OLS regression of one instrument's returns on featurized market factors.

  Parameters
  ----------
  pdf : pandas.DataFrame
    The rows of a single `ticker` group. Reads the columns `features`
    (one array of market factors per row), `return` and `ticker`.

  Returns
  -------
  pandas.DataFrame
    A single row with the columns of `schema`: `ticker` and `weights`, where
    `weights[0]` is the intercept followed by the coefficients of the
    featurized market factors.
  """
  # build market factor vectors
  X = np.array([featurize(row) for row in np.array(pdf['features'])])
  # add a constant - the intercept term for each instrument i.
  # has_constant='add' forces the intercept column even when a feature column is
  # itself a non-zero constant (e.g. a ticker with a single row); the default
  # 'skip' would silently drop the intercept and shift every weight by one
  X = sm.add_constant(X, prepend=True, has_constant='add')
  y = np.array(pdf['return'])
  model = sm.OLS(y, X).fit()
  # columns are emitted in the same order as `schema`, so the result lines up
  # whether the output is matched by name or by position
  return pd.DataFrame({
    'ticker': [pdf['ticker'].iloc[0]],
    'weights': [model.params],
  })

# train one model per ticker; the collected weights easily fit in memory
# and are saved as our "uber model", serialized to json
models_df = (
  x_train
    .groupBy('ticker')
    .apply(train_model)
    .toPandas()
)
models_df.to_json("/tmp/models.json")



In [None]:
# prediction is the intercept plus the weighted sum of the featurized market factors
@udf("float")
def predict_udf(xs, ps):
  """Apply the weights `ps` to the featurized market factors `xs`.

  `ps[0]` is used as the starting value (the intercept) and `ps[i + 1]`
  multiplies the i-th term returned by `featurize(xs)`.
  """
  fs = featurize(xs)
  total = ps[0]
  for idx in range(len(fs)):
    total += ps[idx + 1] * fs[idx]
  return float(total)

# reload the weights serialized at the previous step as a spark dataframe
models_df = spark.createDataFrame(pd.read_json("/tmp/models.json"))

# attach each ticker's weights to its rows and compute predicted vs. actual return
prediction_df = (
  x_train
    .join(models_df, ['ticker'])
    .withColumn("predicted", predict_udf(F.col('features'), F.col('weights')))
    .withColumnRenamed('return', 'actual')
    .select('ticker', 'date', 'predicted', 'actual')
)



In [None]:
@udf("float")
def wsse_udf(p, a):
  """Return the squared difference between predicted value `p` and actual value `a`."""
  residual = p - a
  return float(residual**2)

# compare predicted vs. actual return
# sum of squared errors per instrument
wsse_df = (
  prediction_df
    .withColumn('wsse', wsse_udf(F.col('predicted'), F.col('actual')))
    .groupBy('ticker')
    .agg(F.sum('wsse'))
    .toPandas()
)

# bar chart of the summed squared error, one bar per instrument
ax = wsse_df.plot.bar(x='ticker', y='sum(wsse)', rot=0, label=None, figsize=(24,5))
ax.get_legend().remove()
ax.set_title("Model WSSE for each instrument")
ax.set_ylabel("wsse")
plt.xticks(rotation=45)
plt.savefig("/tmp/model_wsse.png")
plt.show()



In [None]:
# predicted vs. actual return over time for the EC instrument
df = prediction_df.filter(F.col('ticker') == "EC").toPandas()
ec_fig, ec_ax = plt.subplots(figsize=(20,8))
ec_ax.plot(df.date, df.actual)
ec_ax.plot(df.date, df.predicted, color='green', linestyle='--')
ec_ax.set_title('Log return of EC')
ec_ax.set_ylabel('log return')
ec_ax.set_xlabel('date')
plt.show()



# `STEP4` register model

In [None]:
# record which table versions the models were trained against
mlflow.log_param('delta.version.market', delta_m_version)
mlflow.log_param('delta.version.stocks', delta_s_version)

# attach the charts and the serialized models, then close the run
for artifact_file in ('/tmp/model_wsse.png', '/tmp/factor_correlation.png', '/tmp/models.json'):
  mlflow.log_artifact(artifact_file)
mlflow.end_run()



# `HOMEWORK` package model
We show how any function or model can easily be wrapped as an `mlflow.pyfunc` model and registered as such in the MLflow model registry. Real-life VaR models are obviously more complex than the simple linear regression described here and are not necessarily out-of-the-box sklearn or keras models. Still, they should follow the same ML development standards and can easily benefit from MLflow functionality, as long as one can express the model I/O as a `pd.Series`, `pd.DataFrame` or `np.array`.

In [None]:
import mlflow.pyfunc

class ModelRisk(mlflow.pyfunc.PythonModel):
  """Placeholder `mlflow.pyfunc` model; both hooks are stubs that only print 'TODO'."""

  def load_context(self, context):
    """Stub for loading model data from `context`; currently only prints 'TODO'."""
    # can load data from ml-flow context['data']
    # NOTE(review): mlflow presumably exposes logged artifacts as
    # context.artifacts['data'] rather than context['data'] - confirm
    print('TODO')

  def predict(self, context, input):
    """Stub prediction hook; prints 'TODO' and implicitly returns None."""
    # input can be a pd.Series, pd.DataFrame, np.array
    print('TODO')

# log the (stub) python model together with the serialized weights, then close the run
model_risk = ModelRisk()
artifacts = {"data": '/tmp/models.json'}
mlflow.pyfunc.log_model(
  artifact_path='model',
  python_model=model_risk,
  artifacts=artifacts,
)
mlflow.end_run()

