In [None]:
# Execute the contents of "requirements.txt" as Python source in this
# notebook's namespace. No other cell in view imports sqlite3, pd, time,
# product, ColumnTransformer, r2_score or mean_squared_error, so those names
# presumably come from this file — TODO confirm.
# NOTE(review): exec() runs whatever the file contains, so only use it with a
# trusted file. The name "requirements.txt" is conventionally used for pip
# requirement specifiers rather than Python code — confirm this is intentional.
with open("requirements.txt", "r") as config_file:
    config_code = config_file.read()
    exec(config_code)
%load_ext autoreload
%autoreload 2

In [None]:
def _read_table(table, con, prefix=None):
    """Read every row of ``table`` through ``con`` and parse "month" as dates.

    Parameters
    ----------
    table : str
        Table name. It is interpolated into the SQL text (SQLite cannot bind
        identifiers as parameters), so pass only trusted, hard-coded names.
    con : sqlite3.Connection
        Open database connection.
    prefix : str, optional
        When given, prepended to every column name (including "month").

    Returns
    -------
    pandas.DataFrame
    """
    frame = pd.read_sql_query(
        sql=f"SELECT * FROM {table}",
        con=con,
        # pandas documents parse_dates as a list or dict, so use a list.
        parse_dates=["month"],
    )
    return frame.add_prefix(prefix) if prefix else frame


tidy_finance = sqlite3.connect(database="data/specialedata.sqlite")

# Factor and predictor tables; each gets a prefix so its columns stay
# identifiable after merging.
macro_predictors = _read_table("macro_predictors", tidy_finance, prefix="macro_")
JKPFactors = _read_table("JKPFactors", tidy_finance, prefix="jkp_factor_")
JKPFactornames = JKPFactors.columns
factors_ff3_monthly = _read_table("factors_ff3_monthly", tidy_finance, prefix="factor_ff3_")
factors_ff5_monthly = _read_table("factors_ff5_monthly", tidy_finance, prefix="factor_ff5_")
ff_carhart = _read_table("ff_carhart", tidy_finance, prefix="ff_carhart_")

# CRSP tables, loaded without a prefix.
# presumably the numeric suffix is the number of tickers in the cross-section — TODO confirm
crsp_2000 = _read_table("crsp_2000", tidy_finance)
crsp_1500 = _read_table("crsp_1500", tidy_finance)
crsp_1000 = _read_table("crsp_1000", tidy_finance)
crsp_500 = _read_table("crsp_500", tidy_finance)
crsp_250 = _read_table("crsp_250", tidy_finance)
crsp_100 = _read_table("crsp_100", tidy_finance)
crsp_50 = _read_table("crsp_50", tidy_finance)

# Pick the cross-section here: swap crsp_2000 for another crsp_<N> frame.
# The FF5 factors are joined on month, the excess return is computed against
# the FF5 risk-free rate, and rows with any missing value are discarded.
data_total = (
    crsp_2000
    .merge(
        factors_ff5_monthly,
        how="left",
        left_on="month",
        right_on="factor_ff5_month",
    )
    .assign(ret_excess=lambda frame: frame["ret"] - frame["factor_ff5_rf"])
    .drop(columns=["ret", "factor_ff5_month"])
    .dropna()
)

# Column groups selected by substring match on the column name.
# NOTE(review): only the FF5 factors are merged in above, so these selections
# are non-empty only if the CRSP table itself carries "macro"/"jkp_factor"
# columns — confirm.
macro_variables = data_total.filter(like="macro").columns
factor_variables = data_total.filter(like="jkp_factor").columns
macro_factors = data_total[macro_variables]
factors = macro_factors.merge(
    data_total[factor_variables],
    left_index=True,
    right_index=True,
)

# Stock characteristics: market cap and momentum at several lags.
characteristic_columns = [
    'mktcap', 'mktcap_lag_1', 'mktcap_lag_3', 'mktcap_lag_6', 'mktcap_lag_12',
    'mom_1', 'mom_3', 'mom_6', 'mom_12',
]
char = data_total[characteristic_columns]

# Distinct tickers present after cleaning.
tickers = data_total['ticker'].unique()

# Interaction features: every macro column multiplied by every characteristic
# column. Iterating a DataFrame yields its column names, so each combination
# is a (macro column name, characteristic column name) pair.
column_combinations = list(product(macro_factors, char))

# Built in a comprehension so the loop variables do not leak into the notebook
# namespace; in particular the `char` DataFrame is not overwritten by a
# column-name string.
new_column_values = [
    data_total[macro_column] * data_total[char_column]
    for macro_column, char_column in column_combinations
]

column_names = [" x ".join(pair) for pair in column_combinations]
new_columns = pd.DataFrame(dict(zip(column_names, new_column_values)))

# Original columns plus the interaction columns, aligned on the row index.
data = pd.concat([data_total, new_columns], axis=1)

# The target and the identifier columns are excluded from scaling; every other
# column of `data` is standardised, and anything not listed is dropped.
non_feature_columns = ["ret_excess", "month", "ticker"]
scaled_columns = [
    name for name in data.columns if name not in non_feature_columns
]

preprocessor = ColumnTransformer(
    transformers=[("scale", StandardScaler(), scaled_columns)],
    remainder="drop",
    verbose_feature_names_out=False,
)

# The functions below compare row dates against this same value to separate
# training rows (before) from test rows (on or after).
training_date = "2017-07-01"

### Random forest (RF)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Fama-French five-factor frame: identifiers, target and the five factors,
# with the "factor_ff5_" prefix stripped from the factor names.
ff5_source_columns = [
    'month', 'ticker', 'ret_excess',
    'factor_ff5_mkt_excess', 'factor_ff5_smb', 'factor_ff5_hml',
    'factor_ff5_rmw', 'factor_ff5_cma',
]
colnames = ['month','ticker','ret_excess','mkt_excess','smb','hml','rmw','cma']
data_ff5 = data[ff5_source_columns]
data_ff5.columns = colnames

# Three-factor frame: the first six columns (through hml).
data_ff3 = data_ff5.iloc[:, :6]

# CAPM frame: the first four columns (through mkt_excess).
data_capm = data_ff3.iloc[:, :4]

# Carhart frame: the three-factor frame joined with the ff_carhart table on
# month; the overlapping ff_carhart columns are dropped after the join.
# assumes the only ff_carhart column left is the momentum factor, since seven
# names are assigned below — TODO confirm against the table schema
carhart_redundant_columns = [
    'ff_carhart_mkt_excess', 'ff_carhart_month', 'ff_carhart_smb',
    'ff_carhart_hml', 'ff_carhart_rf',
]
data_carhart = (
    data_ff3
    .merge(ff_carhart, how="left", left_on="month", right_on="ff_carhart_month")
    .drop(columns=carhart_redundant_columns)
    .dropna()
)
data_carhart.columns = ['month','ticker','ret_excess','mkt_excess','smb','hml','mom']


In [None]:
def _index_and_split(frame):
    """Index ``frame`` by month and separate the target from the features.

    Returns a 3-tuple: the month-indexed copy of ``frame``, its 'ret_excess'
    column (target), and the remaining columns with 'ret_excess' and 'ticker'
    removed (features). ``frame`` itself is not modified.
    """
    indexed = frame.set_index('month')
    target = indexed['ret_excess']
    features = indexed.drop(columns=['ret_excess', 'ticker'])
    return indexed, target, features


# CAPM
df_capm, y_capm, X_capm = _index_and_split(data_capm)

# Fama-French three-factor model
df_ff3, y_ff3, X_ff3 = _index_and_split(data_ff3)

# Fama-French five-factor model
df_ff5, y_ff5, X_ff5 = _index_and_split(data_ff5)

# Carhart four-factor model. The target and features must come from the
# Carhart frame; taking them from df_ff5 would train the "Carhart" models on
# the five-factor data.
df_carhart, y_carhart, X_carhart = _index_and_split(data_carhart)

def preprocess_data(X, y, training_date="2017-07-01"):
    """Split ``X``/``y`` at ``training_date`` and build scaled polynomial features.

    Rows whose index is before ``training_date`` form the training set; rows
    on or after it form the test set. The scaler is fitted on the training
    rows only and then applied to the test rows, so test-period statistics do
    not leak into the training features.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns; its index is compared against ``training_date``.
    y : pandas.Series
        Target; its index is compared against ``training_date``.
    training_date : str, default "2017-07-01"
        First date that belongs to the test set.

    Returns
    -------
    X_train, X_test
        Standardised degree-2 polynomial features as returned by
        ``PolynomialFeatures``. ``X_test`` has zero rows when no row is on or
        after ``training_date``.
    y_train, y_test
        The matching slices of ``y``.

    Raises
    ------
    ValueError
        If no row of ``X`` precedes ``training_date``.
    """
    X_train_raw = X[X.index < training_date]
    X_test_raw = X[X.index >= training_date]
    if len(X_train_raw) == 0:
        raise ValueError(
            f"no observations before training_date={training_date!r}; "
            "cannot fit the scaler"
        )

    scaler = StandardScaler()
    poly = PolynomialFeatures(degree=2)

    # Fit on the training window only.
    X_train = poly.fit_transform(scaler.fit_transform(X_train_raw))

    if len(X_test_raw):
        X_test = poly.transform(scaler.transform(X_test_raw))
    else:
        # Keep the column count consistent when there is nothing to transform.
        X_test = X_train[:0]

    y_train = y[y.index < training_date]
    y_test = y[y.index >= training_date]
    return X_train, X_test, y_train, y_test
    

# Train/test design matrices and targets for each factor model.
(X_train_capm, X_test_capm,
 y_train_capm, y_test_capm) = preprocess_data(X=X_capm, y=y_capm)

(X_train_ff3, X_test_ff3,
 y_train_ff3, y_test_ff3) = preprocess_data(X=X_ff3, y=y_ff3)

(X_train_ff5, X_test_ff5,
 y_train_ff5, y_test_ff5) = preprocess_data(X=X_ff5, y=y_ff5)

(X_train_carhart, X_test_carhart,
 y_train_carhart, y_test_carhart) = preprocess_data(X=X_carhart, y=y_carhart)

In [None]:
# CAPM: fit one random forest per ensemble size, then score each on the test set.
num_estimators = [100, 150, 200]

models_capm = {}       # fitted forests keyed by "model_<n_estimators>"
predictions_capm = {}  # stays empty; nothing in this cell fills it

for n in num_estimators:
    label = f'model_{n}'
    start = time.time()
    # fit() returns the fitted estimator itself
    model = RandomForestRegressor(n_estimators=n, random_state=42).fit(
        X_train_capm, y_train_capm
    )
    end = time.time()
    print(f'{label}: {round((end - start)/60,3)} minutes')
    models_capm[label] = model
print('------------------')

# Out-of-sample metrics per fitted forest.
evaluation_results = {}
for model_name, model in models_capm.items():
    y_pred_capm = model.predict(X_test_capm)
    mse = mean_squared_error(y_test_capm, y_pred_capm)
    r2_capm = r2_score(y_test_capm, y_pred_capm)
    evaluation_results[model_name] = {
        'Mean Squared Error': mse,
        'R^2 Score': r2_capm,
    }

# Report, rounded to four decimals.
for model_name, results in evaluation_results.items():
    rounded_mse = round(results['Mean Squared Error'], 4)
    rounded_r2 = round(results['R^2 Score'], 4)
    print(f"capm Evaluation results for {model_name}:")
    print("Mean Squared Error:", rounded_mse)
    print("R^2 Score:", rounded_r2)
    print()

In [None]:
# FF3: fit one random forest per ensemble size, then score each on the test set.
num_estimators = [100, 150, 200]

models_ff3 = {}       # fitted forests keyed by "model_<n_estimators>"
predictions_ff3 = {}  # stays empty; nothing in this cell fills it

for n in num_estimators:
    start = time.time()

    model = RandomForestRegressor(n_estimators=n, random_state=42)
    model.fit(X_train_ff3, y_train_ff3)
    end = time.time()
    print(f'model_{n}: {round((end - start)/60,3)} minutes')

    models_ff3[f'model_{n}'] = model
print('------------------')

# Out-of-sample metrics per fitted forest.
evaluation_results = {}
for model_name, model in models_ff3.items():
    y_pred_ff3 = model.predict(X_test_ff3)
    mse = mean_squared_error(y_test_ff3, y_pred_ff3)
    r2_ff3 = r2_score(y_test_ff3, y_pred_ff3)
    evaluation_results[model_name] = {'Mean Squared Error': mse, 'R^2 Score': r2_ff3}

# Report both stored metrics, in the same layout as the CAPM, Carhart and FF5
# cells (the MSE was computed and stored here but never printed).
for model_name, results in evaluation_results.items():
    print(f"ff3 Evaluation results for {model_name}:")
    print("Mean Squared Error:", round(results['Mean Squared Error'],4))
    print("R^2 Score:", round(results['R^2 Score'],4))
    print()



In [None]:
# Carhart: fit one random forest per ensemble size, then score each on the test set.
num_estimators = [100, 150, 200]

models_carhart = {}       # fitted forests keyed by "model_<n_estimators>"
predictions_carhart = {}  # stays empty; nothing in this cell fills it

for n in num_estimators:
    label = f'model_{n}'
    start = time.time()
    # fit() returns the fitted estimator itself
    model = RandomForestRegressor(n_estimators=n, random_state=42).fit(
        X_train_carhart, y_train_carhart
    )
    end = time.time()
    print(f'{label}: {round((end - start)/60,3)} minutes')
    models_carhart[label] = model
print('------------------')

# Out-of-sample metrics per fitted forest.
evaluation_results = {}
for model_name, model in models_carhart.items():
    y_pred_carhart = model.predict(X_test_carhart)
    mse = mean_squared_error(y_test_carhart, y_pred_carhart)
    r2_carhart = r2_score(y_test_carhart, y_pred_carhart)
    evaluation_results[model_name] = {
        'Mean Squared Error': mse,
        'R^2 Score': r2_carhart,
    }

# Report, rounded to four decimals.
for model_name, results in evaluation_results.items():
    rounded_mse = round(results['Mean Squared Error'], 4)
    rounded_r2 = round(results['R^2 Score'], 4)
    print(f"carhart Evaluation results for {model_name}:")
    print("Mean Squared Error:", rounded_mse)
    print("R^2 Score:", rounded_r2)
    print()



In [None]:
# FF5: fit one random forest per ensemble size, then score each on the test set.
num_estimators = [100, 150, 200]

models_ff5 = {}       # fitted forests keyed by "model_<n_estimators>"
predictions_ff5 = {}  # stays empty; nothing in this cell fills it

for n in num_estimators:
    label = f'model_{n}'
    start = time.time()
    # fit() returns the fitted estimator itself
    model = RandomForestRegressor(n_estimators=n, random_state=42).fit(
        X_train_ff5, y_train_ff5
    )
    end = time.time()
    print(f'{label}: {round((end - start)/60,3)} minutes')
    models_ff5[label] = model
print('------------------')

# Out-of-sample metrics per fitted forest.
evaluation_results = {}
for model_name, model in models_ff5.items():
    y_pred_ff5 = model.predict(X_test_ff5)
    mse = mean_squared_error(y_test_ff5, y_pred_ff5)
    r2_ff5 = r2_score(y_test_ff5, y_pred_ff5)
    evaluation_results[model_name] = {
        'Mean Squared Error': mse,
        'R^2 Score': r2_ff5,
    }

# Report, rounded to four decimals.
for model_name, results in evaluation_results.items():
    rounded_mse = round(results['Mean Squared Error'], 4)
    rounded_r2 = round(results['R^2 Score'], 4)
    print(f"ff5 Evaluation results for {model_name}:")
    print("Mean Squared Error:", rounded_mse)
    print("R^2 Score:", rounded_r2)
    print()

### Neural network (NN)

In [None]:
# Rebuild the factor-model frames from `data` for the neural-network section.

# Fama-French five-factor frame, with the "factor_ff5_" prefix stripped.
data_ff5 = data[['month','ticker','ret_excess','factor_ff5_mkt_excess','factor_ff5_smb','factor_ff5_hml','factor_ff5_rmw','factor_ff5_cma']]
colnames = ['month','ticker','ret_excess','mkt_excess','smb','hml','rmw','cma']
data_ff5.columns = colnames
# Three-factor frame: the first six columns (through hml).
data_ff3 = data_ff5.iloc[:, :6]
# CAPM frame: the first four columns (through mkt_excess).
data_capm = data_ff3.iloc[:, :4]

# CAPM target and features
df_capm = data_capm.set_index('month')
y_capm = df_capm['ret_excess']
X_capm = df_capm.drop(columns=['ret_excess', 'ticker'])

# FF3 target and features
df_ff3 = data_ff3.set_index('month')
y_ff3 = df_ff3['ret_excess']
X_ff3 = df_ff3.drop(columns=['ret_excess', 'ticker'])

# FF5 target and features. These must come from data_ff5; building them from
# the three-factor frame would make the "FF5" networks train on FF3 inputs.
df_ff5 = data_ff5.set_index('month')
y_ff5 = df_ff5['ret_excess']
X_ff5 = df_ff5.drop(columns=['ret_excess', 'ticker'])

# The Carhart splits are not rebuilt here; the cells below use whatever
# X_train_carhart / y_train_carhart an earlier cell left in the namespace.
X_train_capm, X_test_capm, y_train_capm, y_test_capm = preprocess_data(X_capm, y_capm)
X_train_ff3, X_test_ff3, y_train_ff3, y_test_ff3 = preprocess_data(X_ff3, y_ff3)
X_train_ff5, X_test_ff5, y_train_ff5, y_test_ff5 = preprocess_data(X_ff5, y_ff5)

In [None]:
def create_neural_network(layers, neurons, n_features):
    """Build an uncompiled feed-forward regression network.

    The network has exactly ``layers`` sigmoid hidden layers followed by a
    single linear output unit. The input is declared with ``keras.Input``
    rather than an extra ``Dense(neurons[0])`` layer, so ``neurons[0]`` is
    used once and the hidden-layer count matches ``layers``.

    Parameters
    ----------
    layers : int
        Number of hidden layers to add.
    neurons : sequence of int
        Units per hidden layer; entry ``i`` sizes hidden layer ``i + 1``.
        Entries beyond ``layers`` are ignored; fewer entries than ``layers``
        raises ``IndexError``.
    n_features : int
        Number of input features.

    Returns
    -------
    keras.Sequential
        The model, not yet compiled.
    """
    model = keras.Sequential()
    model.add(keras.Input(shape=(n_features,), name='input_layer'))

    for i in range(layers):
        model.add(keras.layers.Dense(neurons[i], activation='sigmoid', name=f'hidden_layer_{i+1}'))

    model.add(keras.layers.Dense(1, name='output_layer'))  # single linear output

    return model

In [None]:
# Input width of each network = number of columns in the matching training matrix.
(n_features_capm,
 n_features_ff3,
 n_features_ff5,
 n_features_carhart) = (
    design_matrix.shape[1]
    for design_matrix in (X_train_capm, X_train_ff3, X_train_ff5, X_train_carhart)
)

In [None]:
def _compiled_network(neurons, n_features):
    """Create a network with one hidden layer per entry of ``neurons`` and compile it.

    The layer count is taken from ``len(neurons)`` so it cannot drift from the
    neuron list; the model is compiled with the adam optimizer and
    mean-squared-error loss.
    """
    network = create_neural_network(len(neurons), neurons, n_features)
    network.compile(optimizer='adam', loss='mean_squared_error')
    return network


# model_<k>_<name> uses a k-entry neuron list. The model_6_* networks use all
# six entries (they were previously built with layers=5, which ignored the
# last entry).

# CAPM
model_4_capm = _compiled_network([2,2,2,1], n_features_capm)
model_5_capm = _compiled_network([2,2,2,2,1], n_features_capm)
model_6_capm = _compiled_network([2,2,2,2,2,1], n_features_capm)

# FF3
model_4_ff3 = _compiled_network([2,2,2,1], n_features_ff3)
model_5_ff3 = _compiled_network([2,2,2,2,1], n_features_ff3)
model_6_ff3 = _compiled_network([2,2,2,2,2,1], n_features_ff3)

# FF5
# NOTE(review): the 5- and 6-entry FF5 lists start with 4 units while every
# other list starts with 2 — kept as found; confirm this is intended.
model_4_ff5 = _compiled_network([2,2,2,1], n_features_ff5)
model_5_ff5 = _compiled_network([4,2,2,2,1], n_features_ff5)
model_6_ff5 = _compiled_network([4,2,2,2,2,1], n_features_ff5)

# Carhart
model_4_carhart = _compiled_network([2,2,2,1], n_features_carhart)
model_5_carhart = _compiled_network([2,2,2,2,1], n_features_carhart)
model_6_carhart = _compiled_network([2,2,2,2,2,1], n_features_carhart)

In [None]:
def NN(model, X_train, y_train, X_test, y_test,
       epochs=50, batch_size=32, validation_split=0.2):
    """Fit ``model``, score it on the test set and print the results.

    Parameters
    ----------
    model
        Compiled model exposing ``fit`` and ``predict``.
    X_train, y_train
        Training features and target.
    X_test, y_test
        Test features and target used for the out-of-sample metrics.
    epochs : int, default 50
    batch_size : int, default 32
    validation_split : float, default 0.2
        Passed straight to ``model.fit``; the defaults are the values this
        function previously hard-coded.

    Returns
    -------
    dict
        ``{'r2': ..., 'mspe': ...}`` — the out-of-sample R^2 and the mean
        squared prediction error. Only the R^2 is printed.
    """
    start = time.time()
    model.fit(
        X_train,
        y_train,
        epochs=epochs,
        verbose=0,
        batch_size=batch_size,
        validation_split=validation_split,
    )

    # Predictions on the held-out rows.
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mspe = mean_squared_error(y_test, y_pred)
    print(" Out-of-sample R^2:", round(r2,4))

    end = time.time()
    # Convert to minutes first, then round, so the printed value really has
    # three decimals.
    print(f'{round((end - start)/60,3)} minutes')
    return {'r2': r2, 'mspe': mspe}
# Compiled networks grouped per factor model, ordered by depth. These names
# held dicts of random forests in the RF cells and are rebound to lists here.
models_capm = [
    model_4_capm,
    model_5_capm,
    model_6_capm,
]
models_ff3 = [
    model_4_ff3,
    model_5_ff3,
    model_6_ff3,
]
models_ff5 = [
    model_4_ff5,
    model_5_ff5,
    model_6_ff5,
]
models_carhart = [
    model_4_carhart,
    model_5_carhart,
    model_6_carhart,
]

In [None]:
# CAPM: train and score each network in turn.
for model in models_capm:
    NN(
        model=model,
        X_train=X_train_capm,
        y_train=y_train_capm,
        X_test=X_test_capm,
        y_test=y_test_capm,
    )

In [None]:
# FF3: train and score each network in turn.
for model in models_ff3:
    NN(
        model=model,
        X_train=X_train_ff3,
        y_train=y_train_ff3,
        X_test=X_test_ff3,
        y_test=y_test_ff3,
    )

In [None]:
# Carhart: train and score each network in turn.
for model in models_carhart:
    NN(
        model=model,
        X_train=X_train_carhart,
        y_train=y_train_carhart,
        X_test=X_test_carhart,
        y_test=y_test_carhart,
    )

In [None]:
# FF5: train and score each network in turn.
for model in models_ff5:
    NN(
        model=model,
        X_train=X_train_ff5,
        y_train=y_train_ff5,
        X_test=X_test_ff5,
        y_test=y_test_ff5,
    )