In [1]:
import os
import numpy as np
import pandas as pd
#import xgboost as xgb
import matplotlib.pyplot as plt
#from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.base import BaseEstimator
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from contextualized.easy import ContextualizedRegressor

# Time series decomposition
from statsmodels.tsa.seasonal import seasonal_decompose

# Chart drawing
import plotly as py
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Mute sklearn warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

# Show charts when running kernel
init_notebook_mode(connected=True)

# Change default background color for all visualizations
layout=go.Layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(250,250,250,0.8)')
fig = go.Figure(layout=layout)
templated_fig = pio.to_templated(fig)
pio.templates['my_template'] = templated_fig.layout.template
pio.templates.default = 'my_template'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file_path = '/Users/edocampione/Desktop/Meng Engineering Science/4YP/scripts/dati_trattati.csv' # cleaned data
file_path = '/Users/edocampione/Desktop/Meng Engineering Science/4YP/scripts/dati_with_context.csv' # cleaned data
df = pd.read_csv(file_path)
df['datadate'] = pd.to_datetime(df['datadate'])
df['datadate'] = df['datadate'] + pd.offsets.QuarterEnd(0)
df['divyield_Median'] = df['divyield_Median'].replace('%', '', regex=True).astype(float)

df = df.dropna(subset=['relative_quarterly_return'])

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11761 entries, 0 to 11812
Data columns (total 96 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   index                           11761 non-null  int64         
 1   gvkey                           11761 non-null  int64         
 2   datadate                        11761 non-null  datetime64[ns]
 3   tic                             11761 non-null  object        
 4   actq                            11761 non-null  float64       
 5   ancq                            11761 non-null  float64       
 6   epsfxq                          11748 non-null  float64       
 7   lctq                            11761 non-null  float64       
 8   ltq                             11761 non-null  float64       
 9   revtq                           11761 non-null  float64       
 10  capxy                           11295 non-null  float64       
 11  prccq  

In [3]:
#drop_features = ['capxy', 'cfm', 'PEG_trailing', 'de_ratio', 'divyield']

# Drop the specified columns
#df = df.drop(columns=drop_features)

df = df.loc[:, df.count() >= 11000]

# Drop rows with missing values
df = df.dropna()

# Display dataframe info
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 9795 entries, 73 to 11812
Data columns (total 90 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   index                           9795 non-null   int64         
 1   gvkey                           9795 non-null   int64         
 2   datadate                        9795 non-null   datetime64[ns]
 3   tic                             9795 non-null   object        
 4   actq                            9795 non-null   float64       
 5   ancq                            9795 non-null   float64       
 6   epsfxq                          9795 non-null   float64       
 7   lctq                            9795 non-null   float64       
 8   ltq                             9795 non-null   float64       
 9   revtq                           9795 non-null   float64       
 10  capxy                           9795 non-null   float64       
 11  prccq  

In [4]:
# Find the unique number of stocks in the dataframe
unique_gvkeys = df['tic'].unique()

# Print the result
print(f"Number of unique gvkeys: {len(unique_gvkeys)}")
print(unique_gvkeys)


Number of unique gvkeys: 76
['F' 'CVX' 'OXY' 'COP' 'XOM' 'BMY' 'ABT' 'PFE' 'MRK' 'JNJ' 'GE' 'MMM'
 'CAT' 'RTX' 'CL' 'PG' 'WBA' 'KO' 'CVS' 'PPL' 'ADM' 'APD' 'SPGI' 'BAX'
 'IBM' 'MCD' 'DIS' 'EXC' 'LMT' 'NOC' 'BA' 'MMC' 'LLY' 'ECL' 'LUV' 'INTC'
 'TMO' 'WDC' 'TSN' 'BRK.B' 'AAPL' 'KMB' 'RJF' 'AMD' 'AFL' 'VZ' 'T' 'UNH'
 'PGR' 'MSFT' 'MS' 'SCHW' 'HCA' 'REGN' 'QCOM' 'SBUX' 'JPM' 'WFC' 'AIG'
 'AMZN' 'AMT' 'EXPE' 'UPS' 'EQIX' 'NFLX' 'CME' 'GOOGL' 'FIS' 'MA' 'V' 'GS'
 'TSLA' 'META' 'ZTS' 'CTSH' 'PYPL']


In [5]:
df = df.dropna()
df = df.sort_values(by=['datadate'])

unique_gvkeys = df['tic'].unique()

# Print the result
print(f"Number of unique gvkeys: {len(unique_gvkeys)}")

df.info()

Number of unique gvkeys: 76
<class 'pandas.core.frame.DataFrame'>
Index: 9795 entries, 73 to 11812
Data columns (total 90 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   index                           9795 non-null   int64         
 1   gvkey                           9795 non-null   int64         
 2   datadate                        9795 non-null   datetime64[ns]
 3   tic                             9795 non-null   object        
 4   actq                            9795 non-null   float64       
 5   ancq                            9795 non-null   float64       
 6   epsfxq                          9795 non-null   float64       
 7   lctq                            9795 non-null   float64       
 8   ltq                             9795 non-null   float64       
 9   revtq                           9795 non-null   float64       
 10  capxy                           9795 non-null  

In [None]:
# List of industries
industries = ['DURBL', 'ENRGY', 'HLTH', 'MANUF', 'NODUR', 'OTHER', 'SHOPS', 'UTILS', 'HITEC', 'TELCM']

# Create binary columns
for industry in industries:
    df[industry] = (df['FFI10_desc'] == industry).astype(int)

In [6]:
tickers_to_drop = ['AMT', 'CTSH', 'GOOGL', 'META', 'PYPL', 'RJF', 'TSLA', 'ZTS']

#df = df[~df['tic'].isin(tickers_to_drop)]

In [7]:
train_start_date = '1970-03-31'
valid_start_date = '2010-03-31'
test_start_date = '2013-03-31'
end_date = '2023-09-30'

df = df.sort_values(by=['datadate'])

train_df = df[(df['datadate'] >= train_start_date) & (df['datadate'] < valid_start_date)]
valid_df = df[(df['datadate'] >= valid_start_date) & (df['datadate'] < test_start_date)]
test_df = df[(df['datadate'] >= test_start_date) & (df['datadate'] < end_date)]

test_df = test_df.sort_values(by=['datadate', 'next_prccq_change'], ascending=[True, False])
test_df.head(10)


Unnamed: 0,index,gvkey,datadate,tic,actq,ancq,epsfxq,lctq,ltq,revtq,...,de_ratio_Median,debt_ebitda_Median,debt_assets_Median,intcov_ratio_Median,cash_ratio_Median,curr_ratio_Median,quick_ratio_Median,at_turn_Median,inv_turn_Median,rect_turn_Median
9757,11660,184996,2013-03-31,TSLA,0.002334,0.04812,-1.0,-0.006464,-0.014447,0.833932,...,0.83,0.762,0.479,7.075,0.469,2.214,1.392,1.212,5.458,6.845
9805,11619,1161,2013-03-31,AMD,0.03223,-0.159078,-0.698413,-0.054402,-0.023108,-0.058009,...,4.466,2.521,0.823,5.217,0.54,1.654,1.523,0.187,2.317,1.304
9781,11666,23812,2013-03-31,REGN,0.083838,0.107948,-0.779412,0.206017,0.041366,0.060448,...,0.502,0.0,0.373,0.737,2.225,3.458,3.022,0.602,2.994,6.4
9820,11663,149070,2013-03-31,CME,0.410638,-0.000707,0.42,0.547441,0.197396,0.087305,...,4.466,2.521,0.823,5.217,0.54,1.654,1.523,0.187,2.317,1.304
9769,11649,11399,2013-03-31,WDC,0.022997,-0.024726,0.176471,-0.030066,-0.031327,-0.01569,...,0.58,0.046,0.376,5.823,1.139,2.45,2.035,0.795,4.363,6.177
9765,11628,12141,2013-03-31,MSFT,0.044098,0.037638,-0.052632,0.000595,0.023348,-0.045069,...,0.58,0.046,0.376,5.823,1.139,2.45,2.035,0.795,4.363,6.177
9821,11667,13988,2013-03-31,SCHW,0.0,0.0,0.0,0.0,-0.004031,0.032761,...,4.466,2.521,0.823,5.217,0.54,1.654,1.523,0.187,2.317,1.304
9785,11615,2285,2013-03-31,BA,0.038057,-0.019945,0.125,0.015206,-0.000506,-0.152856,...,0.954,1.417,0.514,8.063,0.453,2.362,1.458,1.037,4.677,6.859
9784,11613,7985,2013-03-31,NOC,-0.065062,-0.004573,-0.051402,-0.082728,-0.034294,-0.057443,...,0.954,1.417,0.514,8.063,0.453,2.362,1.458,1.037,4.677,6.859
9756,11636,4839,2013-03-31,F,0.0,0.0,0.0,0.0,0.00462,-0.002295,...,0.83,0.762,0.479,7.075,0.469,2.214,1.392,1.212,5.458,6.845


In [8]:
# Create a copy of the test results
results = test_df[['datadate', 'tic', 'next_prccq_change', 'next_relative_quarterly_return']].copy()

# Define columns to drop
drop_cols = ['gvkey', 'datadate', 'tic', 'spindx', 'spindx_change', 'next_spindx_change', 'prccq', 'prccq_change', 'next_prccq_change', 'FFI10_desc']

# Drop the specified columns from train, validation, and test datasets
train_df = train_df.drop(columns=drop_cols)
valid_df = valid_df.drop(columns=drop_cols)
test_df  = test_df.drop(columns=drop_cols)

# Display the first few rows of the training dataset
train_df.head()


Unnamed: 0,index,actq,ancq,epsfxq,lctq,ltq,revtq,capxy,roa,roe,...,de_ratio_Median,debt_ebitda_Median,debt_assets_Median,intcov_ratio_Median,cash_ratio_Median,curr_ratio_Median,quick_ratio_Median,at_turn_Median,inv_turn_Median,rect_turn_Median
73,169,0.0,0.0,-0.558442,0.0,0.0,-0.12818,0.0,0.211,0.089,...,0.828,1.436,0.468,6.364,0.221,2.371,1.157,1.652,3.7,7.404
80,158,0.0,0.0,0.259259,0.0,0.0,0.098502,0.0,0.231,0.113,...,0.562,0.765,0.362,16.601,0.392,2.395,1.52,1.281,2.598,6.677
85,163,0.0,0.0,0.011905,0.0,0.0,0.00758,0.0,0.334,0.137,...,0.777,1.549,0.444,6.242,0.218,2.438,1.239,1.348,3.922,6.718
84,155,0.0,0.0,0.06422,0.0,0.0,-0.008542,0.0,0.144,0.071,...,0.777,1.549,0.444,6.242,0.218,2.438,1.239,1.348,3.922,6.718
82,174,0.0,0.0,0.078947,0.0,0.0,0.0172,0.0,0.25,0.133,...,0.562,0.765,0.362,16.601,0.392,2.395,1.52,1.281,2.598,6.677


In [9]:
# Define target variable (y) and features (X) for train, validation, and test sets
y_train = train_df['next_relative_quarterly_return'].copy()
X_train = train_df.drop(columns=['next_relative_quarterly_return'])

y_valid = valid_df['next_relative_quarterly_return'].copy()
X_valid = valid_df.drop(columns=['next_relative_quarterly_return'])

y_test  = test_df['next_relative_quarterly_return'].copy()
X_test  = test_df.drop(columns=['next_relative_quarterly_return'])

# Display the first few rows of X_train
X_train.head()


Unnamed: 0,index,actq,ancq,epsfxq,lctq,ltq,revtq,capxy,roa,roe,...,de_ratio_Median,debt_ebitda_Median,debt_assets_Median,intcov_ratio_Median,cash_ratio_Median,curr_ratio_Median,quick_ratio_Median,at_turn_Median,inv_turn_Median,rect_turn_Median
73,169,0.0,0.0,-0.558442,0.0,0.0,-0.12818,0.0,0.211,0.089,...,0.828,1.436,0.468,6.364,0.221,2.371,1.157,1.652,3.7,7.404
80,158,0.0,0.0,0.259259,0.0,0.0,0.098502,0.0,0.231,0.113,...,0.562,0.765,0.362,16.601,0.392,2.395,1.52,1.281,2.598,6.677
85,163,0.0,0.0,0.011905,0.0,0.0,0.00758,0.0,0.334,0.137,...,0.777,1.549,0.444,6.242,0.218,2.438,1.239,1.348,3.922,6.718
84,155,0.0,0.0,0.06422,0.0,0.0,-0.008542,0.0,0.144,0.071,...,0.777,1.549,0.444,6.242,0.218,2.438,1.239,1.348,3.922,6.718
82,174,0.0,0.0,0.078947,0.0,0.0,0.0172,0.0,0.25,0.133,...,0.562,0.765,0.362,16.601,0.392,2.395,1.52,1.281,2.598,6.677


In [10]:
# Scaling
scaler = MinMaxScaler()

# Fit the scaler on the training/validation data
X_train_scaled = scaler.fit_transform(X_train)

# Now apply the same scaling to the val/test data using the already fitted scaler
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Convert the scaled data back to DataFrame for consistency
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_valid_scaled = pd.DataFrame(X_valid_scaled, columns=X_valid.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

#X_train_scaled.info()

print(X_train_scaled.columns.tolist())


['index', 'actq', 'ancq', 'epsfxq', 'lctq', 'ltq', 'revtq', 'capxy', 'roa', 'roe', 'pe_inc', 'pcf', 'roce', 'debt_ebitda', 'gpm', 'evm', 'cfm', 'npm', 'de_ratio', 'ptb', 'roa_change', 'roe_change', 'pe_inc_change', 'pcf_change', 'roce_change', 'debt_ebitda_change', 'gpm_change', 'evm_change', 'cfm_change', 'npm_change', 'de_ratio_change', 'ptb_change', 'relative_quarterly_return', 'relative_quarterly_return_-1', 'relative_quarterly_return_-2', 'relative_quarterly_return_-3', 'mktrf', 'smb', 'hml', 'rmw', 'cma', 'rf', 'umd', 'h_1_x', 'h_3_x', 'h_12_x', 'h_1_y', 'h_3_y', 'h_12_y', 'r_f', 'r_mkt', 'r_me', 'r_ia', 'r_roe', 'r_eg', 'indret_ew', 'CAPEI_Median', 'bm_Median', 'evm_Median', 'pe_inc_Median', 'ptb_Median', 'PEG_trailing_Median', 'divyield_Median', 'roa_Median', 'roce_Median', 'roe_Median', 'gpm_Median', 'npm_Median', 'opmad_Median', 'de_ratio_Median', 'debt_ebitda_Median', 'debt_assets_Median', 'intcov_ratio_Median', 'cash_ratio_Median', 'curr_ratio_Median', 'quick_ratio_Median',

In [11]:
# Define the Ridge model
ridge_model = Ridge()

# Define the parameter grid for alpha values
param_grid = {'alpha': [0.0, 0.1, 1.0, 10.0, 100.0, 1000, 2000, 5000, 10000]}

# Perform GridSearchCV to find the best alpha
grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Get the best alpha
best_alpha = grid_search.best_params_['alpha']
print(f"Best alpha: {best_alpha}")

# Train the Ridge model with the best alpha
ridge_model = Ridge(alpha=best_alpha)
ridge_model.fit(X_train_scaled, y_train)

Best alpha: 2000


In [12]:

context = ['relative_quarterly_return_-1', 'relative_quarterly_return_-2', 'relative_quarterly_return_-3', 'mktrf', 'smb', 'hml', 'rmw', 'cma', 'rf', 'umd', 'h_1_x', 'h_3_x', 'h_12_x', 'h_1_y', 'h_3_y', 'h_12_y', 'r_f', 'r_mkt', 'r_me', 'r_ia', 'r_roe', 'r_eg', 'indret_ew', 'DURBL', 'ENRGY', 'HLTH', 'MANUF', 'NODUR', 'OTHER', 'SHOPS', 'UTILS', 'HITEC', 'TELCM', 'CAPEI_Median', 'bm_Median', 'evm_Median', 'pe_inc_Median', 'ptb_Median', 'PEG_trailing_Median', 'divyield_Median', 'roa_Median', 'roce_Median', 'roe_Median', 'gpm_Median', 'npm_Median', 'opmad_Median', 'de_ratio_Median', 'debt_ebitda_Median', 'debt_assets_Median', 'intcov_ratio_Median', 'cash_ratio_Median', 'curr_ratio_Median', 'quick_ratio_Median', 'at_turn_Median', 'inv_turn_Median', 'rect_turn_Median']

C_train = X_train_scaled[context]
C_valid = X_valid_scaled[context]
C_test = X_test_scaled[context]

X_train_context = X_train_scaled.drop(columns=context)
X_valid_context = X_valid_scaled.drop(columns=context)
X_test_context = X_test_scaled.drop(columns=context)


medians = ['relative_quarterly_return_-1', 'relative_quarterly_return_-2', 'relative_quarterly_return_-3', 'mktrf', 'smb', 'hml', 'rmw', 'cma', 'rf', 'umd', 'h_1_x', 'h_3_x', 'h_12_x', 'h_1_y', 'h_3_y', 'h_12_y', 'r_f', 'r_mkt', 'r_me', 'r_ia', 'r_roe', 'r_eg', 'indret_ew']
C_train = C_train[medians].to_numpy()
C_valid = C_valid[medians].to_numpy()
C_test = C_test[medians].to_numpy()


In [13]:
contextualised_model = ContextualizedRegressor()
contextualised_model.fit(C_train, X_train_context.to_numpy(), y_train.to_numpy())



GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


`max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.


Checkpoint directory /Users/edocampione/Desktop/Meng Engineering Science/4YP/models/lightning_logs/boot_0_checkpoints exists and is not empty.


  | Name      | Type           | Params | Mode 
-----------------------------------------------------
0 | metamodel | NaiveMetamodel | 2.8 K  | train
-----------------------------------------------------
2.8 K     Trainable pa

In [None]:
from contextualized.utils import save, load

#save(contextualised_model, path='contextualized_model_without_medians.pt')

In [None]:
y_train_pred1 = ridge_model.predict(X_train_scaled)
y_valid_pred1 = ridge_model.predict(X_valid_scaled)
y_test_pred1 = ridge_model.predict(X_test_scaled)

# Classifier
y_train_pred_binary1 = (y_train_pred1 >= 0).astype(int)
y_valid_pred_binary1 = (y_valid_pred1 >= 0).astype(int)
y_test_pred_binary1 = (y_test_pred1 >= 0).astype(int)

y_train_binary1 = (y_train >= 0).astype(int)
y_valid_binary1 = (y_valid >= 0).astype(int)
y_test_binary1 = (y_test >= 0).astype(int)

print(y_test_pred1)

print(classification_report(y_train_binary1, y_train_pred_binary1))



[0.01792634 0.02033634 0.02203529 ... 0.01764    0.01869512 0.01872461]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      3385
           1       0.53      1.00      0.69      3851

    accuracy                           0.53      7236
   macro avg       0.27      0.50      0.35      7236
weighted avg       0.28      0.53      0.37      7236




Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
y_train_pred2 = contextualised_model.predict(C_train, X_train_context.to_numpy())
y_valid_pred2 = contextualised_model.predict(C_valid, X_valid_context.to_numpy())
y_test_pred2 = contextualised_model.predict(C_test, X_test_context.to_numpy())

# Classifier
y_train_pred_binary2 = (y_train_pred2 >= 0).astype(int)
y_valid_pred_binary2 = (y_valid_pred2 >= 0).astype(int)
y_test_pred_binary2 = (y_test_pred2 >= 0).astype(int)

y_train_binary2 = (y_train >= 0).astype(int)
y_valid_binary2 = (y_valid >= 0).astype(int)
y_test_binary2 = (y_test >= 0).astype(int)

print(y_test_pred2)

print(classification_report(y_train_binary2, y_train_pred_binary2))


The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.



[[ 0.18449655]
 [ 0.13006806]
 [ 0.07930416]
 ...
 [-0.00734259]
 [ 0.0002753 ]
 [-0.00653499]]
              precision    recall  f1-score   support

           0       0.75      0.09      0.16      3385
           1       0.55      0.97      0.70      3851

    accuracy                           0.56      7236
   macro avg       0.65      0.53      0.43      7236
weighted avg       0.64      0.56      0.45      7236



In [None]:
results['y_pred1'] = y_test_pred1
results['y_pred2'] = y_test_pred2
results1 = results.sort_values(by=['datadate', 'y_pred1'], ascending=[True, False])

results1.head(10)



Unnamed: 0,datadate,tic,next_prccq_change,next_relative_quarterly_return,y_pred1,y_pred2
9781,2013-03-31,REGN,0.274808,0.241016,0.022035,0.079304
9780,2013-03-31,BMY,0.084972,0.051179,0.021826,0.048866
9774,2013-03-31,ABT,-0.012458,-0.04625,0.021815,0.064781
9776,2013-03-31,MRK,0.050905,0.017112,0.021768,0.044113
9779,2013-03-31,JNJ,0.053109,0.019317,0.021726,0.026424
9773,2013-03-31,PFE,-0.029453,-0.063245,0.021704,0.018033
9775,2013-03-31,LLY,-0.135059,-0.168852,0.021701,0.021109
9803,2013-03-31,UPS,0.006752,-0.027041,0.020396,0.081142
9805,2013-03-31,AMD,0.6,0.566207,0.020336,0.130068
9815,2013-03-31,EQIX,-0.14604,-0.179833,0.020204,0.034964


In [None]:
def softmax_weighted_return(df, sort_col, target_col, n=5, largest=True):
    if largest:
        selected = df.nlargest(n, sort_col)
    else:
        selected = df.nsmallest(n, sort_col)
    # Compute softmax weights from the predicted scores:
    e_scores = np.exp(selected[sort_col])
    weights = e_scores / e_scores.sum()
    return (selected[target_col] * weights).sum()

def exponential_weighted_return(df, sort_col, target_col, n=10, largest=True, alpha=0.5):
    if largest:
        selected = df.nlargest(n, sort_col)
    else:
        selected = df.nsmallest(n, sort_col)
    selected = selected.sort_values(by=sort_col, ascending=not largest).copy()
    selected['rank'] = range(1, len(selected) + 1)
    # Exponential weights: weight = exp(-alpha * (rank - 1))
    selected['weight'] = np.exp(-alpha * (selected['rank'] - 1))
    selected['weight'] /= selected['weight'].sum()  # Normalize weights
    return (selected[target_col] * selected['weight']).sum()


# Baseline aggregation remains unchanged.
df_avg = results.groupby('datadate').agg(
    stocks=('next_prccq_change', 'size'),
    baseline=('next_prccq_change', 'mean')
).reset_index()

# For the predicted top 10, weight by predicted returns ('pred_rank_score')
df_avg['top_10'] = results.groupby('datadate').apply(
    lambda group: exponential_weighted_return(
        group,
        sort_col='y_pred1',
        target_col='next_prccq_change',
        largest=True
    )
).reset_index(level=0, drop=True)

# For the predicted top 10, weight by predicted returns ('pred_rank_score')
df_avg['top_10_context'] = results.groupby('datadate').apply(
    lambda group: exponential_weighted_return(
        group,
        sort_col='y_pred2',
        target_col='next_prccq_change',
        largest=True
    )
).reset_index(level=0, drop=True)

# For the ideal top 10, weight by the actual returns
df_avg['top_10_ideal'] = results.groupby('datadate').apply(
    lambda group: exponential_weighted_return(
        group,
        sort_col='next_prccq_change',
        target_col='next_prccq_change',
        largest=True
    )
).reset_index(level=0, drop=True)

# For the predicted bottom 10, weight by predicted returns
df_avg['bottom_10'] = results.groupby('datadate').apply(
    lambda group: exponential_weighted_return(
        group,
        sort_col='y_pred1',
        target_col='next_prccq_change',
        largest=False
    )
).reset_index(level=0, drop=True)

# For the predicted bottom 10, weight by predicted returns
df_avg['bottom_10_context'] = results.groupby('datadate').apply(
    lambda group: exponential_weighted_return(
        group,
        sort_col='y_pred2',
        target_col='next_prccq_change',
        largest=False
    )
).reset_index(level=0, drop=True)

# For the ideal bottom 10, weight by the actual returns
df_avg['bottom_10_ideal'] = results.groupby('datadate').apply(
    lambda group: exponential_weighted_return(
        group,
        sort_col='next_prccq_change',
        target_col='next_prccq_change',
        largest=False
    )
).reset_index(level=0, drop=True)

# Display the first 100 rows of the new dataframe
df_avg.head(100)


Unnamed: 0,datadate,stocks,baseline,top_10,top_10_context,top_10_ideal,bottom_10,bottom_10_context,bottom_10_ideal
0,2013-03-31,62,0.083195,0.13498,0.85139,0.959119,-0.01308,-0.02841,-0.123643
1,2013-06-30,61,0.068066,0.153232,0.182901,0.615021,0.030669,-0.023368,-0.093581
2,2013-09-30,63,0.104675,0.017387,0.033898,0.315112,0.018302,0.011406,-0.137854
3,2013-12-31,63,0.034818,0.052452,0.165366,0.302349,0.128621,-0.004003,-0.116466
4,2014-03-31,66,0.032713,0.010857,0.089639,0.218498,0.048261,-0.210925,-0.251419
5,2014-06-30,66,0.025379,0.135587,0.027429,0.228289,-0.021057,-0.019243,-0.141806
6,2014-09-30,65,0.03671,0.124778,0.026018,0.210748,0.002946,-0.081701,-0.197616
7,2014-12-31,65,0.010804,0.080104,-0.028962,0.192479,-0.008586,-0.051373,-0.153323
8,2015-03-31,66,0.016184,0.080255,-0.031329,0.385555,-0.005742,-0.011534,-0.17289
9,2015-06-30,66,-0.057124,-0.099349,-0.051044,0.160524,-0.039848,-0.303417,-0.355343


In [None]:
for portfolio in ['baseline', 'top_10', 'top_10_context', 'top_10_ideal']:

    mean_return = df_avg[portfolio].mean()

    # Calculate the standard deviation of the returns of the top 10 portfolio
    std_return = df_avg[portfolio].std()

    # Assuming the risk-free rate is 0 (if you have a specific risk-free rate, replace 0 with that value)
    risk_free_rate = 0.025

    # Calculate the Sharpe ratio
    sharpe_ratio = (mean_return - risk_free_rate) / std_return

    # Print the Sharpe ratio
    print(f"Mean quarterly return of the {portfolio} portfolio: {mean_return}")
    print(f"Sharpe Ratio of the {portfolio} portfolio: {sharpe_ratio}")
    print("")

Mean quarterly return of the baseline portfolio: 0.028784807352579378
Sharpe Ratio of the baseline portfolio: 0.05933810083202336

Mean quarterly return of the top_10 portfolio: 0.0554728130232285
Sharpe Ratio of the top_10 portfolio: 0.4220632811646713

Mean quarterly return of the top_10_context portfolio: 0.09566776446843303
Sharpe Ratio of the top_10_context portfolio: 0.39749674456107364

Mean quarterly return of the top_10_ideal portfolio: 0.32069126921412167
Sharpe Ratio of the top_10_ideal portfolio: 1.698083179264053



In [None]:
import plotly.graph_objects as go

# Assuming you have a dataframe 'df_avg' with columns 'datadate', 'baseline_portfolio', and 'top_10'

# Calculate the cumulative return for both portfolios
df_avg['baseline_cum_return'] = (1 + df_avg['baseline']).cumprod()
df_avg['top_10_cum_return'] = (1 + df_avg['top_10']).cumprod()
df_avg['top_10_context_cum_return'] = (1 + df_avg['top_10_context']).cumprod()
df_avg['top_10_ideal_cum_return'] = (1 + df_avg['top_10_ideal']).cumprod()

# Calculate the value over time of a $1,000 investment
initial_investment = 1000
df_avg['baseline_value'] = initial_investment * df_avg['baseline_cum_return']
df_avg['top_10_value'] = initial_investment * df_avg['top_10_cum_return']
df_avg['top_10_context_value'] = initial_investment * df_avg['top_10_context_cum_return']
df_avg['top_10_ideal_value'] = initial_investment * df_avg['top_10_ideal_cum_return']

# Create the plot
fig = go.Figure()

# Add baseline portfolio trace
fig.add_trace(go.Scatter(x=df_avg['datadate'], y=df_avg['baseline_value'],
                         mode='lines', name='Baseline Portfolio'))

# Add top_10 portfolio trace
fig.add_trace(go.Scatter(x=df_avg['datadate'], y=df_avg['top_10_value'],
                         mode='lines', name='Top 10 Portfolio'))

fig.add_trace(go.Scatter(x=df_avg['datadate'], y=df_avg['top_10_context_value'],
                         mode='lines', name='Top_10_context Portfolio'))

# Update the layout
fig.update_layout(
    title='Portfolio Value Over Time ($1,000 Investment)',
    xaxis_title='Date',
    yaxis_title='Portfolio Value ($)',
)

# Show the plot
fig.show()
