In [126]:
import os
import numpy as np
import pandas as pd
#import xgboost as xgb
import matplotlib.pyplot as plt
#from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.base import BaseEstimator
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from contextualized.easy import ContextualizedRegressor

# Time series decomposition
from statsmodels.tsa.seasonal import seasonal_decompose

# Chart drawing
import plotly as py
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Mute sklearn warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

# Show charts when running kernel
init_notebook_mode(connected=True)

# Change default background color for all visualizations
layout=go.Layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(250,250,250,0.8)')
fig = go.Figure(layout=layout)
templated_fig = pio.to_templated(fig)
pio.templates['my_template'] = templated_fig.layout.template
pio.templates.default = 'my_template'

In [127]:
file_path = '/Users/edocampione/Desktop/Meng Engineering Science/4YP/scripts/dati_trattati.csv' # cleaned data
df = pd.read_csv(file_path)
df['datadate'] = pd.to_datetime(df['datadate'])
df['datadate'] = df['datadate'] + pd.offsets.QuarterEnd(0)

df = df.dropna(subset=['relative_quarterly_return'])

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14322 entries, 1 to 15338
Data columns (total 48 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   gvkey                           14322 non-null  int64         
 1   datadate                        14322 non-null  datetime64[ns]
 2   tic                             14322 non-null  object        
 3   actq                            14322 non-null  float64       
 4   ancq                            14322 non-null  float64       
 5   epsfxq                          14302 non-null  float64       
 6   lctq                            14322 non-null  float64       
 7   ltq                             14322 non-null  float64       
 8   revtq                           14322 non-null  float64       
 9   capxy                           13783 non-null  float64       
 10  roa                             14289 non-null  float64       
 11  roe    

In [128]:
drop_features = ['capxy', 'cfm', 'PEG_trailing', 'de_ratio', 'divyield']

# Drop the specified columns
df = df.drop(columns=drop_features)

# Drop rows with missing values
df = df.dropna()

# Display dataframe info
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 11212 entries, 32 to 15337
Data columns (total 43 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   gvkey                           11212 non-null  int64         
 1   datadate                        11212 non-null  datetime64[ns]
 2   tic                             11212 non-null  object        
 3   actq                            11212 non-null  float64       
 4   ancq                            11212 non-null  float64       
 5   epsfxq                          11212 non-null  float64       
 6   lctq                            11212 non-null  float64       
 7   ltq                             11212 non-null  float64       
 8   revtq                           11212 non-null  float64       
 9   roa                             11212 non-null  float64       
 10  roe                             11212 non-null  float64       
 11  pe_inc

In [129]:
# Find the unique number of stocks in the dataframe
unique_gvkeys = df['tic'].unique()

# Print the result
print(f"Number of unique gvkeys: {len(unique_gvkeys)}")
print(unique_gvkeys)


Number of unique gvkeys: 80
['AAPL' 'ABT' 'ADBE' 'ADM' 'AMD' 'AMGN' 'AMT' 'AMZN' 'APD' 'BA' 'BAX'
 'BMY' 'CAT' 'CL' 'CME' 'COP' 'COST' 'CSCO' 'CTSH' 'CVS' 'CVX' 'DIS' 'ECL'
 'EQIX' 'EXC' 'EXPE' 'F' 'FDX' 'FIS' 'GE' 'GOOGL' 'HCA' 'HD' 'IBM' 'INTC'
 'JBL' 'JNJ' 'KMB' 'KO' 'LLY' 'LMT' 'LUV' 'MA' 'MCD' 'META' 'MMC' 'MMM'
 'MRK' 'MSFT' 'MU' 'NFLX' 'NKE' 'NOC' 'NVDA' 'OXY' 'PFE' 'PG' 'PPL' 'PYPL'
 'QCOM' 'REGN' 'RJF' 'RTX' 'SBUX' 'SPGI' 'STZ' 'T' 'TGT' 'TMO' 'TSLA'
 'TSN' 'UNH' 'UPS' 'V' 'VZ' 'WBA' 'WDC' 'WMT' 'XOM' 'ZTS']


In [130]:
df = df.dropna()
df = df.sort_values(by=['datadate'])

unique_gvkeys = df['tic'].unique()

# Print the result
print(f"Number of unique gvkeys: {len(unique_gvkeys)}")

df.info()

Number of unique gvkeys: 80
<class 'pandas.core.frame.DataFrame'>
Index: 11212 entries, 172 to 3737
Data columns (total 43 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   gvkey                           11212 non-null  int64         
 1   datadate                        11212 non-null  datetime64[ns]
 2   tic                             11212 non-null  object        
 3   actq                            11212 non-null  float64       
 4   ancq                            11212 non-null  float64       
 5   epsfxq                          11212 non-null  float64       
 6   lctq                            11212 non-null  float64       
 7   ltq                             11212 non-null  float64       
 8   revtq                           11212 non-null  float64       
 9   roa                             11212 non-null  float64       
 10  roe                             11212 non-null

In [131]:
tickers_to_drop = ['AMT', 'CTSH', 'GOOGL', 'META', 'PYPL', 'RJF', 'TSLA', 'ZTS']

df = df[~df['tic'].isin(tickers_to_drop)]

In [132]:
train_start_date = '1970-03-31'
valid_start_date = '2010-03-31'
test_start_date = '2013-03-31'
end_date = '2023-09-30'

df = df.sort_values(by=['datadate'])

train_df = df[(df['datadate'] >= train_start_date) & (df['datadate'] < valid_start_date)]
valid_df = df[(df['datadate'] >= valid_start_date) & (df['datadate'] < test_start_date)]
test_df = df[(df['datadate'] >= test_start_date) & (df['datadate'] < end_date)]

test_df = test_df.sort_values(by=['datadate', 'next_prccq_change'], ascending=[True, False])
test_df.head(10)


Unnamed: 0,gvkey,datadate,tic,actq,ancq,epsfxq,lctq,ltq,revtq,roa,...,curr_ratio_change,ptb_change,relative_quarterly_return,next_relative_quarterly_return,spindx,spindx_change,next_spindx_change,prccq,prccq_change,next_prccq_change
1253,1161,2013-03-31,AMD,0.03223,-0.159078,-0.698413,-0.054402,-0.023108,-0.058009,0.023,...,-0.00802,1.083812,-0.032845,0.566207,1562.17,0.095345,0.033793,2.55,0.0625,0.6
10115,7343,2013-03-31,MU,0.009219,-0.023309,0.037037,-0.009822,0.012073,0.133043,0.107,...,-0.014717,0.297872,0.33447,0.318003,1514.68,0.069553,0.076623,8.375,0.404023,0.394627
11978,23812,2013-03-31,REGN,0.083838,0.107948,-0.779412,0.206017,0.041366,0.060448,0.383,...,-0.205994,0.289679,-0.064171,0.241016,1562.17,0.095345,0.033793,176.403,0.031174,0.274808
3228,149070,2013-03-31,CME,0.410638,-0.000707,0.42,0.547441,0.197396,0.087305,0.049,...,-0.004367,0.134783,0.116615,0.202977,1562.17,0.095345,0.033793,61.41,0.21196,0.236769
14621,11399,2013-03-31,WDC,0.022997,-0.024726,0.176471,-0.030066,-0.031327,-0.01569,0.266,...,0.087776,0.291424,0.088228,0.200846,1562.17,0.095345,0.033793,50.29,0.183573,0.234639
9959,12141,2013-03-31,MSFT,0.044098,0.037638,-0.052632,0.000595,0.023348,-0.045069,0.256,...,0.002184,0.186091,-0.024386,0.173863,1562.17,0.095345,0.033793,28.605,0.070959,0.207656
13004,2710,2013-03-31,STZ,-0.003789,-0.115575,-0.258621,-0.139502,-0.138732,-0.092581,0.091,...,0.359567,0.514735,0.163446,0.121613,1514.68,0.069553,0.076623,44.24,0.232999,0.198237
2121,2285,2013-03-31,BA,0.038057,-0.019945,0.125,0.015206,-0.000506,-0.152856,0.094,...,-0.008634,0.015841,0.043854,0.159451,1562.17,0.095345,0.033793,85.85,0.139199,0.193244
10580,7985,2013-03-31,NOC,-0.065062,-0.004573,-0.051402,-0.082728,-0.034294,-0.057443,0.139,...,0.002886,0.249847,-0.057316,0.146535,1562.17,0.095345,0.033793,70.15,0.038029,0.180328
13375,3813,2013-03-31,TGT,-0.113923,0.02223,0.53125,-0.106818,-0.048816,0.34243,0.155,...,-0.026667,0.125,-0.113256,0.101628,1498.11,0.060864,0.06639,60.41,-0.052392,0.168019


In [133]:
# Create a copy of the test results
results = test_df[['datadate', 'tic', 'next_prccq_change', 'next_relative_quarterly_return']].copy()

# Define columns to drop
drop_cols = ['gvkey', 'datadate', 'tic', 'spindx', 'spindx_change', 'next_spindx_change', 'prccq', 'prccq_change', 'next_prccq_change']

# Drop the specified columns from train, validation, and test datasets
train_df = train_df.drop(columns=drop_cols)
valid_df = valid_df.drop(columns=drop_cols)
test_df  = test_df.drop(columns=drop_cols)

# Display the first few rows of the training dataset
train_df.head()


Unnamed: 0,actq,ancq,epsfxq,lctq,ltq,revtq,roa,roe,pe_inc,pcf,...,gpm_change,evm_change,cfm_change,npm_change,PEG_trailing_change,de_ratio_change,curr_ratio_change,ptb_change,relative_quarterly_return,next_relative_quarterly_return
172,0.0,0.0,-0.115385,0.0,0.0,-0.016251,0.224,0.556,25.408,34.228,...,0.0,0.0,0.0,0.0,-0.295597,0.0,0.0,-0.177037,-0.016088,0.028255
15082,0.0,0.0,-0.013423,0.0,0.0,0.036061,0.17,0.032,11.749,6.02,...,0.0,-0.028041,-0.106383,-0.108434,0.237403,0.0,0.0,0.050788,-0.046479,0.144996
5213,0.0,0.0,-0.135338,0.0,0.0,-0.172254,0.211,0.034,9.079,3.284,...,0.0,0.354212,-0.326087,-0.054054,0.148148,0.0,0.0,-0.005464,0.138857,0.147681
1734,0.0,0.0,0.181818,0.0,0.0,0.089057,0.146,0.097,15.323,14.417,...,0.0,0.0,0.0,-0.016949,-0.363201,0.0,0.0,-0.095171,0.130826,0.071186
4912,0.0,0.0,0.304444,0.0,0.0,0.135055,0.112,0.11,11.028,5.864,...,0.0,0.0,-0.011628,-0.013699,0.0,0.0,0.0,-0.114448,0.036812,-0.002057


In [134]:
# Define target variable (y) and features (X) for train, validation, and test sets
y_train = train_df['next_relative_quarterly_return'].copy()
X_train = train_df.drop(columns=['next_relative_quarterly_return'])

y_valid = valid_df['next_relative_quarterly_return'].copy()
X_valid = valid_df.drop(columns=['next_relative_quarterly_return'])

y_test  = test_df['next_relative_quarterly_return'].copy()
X_test  = test_df.drop(columns=['next_relative_quarterly_return'])

# Display the first few rows of X_train
X_train.head()


Unnamed: 0,actq,ancq,epsfxq,lctq,ltq,revtq,roa,roe,pe_inc,pcf,...,debt_ebitda_change,gpm_change,evm_change,cfm_change,npm_change,PEG_trailing_change,de_ratio_change,curr_ratio_change,ptb_change,relative_quarterly_return
172,0.0,0.0,-0.115385,0.0,0.0,-0.016251,0.224,0.556,25.408,34.228,...,0.0,0.0,0.0,0.0,0.0,-0.295597,0.0,0.0,-0.177037,-0.016088
15082,0.0,0.0,-0.013423,0.0,0.0,0.036061,0.17,0.032,11.749,6.02,...,-0.000895,0.0,-0.028041,-0.106383,-0.108434,0.237403,0.0,0.0,0.050788,-0.046479
5213,0.0,0.0,-0.135338,0.0,0.0,-0.172254,0.211,0.034,9.079,3.284,...,0.371212,0.0,0.354212,-0.326087,-0.054054,0.148148,0.0,0.0,-0.005464,0.138857
1734,0.0,0.0,0.181818,0.0,0.0,0.089057,0.146,0.097,15.323,14.417,...,0.0,0.0,0.0,0.0,-0.016949,-0.363201,0.0,0.0,-0.095171,0.130826
4912,0.0,0.0,0.304444,0.0,0.0,0.135055,0.112,0.11,11.028,5.864,...,0.0,0.0,0.0,-0.011628,-0.013699,0.0,0.0,0.0,-0.114448,0.036812


In [135]:
# Scaling
scaler = MinMaxScaler()

# Fit the scaler on the training/validation data
X_train_scaled = scaler.fit_transform(X_train)

# Now apply the same scaling to the val/test data using the already fitted scaler
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Convert the scaled data back to DataFrame for consistency
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_valid_scaled = pd.DataFrame(X_valid_scaled, columns=X_valid.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

X_train_scaled.head()

Unnamed: 0,actq,ancq,epsfxq,lctq,ltq,revtq,roa,roe,pe_inc,pcf,...,debt_ebitda_change,gpm_change,evm_change,cfm_change,npm_change,PEG_trailing_change,de_ratio_change,curr_ratio_change,ptb_change,relative_quarterly_return
0,0.15915,0.15647,0.600363,0.14011,0.103656,0.08512,0.812405,0.307713,0.563323,0.503678,...,0.305676,0.28277,0.50565,0.543478,0.892899,0.11462,0.075733,0.263092,0.022231,0.117021
1,0.15915,0.15647,0.600801,0.14011,0.103656,0.08993,0.796067,0.301351,0.555988,0.461638,...,0.305671,0.28277,0.505401,0.541744,0.892514,0.114802,0.075733,0.263092,0.029147,0.11201
2,0.15915,0.15647,0.600278,0.14011,0.103656,0.070774,0.808472,0.301376,0.554554,0.457561,...,0.307639,0.28277,0.508806,0.538162,0.892708,0.114771,0.075733,0.263092,0.027439,0.142569
3,0.15915,0.15647,0.601639,0.14011,0.103656,0.094803,0.788805,0.30214,0.557907,0.474153,...,0.305676,0.28277,0.50565,0.543478,0.892839,0.114597,0.075733,0.263092,0.024716,0.141245
4,0.15915,0.15647,0.602165,0.14011,0.103656,0.099033,0.778517,0.302298,0.5556,0.461406,...,0.305676,0.28277,0.50565,0.543289,0.892851,0.114721,0.075733,0.263092,0.024131,0.125743


In [136]:
# Define the Ridge model
ridge_model = Ridge()

# Define the parameter grid for alpha values
param_grid = {'alpha': [0.0, 0.1, 1.0, 10.0, 100.0, 1000, 10000]}

# Perform GridSearchCV to find the best alpha
grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Get the best alpha
best_alpha = grid_search.best_params_['alpha']
print(f"Best alpha: {best_alpha}")

# Train the Ridge model with the best alpha
ridge_model = Ridge(alpha=best_alpha)
ridge_model.fit(X_train, y_train)

Best alpha: 1000


In [137]:
'''

%%time

parameters = {
    'n_estimators': [300],
    'learning_rate': [0.1],
    'max_depth': [8],
    'gamma': [0.1],
    'random_state': [42]
}

eval_set = [(X_train_scaled, y_train), (X_valid_scaled, y_valid)]
model = xgb.XGBRegressor(eval_set=eval_set, objective='reg:squarederror', verbose=False)
clf = GridSearchCV(model, parameters)

clf.fit(X_train_scaled, y_train)

print(f'Best params: {clf.best_params_}')
print(f'Best validation score = {clf.best_score_}')

'''

"\n\n%%time\n\nparameters = {\n    'n_estimators': [300],\n    'learning_rate': [0.1],\n    'max_depth': [8],\n    'gamma': [0.1],\n    'random_state': [42]\n}\n\neval_set = [(X_train_scaled, y_train), (X_valid_scaled, y_valid)]\nmodel = xgb.XGBRegressor(eval_set=eval_set, objective='reg:squarederror', verbose=False)\nclf = GridSearchCV(model, parameters)\n\nclf.fit(X_train_scaled, y_train)\n\nprint(f'Best params: {clf.best_params_}')\nprint(f'Best validation score = {clf.best_score_}')\n\n"

In [138]:
'''
%%time

model = xgb.XGBRegressor(**clf.best_params_, objective='reg:squarederror')
model.fit(X_train_scaled, y_train, eval_set=eval_set, verbose=False)

'''

"\n%%time\n\nmodel = xgb.XGBRegressor(**clf.best_params_, objective='reg:squarederror')\nmodel.fit(X_train_scaled, y_train, eval_set=eval_set, verbose=False)\n\n"

In [139]:
y_train_pred = ridge_model.predict(X_train_scaled)
y_valid_pred = ridge_model.predict(X_valid_scaled)
y_test_pred = ridge_model.predict(X_test_scaled)

# Classifier
y_train_pred_binary = (y_train_pred >= 0).astype(int)
y_valid_pred_binary = (y_valid_pred >= 0).astype(int)
y_test_pred_binary = (y_test_pred >= 0).astype(int)

y_train_binary = (y_train >= 0).astype(int)
y_valid_binary = (y_valid >= 0).astype(int)
y_test_binary = (y_test >= 0).astype(int)

print(y_test_pred)

print(classification_report(y_train_binary, y_train_pred_binary))



[-0.00701501 -0.00924203 -0.0095882  ... -0.00963132 -0.00896939
 -0.00976189]
              precision    recall  f1-score   support

           0       0.47      1.00      0.64      3460
           1       1.00      0.00      0.00      3936

    accuracy                           0.47      7396
   macro avg       0.73      0.50      0.32      7396
weighted avg       0.75      0.47      0.30      7396



In [140]:
results['y_pred'] = y_test_pred
results1 = results.sort_values(by=['datadate', 'y_pred'], ascending=[True, False])

results1.head(5)



Unnamed: 0,datadate,tic,next_prccq_change,next_relative_quarterly_return,y_pred
1253,2013-03-31,AMD,0.6,0.566207,-0.007015
8319,2013-03-31,LMT,0.123705,0.089912,-0.007054
5084,2013-03-31,EXC,-0.104408,-0.138201,-0.007167
4733,2013-03-31,EQIX,-0.14604,-0.179833,-0.008068
13553,2013-03-31,TMO,0.106419,0.072626,-0.008295


In [141]:
def softmax_weighted_return(df, sort_col, target_col, n=5, largest=True):
    if largest:
        selected = df.nlargest(n, sort_col)
    else:
        selected = df.nsmallest(n, sort_col)
    # Compute softmax weights from the predicted scores:
    e_scores = np.exp(selected[sort_col])
    weights = e_scores / e_scores.sum()
    return (selected[target_col] * weights).sum()

def exponential_weighted_return(df, sort_col, target_col, n=10, largest=True, alpha=0.5):
    if largest:
        selected = df.nlargest(n, sort_col)
    else:
        selected = df.nsmallest(n, sort_col)
    selected = selected.sort_values(by=sort_col, ascending=not largest).copy()
    selected['rank'] = range(1, len(selected) + 1)
    # Exponential weights: weight = exp(-alpha * (rank - 1))
    selected['weight'] = np.exp(-alpha * (selected['rank'] - 1))
    selected['weight'] /= selected['weight'].sum()  # Normalize weights
    return (selected[target_col] * selected['weight']).sum()


# Baseline aggregation remains unchanged.
df_avg = results.groupby('datadate').agg(
    stocks=('next_prccq_change', 'size'),
    baseline=('next_prccq_change', 'mean')
).reset_index()

# For the predicted top 10, weight by predicted returns ('pred_rank_score')
df_avg['top_10'] = results.groupby('datadate').apply(
    lambda group: exponential_weighted_return(
        group,
        sort_col='y_pred',
        target_col='next_prccq_change',
        largest=True
    )
).reset_index(level=0, drop=True)

# For the ideal top 10, weight by the actual returns
df_avg['top_10_ideal'] = results.groupby('datadate').apply(
    lambda group: exponential_weighted_return(
        group,
        sort_col='next_prccq_change',
        target_col='next_prccq_change',
        largest=True
    )
).reset_index(level=0, drop=True)

# For the predicted bottom 10, weight by predicted returns
df_avg['bottom_10'] = results.groupby('datadate').apply(
    lambda group: exponential_weighted_return(
        group,
        sort_col='y_pred',
        target_col='next_prccq_change',
        largest=False
    )
).reset_index(level=0, drop=True)

# For the ideal bottom 10, weight by the actual returns
df_avg['bottom_10_ideal'] = results.groupby('datadate').apply(
    lambda group: exponential_weighted_return(
        group,
        sort_col='next_prccq_change',
        target_col='next_prccq_change',
        largest=False
    )
).reset_index(level=0, drop=True)

# Display the first 100 rows of the new dataframe
df_avg.head(10)


Unnamed: 0,datadate,stocks,baseline,top_10,top_10_ideal,bottom_10,bottom_10_ideal
0,2013-03-31,63,0.065437,0.259747,0.421077,-0.005567,-0.124256
1,2013-06-30,64,0.059808,0.121751,0.354006,0.026997,-0.093581
2,2013-09-30,66,0.110123,0.286719,0.412804,0.050669,-0.106066
3,2013-12-31,66,0.032754,0.13097,0.256265,-0.049666,-0.120828
4,2014-03-31,66,0.039382,0.120052,0.215892,0.01224,-0.098166
5,2014-06-30,67,0.02267,0.140092,0.230542,-0.002973,-0.145873
6,2014-09-30,65,0.054048,0.115345,0.23811,-0.021075,-0.189307
7,2014-12-31,64,0.02715,0.105581,0.206037,-0.011854,-0.152242
8,2015-03-31,65,0.009599,-0.01915,0.32273,-0.06904,-0.169087
9,2015-06-30,63,-0.056546,0.060857,0.143413,-0.123205,-0.309548


In [142]:
for portfolio in ['baseline', 'top_10', 'bottom_10']:

    mean_return = df_avg[portfolio].mean()

    # Calculate the standard deviation of the returns of the top 10 portfolio
    std_return = df_avg[portfolio].std()

    # Assuming the risk-free rate is 0 (if you have a specific risk-free rate, replace 0 with that value)
    risk_free_rate = 0.025

    # Calculate the Sharpe ratio
    sharpe_ratio = (mean_return - risk_free_rate) / std_return

    # Print the Sharpe ratio
    print(f"Mean quarterly return of the {portfolio} portfolio: {mean_return}")
    print(f"Sharpe Ratio of the {portfolio} portfolio: {sharpe_ratio}")
    print("")

Mean quarterly return of the baseline portfolio: 0.03360167684373078
Sharpe Ratio of the baseline portfolio: 0.14378314191983477

Mean quarterly return of the top_10 portfolio: 0.15347418574177824
Sharpe Ratio of the top_10 portfolio: 1.0668510411633492

Mean quarterly return of the bottom_10 portfolio: -0.06317173184377765
Sharpe Ratio of the bottom_10 portfolio: -1.0141071023980754



In [143]:
import plotly.graph_objects as go

# Assuming you have a dataframe 'df_avg' with columns 'datadate', 'baseline_portfolio', and 'top_10'

# Calculate the cumulative return for both portfolios
df_avg['baseline_cum_return'] = (1 + df_avg['baseline']).cumprod()
df_avg['top_10_cum_return'] = (1 + df_avg['top_10']).cumprod()

# Calculate the value over time of a $1,000 investment
initial_investment = 1000
df_avg['baseline_value'] = initial_investment * df_avg['baseline_cum_return']
df_avg['top_10_value'] = initial_investment * df_avg['top_10_cum_return']

# Create the plot
fig = go.Figure()

# Add baseline portfolio trace
fig.add_trace(go.Scatter(x=df_avg['datadate'], y=df_avg['baseline_value'],
                         mode='lines', name='Baseline Portfolio'))

# Add top_10 portfolio trace
fig.add_trace(go.Scatter(x=df_avg['datadate'], y=df_avg['top_10_value'],
                         mode='lines', name='Top 10 Portfolio'))

# Update the layout
fig.update_layout(
    title='Portfolio Value Over Time ($1,000 Investment)',
    xaxis_title='Date',
    yaxis_title='Portfolio Value ($)',
)

# Show the plot
fig.show()


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed