In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
from sklearn.metrics import mean_squared_error

In [3]:
model_names = ['LSTM-MLP', 'Heston', 'IV', 'GARCH', 'Rolling']
columns = ['Option_ID', 'Quote_date', 'Price', 'Underlying_last', 'Strike', 'TTM', 'Expiry_date', 'R']

df = pd.read_csv('../data/results/combined_predictions2.csv')
df['Expiry_date'] = pd.to_datetime(df['Quote_date']) + pd.to_timedelta(df['TTM']*365, unit='D')
df["Option_ID"] = df["Expiry_date"].astype(str) + "-" + df["Strike"].astype(str)
df = df[columns + model_names]
df = df.sort_values(by=['Strike', 'Expiry_date', 'Quote_date'])
print(df.head(3))
df.info()

           Option_ID  Quote_date    Price  Underlying_last  Strike       TTM  \
0  2017-12-22-2425.0  2017-11-24  178.250          2602.42  2425.0  0.076712   
1  2017-12-26-2150.0  2017-11-24  451.595          2602.42  2150.0  0.087671   
2  2018-01-05-2170.0  2017-11-24  432.895          2602.42  2170.0  0.115068   

  Expiry_date         R   LSTM-MLP      Heston          IV       GARCH  \
0  2017-12-22  0.011400  181.22504  180.125992  182.984386  179.561340   
1  2017-12-26  0.011454  452.85504  453.536094  460.091426  454.577919   
2  2018-01-05  0.011730  433.30730  434.552391  438.113114  435.346946   

      Rolling  
0  179.539789  
1  454.577919  
2  435.346946  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10539487 entries, 0 to 10539486
Data columns (total 13 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Option_ID        object        
 1   Quote_date       object        
 2   Price            float64       
 3   Underlying_la

In [5]:
# Assuming df is your DataFrame
df_p = df.copy()

df_p = df_p.rename(columns={'LSTM-MLP': 'Prediction'})

# Calculate the actual change in market price
df_p['actual_change'] = df_p.groupby(['Strike', 'Expiry_date'])['Price'].diff().shift(-1)

# Calculate the predicted change in market price
df_p['predicted_change'] = df_p['Prediction'] - df['Price']

# Determine if the actual and predicted changes are in the same direction
df_p['same_direction'] = df_p['actual_change'] * df_p['predicted_change'] > 0

# Drop the last row of each group as it has no tomorrow's data to compare with
df_p = df_p[~df_p.groupby(['Strike', 'Expiry_date'])['Price'].transform(lambda x: x.index == x.index[-1])]

# Calculate the percentage of times the prediction was correct
percentage_correct = df_p['same_direction'].mean() * 100

print(f"The model correctly predicted the direction of the price change {percentage_correct:.2f}% of the time.")


The model correctly predicted the direction of the price change 49.99% of the time.


In [7]:
# Assuming df_a is your DataFrame
df_a = df.copy()
df_a = df_a.rename(columns={'LSTM-MLP': 'Prediction'})
df_a['Relative_pricing_error'] = (df_a['Prediction'] - df_a['Price']) / df_a['Price']

#df_a = df_a[df_a['Relative_pricing_error'] <= -0.1]

# Calculate the actual change in market price for different days ahead
days_ahead = [i for i in range(1, 21)]
for days in days_ahead:
    df_a[f'Actual_change_{days}'] = df_a.groupby(['Strike', 'Expiry_date'])['Price'].shift(-days) - df_a['Price']

# Calculate the predicted change in market price
df_a['Predicted_change'] = df_a['Prediction'] - df_a['Price']

# Determine if the actual and predicted changes are in the same direction
for days in days_ahead:
    df_a[f'Same_direction_{days}'] = df_a[f'Actual_change_{days}'] * df_a['Predicted_change'] > 0

# Calculate the percentage of times the prediction was correct
for days in days_ahead:
    # Only consider rows where we have enough data in the future
    df_a = df_a.dropna(subset=[f'Actual_change_{days}', f'Same_direction_{days}'])
    percentage_correct = df_a[f'Same_direction_{days}'].mean() * 100
    print(f"For a {days}-day horizon, the model correctly predicted the direction of the price change {percentage_correct:.2f}% of the time.")

For a 1-day horizon, the model correctly predicted the direction of the price change 48.62% of the time.
For a 2-day horizon, the model correctly predicted the direction of the price change 48.56% of the time.
For a 3-day horizon, the model correctly predicted the direction of the price change 48.55% of the time.
For a 4-day horizon, the model correctly predicted the direction of the price change 48.54% of the time.
For a 5-day horizon, the model correctly predicted the direction of the price change 48.52% of the time.
For a 6-day horizon, the model correctly predicted the direction of the price change 48.51% of the time.
For a 7-day horizon, the model correctly predicted the direction of the price change 48.51% of the time.
For a 8-day horizon, the model correctly predicted the direction of the price change 48.52% of the time.
For a 9-day horizon, the model correctly predicted the direction of the price change 48.50% of the time.
For a 10-day horizon, the model correctly predicted the

In [55]:
df_a['Predicted_pct_change'] = abs(df_a['Predicted_change'] / df_a['Price'])*100
days = 5

# Calculate the percentage of times the prediction was correct
for threshold in [i for i in range(0, 1000, 5)]:
    # Only consider rows where we have enough data in the future
    df_i = df_a[df_a['Predicted_pct_change'] > threshold]
    percentage_correct = df_i[f'Same_direction_{days}'].mean() * 100
    print(f"With threshold {threshold}, the model correctly predicted the direction of the price change {percentage_correct:.2f}% of the time.")

With threshold 0, the model correctly predicted the direction of the price change 49.78% of the time.
With threshold 5, the model correctly predicted the direction of the price change 51.59% of the time.
With threshold 10, the model correctly predicted the direction of the price change 53.14% of the time.
With threshold 15, the model correctly predicted the direction of the price change 54.15% of the time.
With threshold 20, the model correctly predicted the direction of the price change 54.94% of the time.
With threshold 25, the model correctly predicted the direction of the price change 55.58% of the time.
With threshold 30, the model correctly predicted the direction of the price change 56.05% of the time.
With threshold 35, the model correctly predicted the direction of the price change 56.48% of the time.
With threshold 40, the model correctly predicted the direction of the price change 56.91% of the time.
With threshold 45, the model correctly predicted the direction of the price

In [34]:
import pandas as pd

# Assuming df is your DataFrame
df_a = df.sort_values(by=['Strike', 'Expiry_date', 'Quote_date'])

# Calculate the actual change in market price
df_a['Actual_change'] = df_a.groupby(['Strike', 'Expiry_date'])['Price'].diff()

print(df_a.head(20))

# Calculate whether the predicted change in market price is in the same direction as the actual change
df_a['Same_direction'] = np.sign(df_a['Prediction'].diff()) == np.sign(df_a['Actual_change'])

# Calculate the percentage of times the prediction was correct for each year
df_a['Quote_date'] = pd.to_datetime(df_a['Quote_date'])
df_a['Year'] = df_a['Quote_date'].dt.year

print(df_a.head(20))

for year in range(2015, 2024):
    df_year = df_a[df_a['Year'] == year]
    percentage_correct = df_year['Same_direction'].mean() * 100
    print(f"In {year}, the model correctly predicted the direction of the price change {percentage_correct:.2f}% of the time.")



        Unnamed: 0 Quote_date    Price  Prediction  Underlying_last  Strike  \
954145     2472301 2016-02-11  903.650   909.63666          1828.57   925.0   
955064     2473220 2016-02-11  899.755   906.24960          1828.57   925.0   
956769     2474925 2016-02-11  891.245   897.38837          1828.57   925.0   
956937     2475093 2016-02-11  884.895   888.44543          1828.57   925.0   
957040     2475196 2016-02-11  879.355   881.87634          1828.57   925.0   
957147     2475303 2016-02-11  877.350   878.72327          1828.57   925.0   
957202     2475358 2016-02-11  870.855   867.65344          1828.57   925.0   
957257     2475413 2016-02-11  862.195   861.09094          1828.57   925.0   
542232     2060388 2015-08-24  943.360   938.52110          1893.63   950.0   
545883     2064039 2015-08-25  916.850   912.99530          1868.19   950.0   
542405     2060561 2015-08-24  943.200   938.32370          1893.63   950.0   
546067     2064223 2015-08-25  916.600   912.81445  

In [28]:
# Assuming df is your DataFrame
df_a = df.sort_values(by=['Strike', 'Expiry_date', 'Quote_date'])

# Calculate the actual change in market price
df_a['Actual_change'] = df_a.groupby(['Strike', 'Expiry_date'])['Price'].diff()

# Calculate whether the predicted change in market price is in the same direction as the actual change
df_a['Same_direction'] = np.sign(df_a['Prediction'].diff()) == np.sign(df_a['Actual_change'])

# Split the predictions into positive and negative
df_a['Positive_prediction'] = (df_a['Prediction'].diff() > 0) & (df_a['Actual_change'] > 0)
df_a['Negative_prediction'] = (df_a['Prediction'].diff() < 0) & (df_a['Actual_change'] < 0)

# Calculate the percentage of times the prediction was correct for each year
df_a['Quote_date'] = pd.to_datetime(df_a['Quote_date'])
df_a['Year'] = df_a['Quote_date'].dt.year

for year in range(2015, 2024):
    df_year = df_a[df_a['Year'] == year]
    
    # Overall percentage
    percentage_correct = df_year['Same_direction'].mean() * 100
    print(f"In {year}, the model correctly predicted the direction of the price change {percentage_correct:.2f}% of the time.")
    
    # Positive percentage
    percentage_positive = df_year['Positive_prediction'].mean() * 100
    print(f"In {year}, the model correctly predicted the direction of the price increase {percentage_positive:.2f}% of the time.")
    
    # Negative percentage
    percentage_negative = df_year['Negative_prediction'].mean() * 100
    print(f"In {year}, the model correctly predicted the direction of the price decrease {percentage_negative:.2f}% of the time.")

# Overall percentage
percentage_correct = df_a['Same_direction'].mean() * 100
print(f"In total, the model correctly predicted the direction of the price change {percentage_correct:.2f}% of the time.")
    
# Positive percentage
percentage_positive = df_a['Positive_prediction'].mean() * 100
print(f"In total, the model correctly predicted the direction of the price increase {percentage_positive:.2f}% of the time.")
    
# Negative percentage
percentage_negative = df_a['Negative_prediction'].mean() * 100
print(f"In total, the model correctly predicted the direction of the price decrease {percentage_negative:.2f}% of the time.")


In 2015, the model correctly predicted the direction of the price change 84.11% of the time.
In 2015, the model correctly predicted the direction of the price increase 38.51% of the time.
In 2015, the model correctly predicted the direction of the price decrease 44.30% of the time.
In 2016, the model correctly predicted the direction of the price change 80.41% of the time.
In 2016, the model correctly predicted the direction of the price increase 40.28% of the time.
In 2016, the model correctly predicted the direction of the price decrease 38.49% of the time.
In 2017, the model correctly predicted the direction of the price change 77.29% of the time.
In 2017, the model correctly predicted the direction of the price increase 41.82% of the time.
In 2017, the model correctly predicted the direction of the price decrease 33.65% of the time.
In 2018, the model correctly predicted the direction of the price change 78.30% of the time.
In 2018, the model correctly predicted the direction of th

In [31]:
# Add this after the 'Positive_prediction' and 'Negative_prediction' columns are created
df_a['No_change_prediction'] = (df_a['Prediction'].diff() == 0) & (df_a['Actual_change'] == 0)

# Add this inside the for loop, after the 'Negative_prediction' percentage is calculated
percentage_no_change = df_a['No_change_prediction'].mean() * 100
print(f"In total, the model correctly predicted no change in price {percentage_no_change:.2f}% of the time.")


In total, the model correctly predicted no change in price 1.54% of the time.


In [26]:
# Assuming df is your DataFrame
df_a = df.sort_values(by=['Strike', 'Expiry_date', 'Quote_date'])

# Calculate the actual change in market price
df_a['Actual_change'] = df_a.groupby(['Strike', 'Expiry_date'])['Price'].diff()

# Calculate whether the predicted change in market price is in the same direction as the actual change
df_a['Same_direction'] = np.sign(df_a['Prediction'].diff()) == np.sign(df_a['Actual_change'])

percentage_correct = df_a['Same_direction'].mean() * 100
print(f"The model correctly predicted the direction of the price change {percentage_correct:.2f}% of the time.")


The model correctly predicted the direction of the price change 82.21% of the time.


In [23]:
import pandas as pd
import numpy as np

# Assuming df_a is your DataFrame
df_b = df.sort_values(by=['Strike', 'Expiry_date', 'Quote_date'])

# Calculate the actual percentage change in market price for different days ahead
days_ahead = [1, 5, 10, 20]
for days in days_ahead:
    df_b[f'Actual_pct_change_{days}'] = df_b.groupby(['Strike', 'Expiry_date'])['Price'].pct_change(-days)

# Calculate the predicted percentage change in market price
df_b['Predicted_pct_change'] = (df_b['Prediction'] - df_b['Price']) / df_b['Price']

# Calculate the difference between the actual and predicted percentage changes
for days in days_ahead:
    df_b[f'Prediction_error_{days}'] = df_b[f'Predicted_pct_change'] - df_b[f'Actual_pct_change_{days}']

# Calculate the mean squared error of the predictions
for days in days_ahead:
    # Only consider rows where we have enough data in the future
    df_b = df_b.dropna(subset=[f'Actual_pct_change_{days}', f'Prediction_error_{days}'])
    mse = np.mean(df_b[f'Prediction_error_{days}']**2)
    print(f"For a {days}-day horizon, the mean squared error of the percentage change predictions is {mse:.5f}.")


For a 1-day horizon, the mean squared error of the percentage change predictions is 12.03828.
For a 5-day horizon, the mean squared error of the percentage change predictions is 406.72524.
For a 10-day horizon, the mean squared error of the percentage change predictions is 2913.63418.
For a 20-day horizon, the mean squared error of the percentage change predictions is 14855.40632.


In [24]:
import pandas as pd
import numpy as np

# Assuming df_b is your DataFrame
df_b = df.sort_values(by=['Strike', 'Expiry_date', 'Quote_date'])

# Calculate the actual absolute change in market price for different days ahead
days_ahead = [1, 5, 10, 20]
for days in days_ahead:
    df_b[f'Actual_change_{days}'] = df_b.groupby(['Strike', 'Expiry_date'])['Price'].shift(-days) - df_b['Price']

# Calculate the predicted absolute change in market price
df_b['Predicted_change'] = df_b['Prediction'] - df_b['Price']

# Calculate the difference between the actual and predicted absolute changes
for days in days_ahead:
    df_b[f'Prediction_error_{days}'] = df_b['Predicted_change'] - df_b[f'Actual_change_{days}']

# Calculate the root mean squared error of the predictions
for days in days_ahead:
    # Only consider rows where we have enough data in the future
    df_b = df_b.dropna(subset=[f'Actual_change_{days}', f'Prediction_error_{days}'])
    rmse = np.sqrt(np.mean(df_b[f'Prediction_error_{days}']**2))
    print(f"For a {days}-day horizon, the root mean squared error of the absolute change predictions is {rmse:.5f}.")


For a 1-day horizon, the root mean squared error of the absolute change predictions is 30.22283.
For a 5-day horizon, the root mean squared error of the absolute change predictions is 59.04197.
For a 10-day horizon, the root mean squared error of the absolute change predictions is 80.81865.
For a 20-day horizon, the root mean squared error of the absolute change predictions is 113.63054.


In [None]:
# Assuming df_a is your DataFrame
df_a = df.copy()

model_names = ['LSTM-MLP', 'Heston', 'IV', 'GARCH', 'Rolling']

# Iterate over the models
for model_name in model_names:
    print(f"For model: {model_name}")

    # Rename the prediction column
    df_a = df_a.rename(columns={model_name: 'Prediction'})

    # Calculate the predicted change and relative pricing error
    df_a['Predicted_change'] = (df_a['Prediction'] - df_a['Price'])
    df_a['Relative_pricing_error'] = df_a['Predicted_change'] / df_a['Price']

    # Define a list of thresholds
    thresholds = [0.01]

    # Calculate the actual change in market price for different days ahead
    days_ahead = [5]
    for days in days_ahead:
        df_a[f'Actual_change_{days}'] = df_a.groupby(['Option_ID'])['Price'].shift(-days) - df_a['Price']

    # Iterate over the thresholds
    for threshold in thresholds:
        print(f"For threshold {threshold * 100:.2f}%:")

        # Filter instances based on the threshold
        filtered_df = df_a[abs(df_a['Relative_pricing_error']) > threshold]

        # Determine if the actual and predicted changes are in the same direction
        for days in days_ahead:
            filtered_df[f'Same_direction_{days}'] = filtered_df[f'Actual_change_{days}'] * filtered_df['Predicted_change'] > 0

        # Calculate the percentage of times the prediction was correct
        for days in days_ahead:
            # Only consider rows where we have enough data in the future
            filtered_df = filtered_df.dropna(subset=[f'Actual_change_{days}', f'Same_direction_{days}'])

            # Calculate the number of up, down, and still predictions
            correct_predictions = filtered_df[f'Same_direction_{days}'].sum()
            up_predictions = filtered_df[filtered_df[f'Actual_change_{days}'] > 0][f'Same_direction_{days}'].sum()
            down_predictions = filtered_df[filtered_df[f'Actual_change_{days}'] < 0][f'Same_direction_{days}'].sum()

            # Calculate the total up, down, and still predictions
            total_predictions = filtered_df[f'Same_direction_{days}'].count()
            total_up_predictions = filtered_df[filtered_df[f'Actual_change_{days}'] > 0][f'Same_direction_{days}'].count()
            total_down_predictions = filtered_df[filtered_df[f'Actual_change_{days}'] < 0][f'Same_direction_{days}'].count()

            # Calculate the percentages of correct up, down, and still predictions
            percentage_correct = correct_predictions / total_predictions * 100
            percentage_up = up_predictions / total_up_predictions * 100
            percentage_down = down_predictions / total_down_predictions * 100

            print(f"For a {days}-day horizon:")
            print(f"  Total percentage of correct predictions: {percentage_correct:.2f}%")
            print(f"  Percentage of correct UP predictions: {percentage_up:.2f}%")
            print(f"  Percentage of correct DOWN predictions: {percentage_down:.2f}%")
            print()
