### Imports

In [8]:
import pandas as pd
import datetime
import os
import sys
import math
import matplotlib.pyplot as plt
import numpy as np
import warnings
from pandas.errors import SettingWithCopyWarning


warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)

### Parameters

In [9]:
google_colab = False

### Data loading

In [10]:
if google_colab:
    from google.colab import drive
    drive.mount('/content/drive')
    !pip install -U -q PyDrive
    from pydrive.auth import GoogleAuth
    from pydrive.drive import GoogleDrive
    from google.colab import auth
    from oauth2client.client import GoogleCredentials
    # Authenticate and create the PyDrive client.
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)
    id = "1-kLmDPAmiTHNw_cv3lr6wY9DfSt8RSHv"
    downloaded = drive.CreateFile({'id':id}) 
    downloaded.GetContentFile('05-07_11-26.csv')  
    df = pd.read_csv('05-07_11-26.csv')
else:
    # Set the path to the root directory
    path = os.path.abspath(os.path.join(os.getcwd(), '../..'))
    # Read dataframes using Dask
    df = pd.read_csv(path + '/data/predictions/11.05 1 mnd test sett full model run.csv')

### Data processing

In [11]:
# Dates
df["TTM"] = (df["TTM"] * 365).astype(int)
df['Expiry_date'] = pd.to_datetime(df['Quote_date']) + pd.to_timedelta(df['TTM'], unit='D')
df['Quote_date'] = pd.to_datetime(df['Quote_date'])
df['Expiry_date'] = pd.to_datetime(df['Expiry_date'])

# Adding option ID
df["Option_ID"] = df["Expiry_date"].astype(str) + "-" + df["Strike"].astype(str)

# Dataframe for underlying matching in TTM = 0 rows
df_original = df[(df["Quote_date"] >= "2015-01-01") & (df["Quote_date"] <= "2023-03-31")]

# Period to be traded on
df = df[(df["Expiry_date"] >= "2015-01-01") & (df["Expiry_date"] <= "2023-03-31")]
df = df[(df["Quote_date"] >= "2015-01-01") & (df["Quote_date"] <= "2023-03-31")]

Adding TTM=0 row

In [12]:
# Sort the dataframe by Quote_date and Expiry_date
df = df.sort_values(['Quote_date', 'Expiry_date'])

groups = df.groupby(['Expiry_date', 'Strike'])

for _, group in groups:

    # Sort group so that the last row is the one with the lowest TTM
    group = group.sort_values('TTM', ascending=False)

    # Taking row from option group (could be any) to be used in getting the Strike price
    last_row = group.iloc[-1]
    
    expiry_date = last_row['Expiry_date']

    # Get the underlying price on the day of expiry
    underlying_last_on_expiry = df_original.loc[df_original['Quote_date'] == expiry_date, 'Underlying_last'].iloc[0]
    # Calculate the intrinsic value
    intrinsic_value = np.maximum(underlying_last_on_expiry - last_row['Strike'], 0)

    new_row = last_row.copy()
    new_row['Quote_date'] = expiry_date
    new_row['Expiry_date'] = expiry_date
    new_row['TTM'] = 0
    new_row['Underlying_last'] = underlying_last_on_expiry
    new_row['Price'] = intrinsic_value

    df = df.append(new_row, ignore_index=True)

# Sort the dataframe by Quote_date and Expiry_date
df = df.sort_values(['Quote_date', 'Expiry_date'])

  df = df.append(new_row, ignore_index=True)


In [None]:
unique_dates = df['Quote_date'].unique()

option_groups = df.groupby(['Option_ID'])

# Remove option groups if it has a date between it's first and last that is not in unique_dates
for option_id, option_group in option_groups:
    group_first_date = option_group['Quote_date'].iloc[0]
    group_last_date = option_group['Quote_date'].iloc[-1]

    if group_first_date not in unique_dates or group_last_date not in unique_dates:
        df = df.drop(option_group.index)
        print("Dropped option group: ", option_id, " because it's first or last date is not in unique_dates")

    # If every unique_dates between group_first_date and group_last_date is not in group, drop the group
    dates_between = unique_dates[(unique_dates >= group_first_date) & (unique_dates <= group_last_date)]
    if len(dates_between) != len(option_group['Quote_date'].unique()):
        df = df.drop(option_group.index)
        print("Dropped option group: ", option_id, " because it's missing dates between it's first and last date")


    if len(option_group) < 2:
        df = df.drop(option_group.index)
        print("Dropped option group: ", option_id, " because it has less than 2 rows")


  for option_id, option_group in option_groups:


Dropped option group:  2018-01-19-1350.0  because it's missing dates between it's first and last date
Dropped option group:  2018-01-19-1375.0  because it's missing dates between it's first and last date
Dropped option group:  2018-01-19-1400.0  because it's missing dates between it's first and last date
Dropped option group:  2018-01-19-3500.0  because it's missing dates between it's first and last date
Dropped option group:  2018-01-26-1400.0  because it's missing dates between it's first and last date


In [None]:
display(df)

Unnamed: 0.1,Unnamed: 0,Quote_date,Price,Prediction,Underlying_last,Strike,TTM,R,Expiry_date,Option_ID
0,4119222,2018-01-02,1095.295,1102.66750,2695.87,1600.0,0.00274,0.0129,2018-01-03,2018-01-03-1600.0
1,4119223,2018-01-02,995.295,1002.39465,2695.87,1700.0,0.00274,0.0129,2018-01-03,2018-01-03-1700.0
2,4119224,2018-01-02,945.300,952.23970,2695.87,1750.0,0.00274,0.0129,2018-01-03,2018-01-03-1750.0
3,4119225,2018-01-02,895.305,902.30440,2695.87,1800.0,0.00274,0.0129,2018-01-03,2018-01-03-1800.0
4,4119226,2018-01-02,845.290,852.40234,2695.87,1850.0,0.00274,0.0129,2018-01-03,2018-01-03-1850.0
...,...,...,...,...,...,...,...,...,...,...
19206,4190191,2018-01-29,0.000,0.00000,2853.21,3010.0,0.00000,0.0124,2018-01-29,2018-01-29-3010.0
19207,4190192,2018-01-29,0.000,0.00000,2853.21,3020.0,0.00000,0.0124,2018-01-29,2018-01-29-3020.0
19208,4190193,2018-01-29,0.000,0.00000,2853.21,3050.0,0.00000,0.0124,2018-01-29,2018-01-29-3050.0
19209,4190194,2018-01-29,0.000,0.00000,2853.21,3100.0,0.00000,0.0124,2018-01-29,2018-01-29-3100.0


In [None]:
# Write to csv
df.to_csv(path + '/data/trading/Full trading data - LSTM-MLP.csv', index=False)