### Imports

In [47]:
import pandas as pd
import datetime
import os
import sys
import math
import matplotlib.pyplot as plt
import numpy as np
import warnings
from pandas.errors import SettingWithCopyWarning


warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

### Parameters

In [48]:
google_colab = False

### Data loading

In [49]:
if google_colab:
    from google.colab import drive
    drive.mount('/content/drive')
    !pip install -U -q PyDrive
    from pydrive.auth import GoogleAuth
    from pydrive.drive import GoogleDrive
    from google.colab import auth
    from oauth2client.client import GoogleCredentials
    # Authenticate and create the PyDrive client.
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)
    id = "15eesIDv1QR3465iekfFG52v-HIYheguO"
    downloaded = drive.CreateFile({'id':id}) 
    downloaded.GetContentFile('11.05 1 mnd test sett full model run_with_TTM0_Q1and2_2018_filtered_missing.csv')  
    df = pd.read_csv('11.05 1 mnd test sett full model run_with_TTM0_Q1and2_2018_filtered_missing.csv')
else:
    # Set the path to the root directory
    path = os.path.abspath(os.path.join(os.getcwd(), '../..'))
    # Read dataframes using Dask
    df = pd.read_csv(path + '/data/predictions/11.05 1 mnd test sett full model run.csv')

In [50]:
print(len(df))

10539487


In [51]:
df['TTM'] = (df['TTM'] * 365).astype(int)

### Data processing

In [52]:
# Dates
df['Expiry_date'] = pd.to_datetime(df['Quote_date']) + pd.to_timedelta(df['TTM'], unit='D')
df['Quote_date'] = pd.to_datetime(df['Quote_date']).dt.date 
df['Expiry_date'] = pd.to_datetime(df['Expiry_date']).dt.date 

# Adding option ID
df["Option_ID"] = df["Expiry_date"].astype(str) + "-" + df["Strike"].astype(str)

df_original = df[(df["Quote_date"] >= pd.to_datetime("2019-01-01").date()) & 
                (df["Quote_date"] <= pd.to_datetime("2020-01-01").date())]

df = df[(df["Expiry_date"] >= pd.to_datetime("2019-07-01").date()) & 
        (df["Expiry_date"] <= pd.to_datetime("2020-01-01").date())]

df = df[(df["Quote_date"] >= pd.to_datetime("2019-07-01").date()) & 
        (df["Quote_date"] <= pd.to_datetime("2020-01-01").date())]

df = df.groupby('Option_ID').apply(lambda x: x.sample(frac=0.02, random_state=1))

In [53]:
len(df)

5558

In [54]:
df_copy = df.copy()

Adding TTM=0 row

In [55]:
# Sort the dataframe by Quote_date and Expiry_date
df = df.sort_values(['Quote_date', 'Expiry_date'])

groups = df.groupby(['Expiry_date', 'Strike'])

for _, group in groups:

    # Sort group so that the last row is the one with the lowest TTM
    group = group.sort_values('TTM', ascending=False)

    # Taking row from option group (could be any) to be used in getting the Strike price
    last_row = group.iloc[-1]
    
    expiry_date = last_row['Expiry_date']
    # Get the underlying price on the day of expiry

    if all(idx in df.index for idx in group.index):
        df = df.drop(group.index)

    underlying_last_on_expiry = df_original.loc[df_original['Quote_date'] == expiry_date, 'Underlying_last'].iloc[0]
    # Calculate the intrinsic value
    intrinsic_value = np.maximum(underlying_last_on_expiry - last_row['Strike'], 0)

    new_row = last_row.copy()
    new_row['Quote_date'] = expiry_date
    new_row['Expiry_date'] = expiry_date
    new_row['TTM'] = 0
    new_row['Underlying_last'] = underlying_last_on_expiry
    new_row['Price'] = intrinsic_value

    df = df.append(new_row, ignore_index=True)

# Sort the dataframe by Quote_date and Expiry_date
df = df.sort_values(['Quote_date', 'Expiry_date'])

IndexError: single positional indexer is out-of-bounds

In [None]:
print(len(df))

395030


### Make sure prices are available every day for an option group

In [None]:
unique_dates = df['Quote_date'].unique()

option_groups = df.groupby(['Option_ID'])

# Remove option groups if it has a date between it's first and last that is not in unique_dates
for option_id, option_group in option_groups:
    group_first_date = option_group['Quote_date'].iloc[0]
    group_last_date = option_group['Quote_date'].iloc[-1]

    if group_first_date not in unique_dates or group_last_date not in unique_dates:
        df = df.drop(option_group.index)
        print("Dropped option group: ", option_id, " because it's first or last date is not in unique_dates")

    # If every unique_dates between group_first_date and group_last_date is not in group, drop the group
    dates_between = unique_dates[(unique_dates >= group_first_date) & (unique_dates <= group_last_date)]
    if len(dates_between) != len(option_group['Quote_date'].unique()):
        df = df.drop(option_group.index)
        print("Dropped option group: ", option_id, " because it's missing dates between it's first and last date")


    if len(option_group) < 2:
        df = df.drop(option_group.index)
        print("Dropped option group: ", option_id, " because it has less than 2 rows")


In [None]:
df.to_csv(path + '/data/predictions/11.05 1 mnd test sett full model run_with_TTM0_Q1and2_2018_filtered_missing.csv', index=False)