# Test Modeller for SPY Option Calls

In [179]:
# %pip install openai

In [180]:
# Load in data
import pandas as pd
import base64

DATA_PATH='data/raw.csv'

data = pd.read_csv(DATA_PATH)
data['text'] = data['text'].apply(lambda x: base64.b64decode(x).decode())
data

Unnamed: 0,date,text,one_day_call,one_day_put,two_day_call,two_day_put
0,2024/05/24 13:19:00,Trends Takeaways this Week\r\n\r\nPublished Ma...,,,,
1,2024/05/23 13:30:00,"Cost of Living\r\n\r\nPublished May 23, 2024\r...",,,,
2,2024/05/22 12:06:00,"Cosmetic Procedures\r\n\r\nPublished May 22, 2...",2.20,-6.14,-0.59,-3.98
3,2024/05/21 14:42:00,"All Things Summer\r\n\r\nPublished May 21, 202...",1.40,-2.38,3.08,-5.26
4,2024/05/20 13:02:00,"Graduation 🎓\r\n\r\nPublished May 20, 2024\r\n...",0.35,-2.09,0.21,-3.57
...,...,...,...,...,...,...
56,2024/03/01 13:05:00,Making Friends • Loneliness\r\n\r\nPublished M...,3.94,1.74,0.53,-5.35
57,2024/02/29 13:19:00,"Dementia\r\n\r\nPublished February 29, 2024\r\...",5.62,0.89,6.53,4.33
58,2024/02/28 13:03:00,IVF & Reproductive Health\r\n\r\nPublished Feb...,3.97,-0.42,7.52,2.79
59,2024/02/27 12:38:00,Michigan Primary elections\r\n\r\nPublished Fe...,1.51,-0.39,4.39,0.00


In [181]:
import sqlite3
con = sqlite3.connect("data/data.db")
cur = con.cursor()
cur.execute("DROP TABLE trends")
cur.execute("CREATE TABLE IF NOT EXISTS trends(date PRIMARY KEY, text, embeddings, one_day_call, one_day_put, two_day_call, two_day_put)")

<sqlite3.Cursor at 0x16df5e8c0>

In [182]:
cur.execute("SELECT * FROM trends")
db_data = cur.fetchall()
db_data_df = pd.DataFrame(db_data, columns=["date", "text", "embeddings", "one_day_call", "one_day_put", "two_day_call", "two_day_put"])


In [183]:
common_columns = list(set(data.columns) & set(db_data_df.columns))
csv_data_common = data[common_columns]
db_data_common = db_data_df[common_columns]
deltas = csv_data_common.merge(db_data_common, how='outer', indicator=True).loc[lambda x: x['_merge'] != 'both']
deltas

Unnamed: 0,two_day_put,one_day_put,two_day_call,one_day_call,date,text,_merge
0,-11.72,-6.22,0.00,0.52,2024/04/11 13:00:00,"Taxes • Finances\r\n\r\nPublished April 11, 20...",left_only
1,-10.07,-7.96,-1.75,-0.38,2024/04/29 12:53:00,"Puzzles & Games\r\n\r\nPublished April 29, 202...",left_only
2,-9.59,-8.22,-5.30,3.50,2024/04/12 12:28:00,Mercury Retrograde • Astrology\r\n\r\nPublishe...,left_only
3,-7.98,-2.34,-1.20,1.90,2024/04/23 12:51:00,"Gardening🌷\r\n\r\nPublished April 23, 2024\r\n...",left_only
4,-7.76,-5.50,-3.69,0.90,2024/04/05 13:02:00,"Taiwan Earthquake\r\n\r\nPublished April 5, 20...",left_only
...,...,...,...,...,...,...,...
56,7.82,4.46,10.80,9.36,2024/05/14 13:11:00,"WNBA Tip-Off 🏀\r\n\r\nPublished May 14, 2024\r...",left_only
57,8.14,-0.87,12.13,5.47,2024/05/01 13:00:00,Honoring Asian and Pasifika Ancestry Month‬‬\r...,left_only
58,10.96,6.22,14.27,10.21,2024/05/02 12:50:00,"Campus Protests\r\n\r\nPublished May 2, 2024\r...",left_only
59,,,,,2024/05/23 13:30:00,"Cost of Living\r\n\r\nPublished May 23, 2024\r...",left_only


In [184]:
from openai import OpenAI
client = OpenAI()

def get_embeddings(text):
  response = client.embeddings.create(
      input=text,
      model="text-embedding-3-small"
  )
  return response.data[0].embedding

deltas['embeddings'] = deltas['text'].apply(lambda x: get_embeddings(x))
deltas

Unnamed: 0,two_day_put,one_day_put,two_day_call,one_day_call,date,text,_merge,embeddings
0,-11.72,-6.22,0.00,0.52,2024/04/11 13:00:00,"Taxes • Finances\r\n\r\nPublished April 11, 20...",left_only,"[0.008701847866177559, 0.024174364283680916, 0..."
1,-10.07,-7.96,-1.75,-0.38,2024/04/29 12:53:00,"Puzzles & Games\r\n\r\nPublished April 29, 202...",left_only,"[0.0016986075788736343, 0.016070304438471794, ..."
2,-9.59,-8.22,-5.30,3.50,2024/04/12 12:28:00,Mercury Retrograde • Astrology\r\n\r\nPublishe...,left_only,"[-0.01705741696059704, 0.01831475831568241, 0...."
3,-7.98,-2.34,-1.20,1.90,2024/04/23 12:51:00,"Gardening🌷\r\n\r\nPublished April 23, 2024\r\n...",left_only,"[0.01311933621764183, 0.02451184019446373, 0.0..."
4,-7.76,-5.50,-3.69,0.90,2024/04/05 13:02:00,"Taiwan Earthquake\r\n\r\nPublished April 5, 20...",left_only,"[0.005697894841432571, 0.0003969444951508194, ..."
...,...,...,...,...,...,...,...,...
56,7.82,4.46,10.80,9.36,2024/05/14 13:11:00,"WNBA Tip-Off 🏀\r\n\r\nPublished May 14, 2024\r...",left_only,"[0.00730225769802928, 0.019008571282029152, 0...."
57,8.14,-0.87,12.13,5.47,2024/05/01 13:00:00,Honoring Asian and Pasifika Ancestry Month‬‬\r...,left_only,"[0.008942820131778717, 0.023462722077965736, 0..."
58,10.96,6.22,14.27,10.21,2024/05/02 12:50:00,"Campus Protests\r\n\r\nPublished May 2, 2024\r...",left_only,"[0.024588940665125847, 0.019006017595529556, 0..."
59,,,,,2024/05/23 13:30:00,"Cost of Living\r\n\r\nPublished May 23, 2024\r...",left_only,"[0.0012548139784485102, 0.02510807290673256, 0..."


In [185]:
deltas.embeddings = deltas.embeddings.apply(lambda x: str(x))
deltas.drop(columns='_merge').to_sql('trends', con, if_exists='append', index=False)

61

In [186]:
cur.execute("SELECT * FROM trends where one_day_call IS NOT NULL")
db_data = cur.fetchall()
db_data_df = pd.DataFrame(db_data, columns=["date", "text", "embeddings", "one_day_call", "one_day_put", "two_day_call", "two_day_put"])
cur.close()
con.close()

In [187]:
import numpy as np
from ast import literal_eval
# embeddings saved as a string into db - convert back to np array
db_data_df.embeddings = db_data_df.embeddings.apply(lambda x: np.array(literal_eval(x)))

In [188]:

# Create a new column that contains the combined values from the last 4 columns
DATA_1D_LABEL = 'one_day_labels'
DATA_2D_LABEL = 'two_day_labels'
db_data_df[DATA_1D_LABEL] = db_data_df[["one_day_call", "one_day_put"]].apply(lambda row: np.array(row), axis=1)
db_data_df[DATA_2D_LABEL] = db_data_df[["two_day_call", "two_day_put"]].apply(lambda row: np.array(row), axis=1)
db_data_df

Unnamed: 0,date,text,embeddings,one_day_call,one_day_put,two_day_call,two_day_put,one_day_labels,two_day_labels
0,2024/04/11 13:00:00,"Taxes • Finances\r\n\r\nPublished April 11, 20...","[0.008701847866177559, 0.024174364283680916, 0...",0.52,-6.22,0.0,-11.72,"[0.52, -6.22]","[0.0, -11.72]"
1,2024/04/29 12:53:00,"Puzzles & Games\r\n\r\nPublished April 29, 202...","[0.0016986075788736343, 0.016070304438471794, ...",-0.38,-7.96,-1.75,-10.07,"[-0.38, -7.96]","[-1.75, -10.07]"
2,2024/04/12 12:28:00,Mercury Retrograde • Astrology\r\n\r\nPublishe...,"[-0.01705741696059704, 0.01831475831568241, 0....",3.5,-8.22,-5.3,-9.59,"[3.5, -8.22]","[-5.3, -9.59]"
3,2024/04/23 12:51:00,"Gardening🌷\r\n\r\nPublished April 23, 2024\r\n...","[0.01311933621764183, 0.02451184019446373, 0.0...",1.9,-2.34,-1.2,-7.98,"[1.9, -2.34]","[-1.2, -7.98]"
4,2024/04/05 13:02:00,"Taiwan Earthquake\r\n\r\nPublished April 5, 20...","[0.005697894841432571, 0.0003969444951508194, ...",0.9,-5.5,-3.69,-7.76,"[0.9, -5.5]","[-3.69, -7.76]"
5,2024/04/04 13:59:00,Political Issues in Florida\r\n\r\nPublished A...,"[-0.003175552701577544, 0.033360254019498825, ...",-1.42,-7.85,-1.11,-7.51,"[-1.42, -7.85]","[-1.11, -7.51]"
6,2024/04/17 13:21:00,"March Beauty Report\r\n\r\nPublished April 17,...","[0.0073063489980995655, 0.03637780249118805, 0...",2.81,-2.76,-0.86,-7.46,"[2.81, -2.76]","[-0.86, -7.46]"
7,2024/03/13 13:23:00,"Personality Tests\r\n\r\nPublished March 13, 2...","[0.004279895685613155, 0.012967271730303764, 0...",1.72,-3.59,-3.71,-7.29,"[1.72, -3.59]","[-3.71, -7.29]"
8,2024/04/26 12:39:00,College Admissions and Decisions\r\n\r\nPublis...,"[0.01751658506691456, 0.022979073226451874, 0....",1.53,-1.97,0.34,-7.24,"[1.53, -1.97]","[0.34, -7.24]"
9,2024/04/30 12:23:00,"Summer Travel\r\n\r\nPublished April 30, 2024\...","[0.01947743073105812, 0.03628331422805786, 0.0...",1.42,-6.9,-0.88,-7.22,"[1.42, -6.9]","[-0.88, -7.22]"


In [189]:
TRAIN_TEST_RATIO = 0.2

def split_dataset(X, y1, y2, test_ratio):
    shuffled_indices = np.random.permutation(len(X))
    test_set_size = int(len(X) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return list(X.iloc[train_indices].values), y1.iloc[train_indices].to_numpy(), y2.iloc[train_indices].to_numpy(), list(X.iloc[test_indices].values), y1.iloc[test_indices].to_numpy(), y2.iloc[test_indices].to_numpy()
X_train, t1_train, t2_train, X_test, t1_test, t2_test = split_dataset(db_data_df.embeddings, db_data_df[DATA_1D_LABEL], db_data_df[DATA_2D_LABEL],  TRAIN_TEST_RATIO)

#print(f"X_train.shape:\t{X_train.shape}")
print(f"t1_train.shape:\t{t1_train.shape}")
#print(f"X_test.shape:\t{X_test.shape}")
print(f"t1_test.shape:\t{t1_test.shape}")

t1_train.shape:	(48,)
t1_test.shape:	(11,)


In [190]:
t1_train_reshaped = np.vstack(t1_train)
t1_train_reshaped.shape
t1_test_reshaped = np.vstack(t1_test)
t1_test_reshaped.shape

(11, 2)

In [191]:
t2_train_reshaped = np.vstack(t2_train)
t2_train_reshaped.shape
t2_test_reshaped = np.vstack(t2_test)
t2_test_reshaped.shape

(11, 2)

In [192]:
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.svm import LinearSVR, SVR

# Feature Extraction
pipe = Pipeline([
  ('reg', MultiOutputRegressor(estimator=HistGradientBoostingRegressor()))
])

In [193]:
X_train

[array([ 0.01947743,  0.03628331,  0.04793249, ..., -0.00169107,
         0.04690737, -0.01396348]),
 array([ 0.0146922 ,  0.01633583,  0.04917976, ..., -0.0045146 ,
         0.02499182, -0.00599674]),
 array([ 0.02804204,  0.022976  ,  0.03614771, ..., -0.00217914,
         0.04011114, -0.00888793]),
 array([ 0.02867993,  0.02555728,  0.03571349, ...,  0.00418375,
         0.04080676, -0.02930143]),
 array([ 0.02325836,  0.02092204,  0.03759079, ..., -0.0182862 ,
         0.01462446, -0.02072735]),
 array([ 0.02240938,  0.03857428,  0.04112744, ..., -0.00660208,
         0.04767953, -0.02033301]),
 array([ 0.00730226,  0.01900857,  0.05072988, ..., -0.00411651,
         0.03710909, -0.01213007]),
 array([0.01318945, 0.02877971, 0.05020695, ..., 0.00015333, 0.02231253,
        0.00159523]),
 array([ 0.01632926,  0.00455574,  0.06329917, ..., -0.01525824,
         0.00757091,  0.00034003]),
 array([ 0.0119659 ,  0.02989165,  0.02724283, ..., -0.01938877,
         0.03615951, -0.02162178

In [194]:
from sklearn.metrics import make_scorer

def mse(y_true, y_pred):
  diff = np.square(y_true - y_pred)
  return np.mean(diff, axis=0)

def mse_scorer(y_true, y_pred):
  diff = np.square(y_true - y_pred)
  return np.mean(diff)

score = make_scorer(mse_scorer, greater_is_better=False)

In [195]:
from sklearn.model_selection import GridSearchCV
ESTIMATORS = [
  RandomForestRegressor() # test a bunch of hyper paramters with small sample size default was fine
  ]
param_grid = [
    {
        "reg__estimator": ESTIMATORS
    }
]
grid1 = GridSearchCV(pipe, param_grid=param_grid, scoring=score)
grid1.fit(X_train, t1_train_reshaped)

grid2 = GridSearchCV(pipe, param_grid=param_grid, scoring=score)
grid2.fit(X_train, t2_train_reshaped)

In [196]:
model1 = grid1.best_estimator_
model2 = grid2.best_estimator_
"""
Test Mean Squared Error:	[12.89855858 12.30937937 17.67518915 33.93445874]
Test Score:	-19.204396459383766
Train Mean Squared Error:	[ 8.16264177 13.24241248 19.4484513  28.6775945 ]
Train Score:	-17.38277501221715
"""

print(f"Test Mean Squared Error:\t{mse(t1_test_reshaped, model1.predict(X_test))}")
print(f"Test Score:\t{score(model1, X_test, t1_test_reshaped)}")
print(f"Train Mean Squared Error:\t{mse(t1_train_reshaped, model1.predict(X_train))}")
print(f"Train Score:\t{score(model1, X_train, t1_train_reshaped)}")

Test Mean Squared Error:	[10.97789999 10.78498987]
Test Score:	-10.881444929999999
Train Mean Squared Error:	[1.16464004 1.81331068]
Train Score:	-1.4889753577083322


In [197]:
# Predict
filepath1 = "data/1d-outlook.txt"
filepath2 = "data/2d-outlook.txt" 

with open(filepath1, 'r') as file:
    predict_data1 = base64.b64decode(file.read()).decode()
with open(filepath2, 'r') as file:
    predict_data2 = base64.b64decode(file.read()).decode()
predict_data1 = get_embeddings(predict_data1)
predict_data2 = get_embeddings(predict_data2)

In [198]:
model1.predict(np.array(predict_data1).reshape(1,-1))

array([[ 3.0613, -1.7401]])

In [199]:
model2.predict(np.array(predict_data1).reshape(1,-1))

array([[ 2.2462, -1.3931]])

In [200]:
model1.predict(np.array(predict_data2).reshape(1,-1))

array([[ 2.9679, -1.1166]])

In [201]:
model2.predict(np.array(predict_data2).reshape(1,-1))

array([[ 2.4855, -2.1965]])