In [4]:
import os
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

In [6]:
import pandas as pd
from ta import add_all_ta_features

In [8]:
raw = pd.read_parquet("raw_prices.parquet")

long = (
    raw
    .stack(level=0, future_stack=True)
    .reset_index()
    .rename(columns={"level_1": "Ticker"})
    .sort_values(["Ticker", "Date"])
)

# ---------- next-day return ----------
HORIZON = 1
long["return_fwd"] = (
    long.groupby("Ticker")["Adj Close"]
        .pct_change(HORIZON)
        .shift(-HORIZON)
)
long["direction"] = (long["return_fwd"] > 0).astype(int)
long = long.dropna(subset=["return_fwd"]).reset_index(drop=True)

# ---------- add technical indicators ----------
df = (
    long.groupby("Ticker", group_keys=False)
        .apply(lambda d: add_all_ta_features(
            d,
            open="Open", high="High", low="Low",
            close="Adj Close", volume="Volume",
            fillna=True))
        .reset_index(drop=True)
)

df.to_parquet("prices_feat.parquet")
df.head()

  long.groupby("Ticker", group_keys=False)


Price,Date,Ticker,Open,High,Low,Close,Adj Close,Volume,return_fwd,direction,...,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,momentum_pvo,momentum_pvo_signal,momentum_pvo_hist,momentum_kama,others_dr,others_dlr,others_cr
0,2018-01-02,AAPL,42.540001,43.075001,42.314999,43.064999,40.426826,102223600,-0.000174,0,...,0.0,0.0,0.0,0.0,0.0,0.0,40.426826,0.0,0.0,0.0
1,2018-01-03,AAPL,43.1325,43.637501,42.990002,43.057499,40.419773,118071600,0.004645,1,...,-0.001392,-0.000278,-0.001113,1.222687,0.244537,0.978149,40.42381,-0.017447,-0.017449,-0.017447
2,2018-01-04,AAPL,43.134998,43.3675,43.02,43.2575,40.607533,89738400,0.011385,1,...,0.034572,0.006692,0.02788,-0.019422,0.191745,-0.211168,40.50377,0.464524,0.463448,0.446995
3,2018-01-05,AAPL,43.360001,43.842499,43.262501,43.75,41.069847,94640000,-0.003714,0,...,0.153375,0.036028,0.117346,-0.623424,0.028712,-0.652135,40.737074,1.138495,1.132063,1.590579
4,2018-01-08,AAPL,43.587502,43.9025,43.482498,43.587502,40.917316,82271200,-0.000115,0,...,0.214514,0.071725,0.142788,-2.088267,-0.394684,-1.693583,40.812491,-0.371393,-0.372085,1.213278


In [9]:
LOOKBACK = 30
OUTFILE  = "tuning_stock.jsonl"

In [22]:
import json, itertools, os

feat_keep = [
    "momentum_rsi", "trend_macd", "trend_sma_fast", "volatility_atr",
    "momentum_stoch_rsi", "trend_adx", "trend_cci", "volume_obv",
    "volatility_bbh", "volume_cmf"
]
feat_cols = [c for c in df.columns if c in feat_keep]

def row_to_line(date_str, row):
    return date_str + " " + ", ".join(f"{k}:{row[k]:.3g}" for k in feat_cols)

records = 0
with open(OUTFILE, "w", encoding="utf-8") as f:
    for i in range(LOOKBACK, len(df)):
        cur  = df.iloc[i]
        hist = df.iloc[i-LOOKBACK:i]
        if (hist["Ticker"] != cur["Ticker"]).any():
            continue  

        prompt = (
            f"You are a quantitative analyst.\n"
            f"Below are 30 consecutive trading days of technical indicators for {cur['Ticker']}.\n"
            f"Predict the percentage return for the NEXT trading day. "
            f"Respond with a single decimal number (e.g. 0.0035).\n\n" +
            "\n".join(row_to_line(d.strftime('%Y-%m-%d'), r)
                       for d, r in zip(hist['Date'], hist.to_dict('records')))
        )
        json.dump({
            "messages":[
                {"role":"system","content":"You output ONLY the number."},
                {"role":"user","content":prompt},
                {"role":"assistant","content":f"{cur['return_fwd']:.5f}"}
            ]
        }, f, ensure_ascii=False)
        f.write("\n")
        records += 1

print(f"generate {records} samples → {OUTFILE} | size {round(os.path.getsize(OUTFILE)/1e6,2)} MB")
# preview
with open(OUTFILE, "r", encoding="utf-8") as f:
    for line in itertools.islice(f, 2):
        print(line[:280] + " ...\n")

generate 8645 samples → tuning_stock.jsonl | size 56.26 MB
{"messages": [{"role": "system", "content": "You output ONLY the number."}, {"role": "user", "content": "You are a quantitative analyst.\nBelow are 30 consecutive trading days of technical indicators for AAPL.\nPredict the percentage return for the NEXT trading day. Respond with  ...

{"messages": [{"role": "system", "content": "You output ONLY the number."}, {"role": "user", "content": "You are a quantitative analyst.\nBelow are 30 consecutive trading days of technical indicators for AAPL.\nPredict the percentage return for the NEXT trading day. Respond with  ...



In [23]:
import tiktoken, json, itertools
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

with open(OUTFILE, "r", encoding="utf-8") as f:
    first = json.loads(next(f))       
token_count = len(enc.encode(" ".join(m["content"] for m in first["messages"])))
print("tokens in first example:", token_count)

tokens in first example: 2651


In [24]:
import os
print("file size (MB):", round(os.path.getsize(OUTFILE)/1e6, 2))

file size (MB): 56.26


In [34]:
from openai import OpenAI
import time, pprint, os

client = OpenAI()             

# 1) upload
file_obj = client.files.create(
    file=open("tuning_stock_20pct.jsonl", "rb"),
    purpose="fine-tune"
)
training_file_id = file_obj.id
print("Uploaded file_id:", training_file_id, "| status:", file_obj.status)

# 2) processing
while True:
    file_status = client.files.retrieve(training_file_id).status
    print("   current status:", file_status)
    if file_status == "processed":
        break
    time.sleep(5)

print("File processing finished!")

Uploaded file_id: file-NredTMpbsS98Gjcx5hftDM | status: processed
   current status: processed
File processing finished!


In [35]:
BASE_MODEL = "gpt-3.5-turbo-0125"          
EPOCHS      = 2                                            

ft_job = client.fine_tuning.jobs.create(
    training_file  = training_file_id,
    model          = BASE_MODEL,
    hyperparameters= {           
        "n_epochs": EPOCHS
    },
    suffix="stock-return-v1"    
)

print("Started fine-tune job:", ft_job.id)

Started fine-tune job: ftjob-w4Lw9x7PaL6xGzwh0MNP6hI2


In [36]:
import pprint, time
job_id = ft_job.id
while True:
    try:
        job = client.fine_tuning.jobs.retrieve(job_id, timeout=30)
        pprint.pp({"status": job.status, "model": job.fine_tuned_model})
        if job.status in ["succeeded", "failed"]:
            break
    except APITimeoutError:
        print("timeout, retrying …")
    time.sleep(60) 

{'status': 'validating_files', 'model': None}
{'status': 'validating_files', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running', 'model': None}
{'status': 'running'

-----------------------------------------------

In [30]:
import random, json

SRC = "tuning_stock.jsonl"
DST = "tuning_stock_20pct.jsonl"
RATE = 0.20      # 20%

with open(SRC, "r", encoding="utf-8") as fin, \
     open(DST, "w", encoding="utf-8") as fout:
    for line in fin:
        if random.random() < RATE:
            fout.write(line)

print("finished:", DST)

finished: tuning_stock_20pct.jsonl


## Plain-prompt 

In [28]:
from openai import OpenAI
import numpy as np, pandas as pd, time, json, itertools, re
client = OpenAI()     

LOOKBACK = 30

# --- test set: 2024 ---
test_df = df[df["Date"].dt.year == 2024].reset_index(drop=True)

# --- technical indicators ---
feat_cols = [
    "momentum_rsi", "trend_macd", "trend_sma_fast", "volatility_atr",
    "momentum_stoch_rsi", "trend_adx", "trend_cci", "volume_obv",
    "volatility_bbh", "volume_cmf"
]

def row_to_line(date_str, row):
    return date_str + " " + ", ".join(f"{k}:{row[k]:.3g}" for k in feat_cols)

def parse_float(text):
    m = re.search(r"[-+]?\d*\.?\d+[eE]?[-+]?\d*", text)
    return float(m.group()) if m else 0.0

In [None]:
pred_plain = []
for i in range(LOOKBACK, len(test_df)):
    cur  = test_df.iloc[i]
    hist = test_df.iloc[i-LOOKBACK:i]
    if (hist["Ticker"] != cur["Ticker"]).any():
        continue

    prompt = (
      f"You are a quantitative analyst.\n"
      f"Below are 30 consecutive trading days of technical indicators for {cur['Ticker']}.\n"
      f"Predict the percentage return for the NEXT trading day. "
      f"Respond with a single decimal number (e.g. 0.0035).\n\n" +
      "\n".join(row_to_line(d.strftime('%Y-%m-%d'), r)
                for d,r in zip(hist['Date'], hist.to_dict('records')))
    )

    resp = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[{"role":"user","content":prompt}],
        temperature=0,
        timeout=30
    )
    y_hat = parse_float(resp.choices[0].message.content)
    pred_plain.append((i, y_hat))
    time.sleep(1.2)       

print(f"plain-prompt predictions: {len(pred_plain)} rows")

In [None]:
idx, y_hat = zip(*pred_plain)
y_true = test_df.loc[list(idx), "return_fwd"].values
y_pred = np.array(y_hat)

mse  = np.mean((y_true - y_pred)**2)
rmse = np.sqrt(mse)
signal = np.sign(y_pred); signal[signal==0] = -1
sharpe = (signal * y_true).mean() / (signal * y_true).std(ddof=1) * np.sqrt(252)

print("Plain-prompt ➜", {"MSE": round(mse,6), "RMSE": round(rmse,6), "Sharpe": round(sharpe,3)})