In [52]:
%matplotlib widget
import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import torch
from sklearn import linear_model

import pynance

# Minute-resolution timestamp (YYYYMMDDHHMM) captured once when this cell runs.
saving_timestamp = f"{datetime.datetime.now():%Y%m%d%H%M}"

In [None]:
# Resolve the data and results directories through pynance's user path helpers,
# then create the results directory (and any missing parents) if it does not exist yet.
# NOTE(review): the original comment here read "next line should be commented";
# the intent is unclear -- confirm whether the data_path lookup is meant to stay enabled.
data_path = pynance.utils.user.get_path_to_data()
results_path = pynance.utils.user.get_path_to_results()
results_path.mkdir(parents=True, exist_ok=True)

## Import data using the Data module


In [None]:
# Date range of the price history to fetch ('1999-01-01' is an alternative, earlier start).
start_date = "2015-01-01"
end_date = "2023-01-01"

# Ticker used as the market reference in the rest of the notebook.
market = "^IXIC"

# Tickers read from 'tech_us', with the market ticker appended as the last element.
x = pynance.data.readers.read_txt("tech_us") + [market]

# Per-ticker data keyed by ticker symbol; the market entry is pulled out on its own.
dict_stocks = pynance.data.readers.get_financial_datas(
    x,
    start=start_date,
    end=end_date,
    conversion=True,
)
df_market = dict_stocks[market]

In [None]:
# Display the market frame selected above as a quick visual check.
df_market

## Market stock future prediction

Demo notebook: naive training, plotting, etc.

### Define dataset

In [None]:
# CSV of the cac40 with Close "normalized" as (x - mean)/std.
# NOTE(review): `path` is not used by any other visible cell -- confirm it is still needed.
path = data_path / "cac40_norm.csv"

# Tensor settings shared by the collater below and by the model / test cells.
dtype = torch.float
device = torch.device("cpu")

# Windowing, batching and split settings.
window = 100
batch_size = 16
ratio = 0.8
return_type = "torch"

# Build the train / validation datasets from the market frame.
data_creator = pynance.utils.datasets.creators.StockValuePredictionDatasetCreator(df_market)
train_set, valid_set = data_creator.get_train_sets(
    window=window,
    ratio=ratio,
    return_type=return_type,
)

collater = pynance.utils.datasets.collaters.TimeSeriesCollater(device=device, dtype=dtype)

# Training batches are shuffled; validation batches keep the dataset order.
training_loader = torch.utils.data.DataLoader(
    train_set,
    collate_fn=collater,
    batch_size=batch_size,
    num_workers=4,
    shuffle=True,
)
validation_loader = torch.utils.data.DataLoader(
    valid_set,
    collate_fn=collater,
    batch_size=batch_size,
    num_workers=4,
    shuffle=False,
)

### Models

In [None]:
# Build the forecasting model (pynance's TFnaive) with a single input feature,
# a hidden size of 2 and 2 layers, then move it to the configured device and dtype.
# NOTE(review): the variable is called `rnn`, but the class name is TFnaive --
# confirm which architecture this actually is.
rnn = pynance.model.forecasting.TFnaive(
    input_size=1,
    hidden_size=2,
    num_layers=2
).to(device=device, dtype=dtype)


### Training

In [None]:
# Optimisation setup: mean-squared-error loss, and Adam over all model parameters
# with a learning rate of 1e-3.
loss_function = torch.nn.MSELoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.001)

In [None]:
# Run pynance's training helper for 10 epochs with the loaders, loss and optimizer
# defined above. `saving_path` / `saving_name` are handed to the helper;
# presumably checkpoints are written under results_path with "nasdaq_1" in the
# file name -- confirm against pynance.utils.train.train.
pynance.utils.train.train(
    epochs=10,
    model=rnn,
    loss_fn=loss_function,
    training_loader=training_loader,
    validation_loader=validation_loader,
    optimizer=optimizer,
    saving_path=results_path,
    saving_name="nasdaq_1"
)

### "Test" the model

In [None]:
# Checkpoint to evaluate. The hard-coded file name belongs to one specific past
# training run (note the embedded timestamp), so it will not exist after a fresh
# Restart & Run All.
state_dict_path = results_path/"model_state_dict_nasdaq_1_20230108_122337_9.pt"
if not state_dict_path.exists():
    # Fall back to the most recently written checkpoint that follows the same
    # "nasdaq_1" naming pattern as the hard-coded file.
    # NOTE(review): the pattern is inferred from the file name above -- confirm it
    # matches what pynance.utils.train.train actually writes.
    candidates = sorted(
        results_path.glob("model_state_dict_nasdaq_1_*.pt"),
        key=lambda p: p.stat().st_mtime,
    )
    if not candidates:
        raise FileNotFoundError(
            f"No 'model_state_dict_nasdaq_1_*.pt' checkpoint found in {results_path}; "
            "run the training cell first."
        )
    state_dict_path = candidates[-1]

# Feed the full Close series of the market frame to the model as one sequence,
# shaped (1, n_steps, 1) by the two unsqueeze calls.
df = df_market # pd.read_csv(path, parse_dates=["Date"]).sort_values(by="Date")
X_test = torch.as_tensor(df["Close"].values, dtype=dtype, device=device)
X_test = X_test.unsqueeze(0).unsqueeze(-1)

# map_location keeps loading working when the checkpoint was saved from another
# device (e.g. trained on GPU, evaluated on CPU).
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
state_dict = torch.load(state_dict_path, map_location=device)
rnn.load_state_dict(state_dict)

In [None]:
# Run the model's predict method on the full Close series with window=10.
# `out` is indexed as out[0] and out[1] when the predictions are assembled later;
# the exact meaning of each element and of `window` is defined by
# TFnaive.predict -- TODO confirm.
out = rnn.predict(X_test, window=10)

In [None]:
# Build the dates that go with the predictions.
# TODO: starting from init_date alone is not enough when the series contains NaN values.
# How exactly should NaN values be handled?
# I believe we should average the value before and the value after.
# See: http://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
# Remember: the data is assumed to be perfect; cleaning it is not my job here...
def make_dates(init_date, length_preds):
    """Return the `length_preds` weekdays (Mon-Fri) that follow `init_date`.

    Args:
        init_date: starting date; it is never part of the result. Must support
            adding a `datetime.timedelta` and expose `isoweekday()`.
        length_preds: number of dates to produce.

    Returns:
        A list of dates in increasing order, skipping Saturdays and Sundays.
        Empty when `length_preds` is zero or negative. Public holidays are not
        skipped.
    """
    one_day = datetime.timedelta(days=1.)
    weekdays = []
    current = init_date
    while len(weekdays) < length_preds:
        current = current + one_day
        # isoweekday(): Monday == 1 ... Sunday == 7, so 6 and 7 are the weekend.
        if current.isoweekday() > 5:
            continue
        weekdays.append(current)
    return weekdays

In [None]:
# Concatenate out[1] with the part of out[0] that lies beyond the input length
# along the time axis (dim=1), then keep the first batch item and first feature
# as a 1-D numpy array.
# NOTE(review): what out[0] / out[1] hold is defined by TFnaive.predict -- confirm.
# .detach() makes the conversion safe even if predict() returns tensors that still
# track gradients (Tensor.numpy() raises on those).
preds = (
    torch.cat([out[1], out[0][:, X_test.shape[1]:]], dim=1)
    .detach()
    .cpu()
    .numpy()[0, :, 0]
)

# One weekday per prediction, counted from the first date of the market frame.
dates = make_dates(df["Date"].iloc[0], len(preds))

In [None]:
# Plot the true Close series against the model predictions on a single, explicit axes
# (rather than relying on pyplot's implicit "current axes").
fig, ax = plt.subplots()
sns.lineplot(data=df, x="Date", y="Close", label="true", ax=ax)
sns.lineplot(x=dates, y=preds, label="pred", ax=ax)
ax.set_title(f"{market}: true Close vs. prediction")
ax.tick_params(axis="x", labelrotation=45);

## Regression Models

### Data

In [None]:
# One column of closing values per ticker in dict_stocks (the market ticker included).
df = pd.DataFrame(
    {
        ticker: frame[pynance.utils.conventions.close_name]
        for ticker, frame in dict_stocks.items()
    }
)

# Expose the index as an explicit date column as well.
df[pynance.utils.conventions.date_name] = df.index
# TODO: check that NaN values are handled correctly (in theory they are).

In [None]:
# Regression dataset creator built from the per-ticker Close frame. The same frame is
# passed for both positional arguments -- presumably the train and test sources;
# confirm against StockValueRegressionDatasetCreator's signature.
# Note: this rebinds `data_creator`, replacing the forecasting creator defined earlier.
data_creator = pynance.utils.datasets.creators.StockValueRegressionDatasetCreator(df, df, market=market)

In [None]:
# Split into train / validation inputs and targets, returned as numpy arrays.
# ratio=0.8 is presumably the share of samples used for training -- TODO confirm.
x_train, x_valid, y_train, y_valid = data_creator.get_train_sets(ratio=0.8, return_type="numpy")

In [None]:
# Fit one ordinary least-squares model per stock (every ticker in `x` except the
# trailing market ticker) and report its R^2 score on the validation split.
# `linear_model` comes from `from sklearn import linear_model` in the imports cell;
# a bare `import sklearn` does not bind that name, so without the explicit import
# this cell fails with NameError on a fresh kernel.
# Column i of y_train / y_valid is assumed to correspond to the i-th ticker of
# x[:-1] -- TODO confirm against the dataset creator.
regs = {}
for i, stock in enumerate(x[:-1]):
    # Keep the target 2-D (n_samples, 1) so predictions have the same shape.
    reg = linear_model.LinearRegression().fit(x_train, y_train[:, i:i+1])
    regs[stock] = reg
    score = reg.score(x_valid, y_valid[:, i:i+1])
    print(f"{stock} : {score}")


In [None]:
# Dates used as the x-axis of the test plots. This reads the creator's private
# `_test_data` attribute -- NOTE(review): prefer a public accessor if pynance offers one.
# Note: this rebinds `dates`, which earlier held the forecasting dates.
dates = data_creator._test_data["Date"]
# Test inputs and targets as numpy arrays.
x_test, y_test = data_creator.get_test_set(return_type="numpy")

In [None]:
# One panel per fitted stock, comparing the test targets with the regression output.
# The grid is sized from the number of fitted models instead of a fixed 3x3, so more
# than nine stocks no longer raise IndexError and fewer leave no empty panels.
n_cols = 3
n_rows = max(1, -(-len(regs) // n_cols))  # ceiling division, at least one row
fig, ax = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(12, 10 * n_rows / 3),  # same 12x10 size as before for a 3-row grid
    constrained_layout=True,
    squeeze=False,  # always get a 2-D array of axes so flatten() works for any grid
)
ax = ax.flatten()
for i, (stock, reg) in enumerate(regs.items()):
    preds = reg.predict(x_test)
    sns.lineplot(x=dates, y=np.squeeze(y_test[:, i:i+1]), label="truth", ax=ax[i])
    sns.lineplot(x=dates, y=np.squeeze(preds), label="pred", ax=ax[i])
    ax[i].set_title(stock)
    # Rotate only the date labels, matching the single-series plot earlier in the notebook.
    ax[i].tick_params(axis="x", labelrotation=45)

# Hide any panels left over when the number of stocks is not a multiple of n_cols.
for unused_ax in ax[len(regs):]:
    unused_ax.set_visible(False)
