In [None]:
import os
import sys
from pathlib import Path

# Make the parent of the current working directory importable
# (presumably so project-local packages resolve when the notebook is run
# from a sub-directory — confirm against the repository layout).
# The membership check keeps this cell idempotent: re-running it does not
# append the same entry to `sys.path` a second time.
_parent_dir = Path.cwd().parents[0].as_posix()
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [None]:
import pandas as pd
import polars as pl

from finnews.data import load_and_merge_csv_files

# Definitions

In [None]:
# Input / output locations, relative to the working directory.
root_input_dir = os.path.join("data", "raw")
root_output_dir = os.path.join("data", "processed")

# How many of the most-covered symbols are kept in the sample.
n_symbols = 20

# Symbols left out of the ranking.
etf_symbols = ["PMAY"]

# `train_start` is the lower date bound applied to articles and prices;
# `val_start` / `test_start` are presumably split boundaries used
# downstream — confirm.
train_start, val_start, test_start = "2010-01-01", "2023-06-01", "2023-09-01"

# Articles

Selecting a sample of stocks and articles:

- Top 20 symbols by article count (`n_symbols`), excluding ETFs
- From 2010

In [None]:
# Path to the raw article CSV inside `root_input_dir`.
# NOTE(review): the file name is spelled "exteral" — presumably this matches
# the file on disk; confirm before "fixing" the spelling.
article_path = os.path.join(root_input_dir, "nasdaq_exteral_data.csv")

In [None]:
# Open the article CSV as a lazy polars query; the data is only read when
# the query is collected or sunk to a file.
dl = pl.scan_csv(article_path)

In [None]:
# Show the column names from the lazy frame's schema.
dl.collect_schema().names()

In [None]:
# Add two derived columns to the lazy frame:
#   * "date"  - calendar date taken from the "Date" timestamp string,
#               parsed strictly with the format below as a UTC datetime
#   * "index" - the "Unnamed: 0" column cast to Int32
dl = dl.with_columns(
    date=(
        pl.col("Date")
        .str.strptime(
            pl.Datetime(time_zone="UTC"),
            "%Y-%m-%d %H:%M:%S %Z",
            strict=True,
        )
        .dt.date()
    ),
    index=pl.col("Unnamed: 0").cast(pl.Int32),
)

In [None]:
# Number of rows per (date, symbol) pair, collected into an eager frame.
# NOTE(review): recent polars releases deprecate `streaming=True` in favour
# of `engine="streaming"` — confirm against the pinned polars version.
daily_counts = (
    dl.group_by("date", "Stock_symbol")
    .agg(pl.len().alias("count"))
    .collect(streaming=True)
)

In [None]:
# Convert the collected polars frame to pandas.
df_daily_counts = daily_counts.to_pandas()

In [None]:
# Rank symbols by their total count from `train_start` onward, ignoring the
# symbols listed in `etf_symbols`, and keep the `n_symbols` largest.
df_sample = (
    df_daily_counts[
        (df_daily_counts["date"] >= train_start)
        & ~df_daily_counts["Stock_symbol"].isin(etf_symbols)
    ]
    .groupby("Stock_symbol", as_index=False)[["count"]]
    .sum()
    .sort_values("count", ascending=False, ignore_index=True)
    .head(n_symbols)
)

In [None]:
# Total count summed over the selected symbols.
df_sample["count"].sum()

In [None]:
# Display the selected symbols with their summed counts.
df_sample

In [None]:
# Plain list of the selected symbols.
symbols = df_sample["Stock_symbol"].tolist()

In [None]:
# Lazily restrict the articles to the selected symbols from `train_start`
# onward, keep four columns, and give the symbol/article columns
# lower-case names.
dl_sample = (
    dl.filter(
        pl.col("Stock_symbol").is_in(symbols),
        pl.col("date") >= pd.to_datetime(train_start).to_pydatetime(),
    )
    .select("index", "date", "Stock_symbol", "Article")
    .rename({"Stock_symbol": "symbol", "Article": "article"})
)

In [None]:
# Write the sampled articles to CSV via the lazy sink.
# Create the output directory first so the write does not depend on it
# already existing (the sink call below is not asked to create it).
os.makedirs(root_output_dir, exist_ok=True)
dl_sample.select(["index", "date", "symbol", "article"]).sink_csv(
    os.path.join(root_output_dir, "articles.csv")
)

In [None]:
# Write the same four columns of the sampled articles to Parquet.
dl_sample.select("index", "date", "symbol", "article").sink_parquet(
    os.path.join(root_output_dir, "articles.parquet")
)

# Prices

In [None]:
# Load the price history for the selected symbols via the project helper.
# The directory is built from `root_input_dir` rather than a hard-coded
# "data/raw/..." string, so changing the configured input root also moves
# the price lookup.
df_prices = load_and_merge_csv_files(
    directory=os.path.join(root_input_dir, "full_history"),
    symbols=symbols,
)

In [None]:
# Persist the prices dated on or after `train_start`, with the row index
# reset, as Parquet in the output directory.
(
    df_prices[df_prices["date"] >= train_start]
    .reset_index(drop=True)
    .to_parquet(os.path.join(root_output_dir, "prices.parquet"))
)