--- 
# Sandbox (Ambroise)
Tests for each part of this project

---

### Imports

In [27]:
%load_ext autoreload
%autoreload 2

#external libraries
import polars as pl
import pandas as pd
from datetime import date
import nest_asyncio
import os
import yfinance as yf
import datetime
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score


#internal libraries
from prompts import process_in_batches, import_data, preprocess_responses

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Importing data test

In [None]:
titles = import_data()


In [None]:
titles.head()

## Prompting test

### Basic sentences
Adapted from the mistral website

In [None]:
responses = preprocess_responses()

In [None]:
# responses.write_csv("scores.csv")

In [None]:
# testing on half of the dataset first
titles = titles.filter(pl.col('date').lt(date(2024,1,1)))

In [None]:
# nest_asyncio.apply()
# prompts = titles.select('prompts').to_series().to_list()
# csv_path = "responses_2024_2025.csv"
# first_write = not os.path.exists(csv_path)

# async for batch in process_in_batches(prompts, batch_size=200, concurrency=10):
#     batch_df = pl.DataFrame({
#         "index": [r[0] for r in batch],
#         "prompt": [r[1] for r in batch],
#         "response": [r[2] for r in batch],
#     })

#     csv_string = batch_df.write_csv(separator=",", include_header=first_write)
#     with open(csv_path, "a", encoding="utf-8") as f:
#         f.write(csv_string)

#     first_write = False
#     print(f"Wrote {len(batch)} rows to {csv_path}")


In [None]:
pl.read_csv('responses_2024_2025.csv').select(pl.col('response').str.slice(0,10))

In [3]:
scores = pl.read_csv('scores.csv')

In [6]:
scores.head()

date,increase,decrease,diff,score
str,i64,i64,i64,f64
"""2017-09-29""",0,1,-1,-1.0
"""2017-10-31""",2,0,2,1.0
"""2017-11-12""",1,0,1,1.0
"""2017-11-13""",0,0,0,
"""2017-11-14""",0,1,-1,-1.0


In [4]:
scores = pl.read_csv(
  "scores.csv",
  dtypes={"date": pl.Date}
)


  scores = pl.read_csv(


In [None]:
min_date = scores["date"].min()
max_date = scores["date"].max()

start_str = min_date.strftime("%Y-%m-%d")
end_str   = (max_date + datetime.timedelta(days=1)).strftime("%Y-%m-%d")

df_pd = yf.download(
    tickers="BTC-USD",
    start=start_str,
    end=end_str,
    interval="1d",
    progress=False
)


if isinstance(df_pd.columns, pd.MultiIndex):
    df_pd.columns = df_pd.columns.get_level_values(0)

df_pd = df_pd.reset_index()
df_price = (
    pl.from_pandas(df_pd)
      .select([
         pl.col("Date").alias("date").cast(pl.Date),
         pl.col("Close").alias("close")
      ])
)

print(df_price.head())


shape: (5, 2)
┌────────────┬─────────────┐
│ date       ┆ close       │
│ ---        ┆ ---         │
│ date       ┆ f64         │
╞════════════╪═════════════╡
│ 2017-09-29 ┆ 4163.069824 │
│ 2017-09-30 ┆ 4338.709961 │
│ 2017-10-01 ┆ 4403.740234 │
│ 2017-10-02 ┆ 4409.319824 │
│ 2017-10-03 ┆ 4317.47998  │
└────────────┴─────────────┘


In [None]:
# for each date, if next date is higher : add 1, else 0
df_price = df_price.with_columns([
    (pl.col("close").shift(-1) > pl.col("close")).cast(pl.Int8).alias("actual")
])

In [33]:
df_price

date,close,actual
date,f64,i8
2017-09-29,4163.069824,1
2017-09-30,4338.709961,1
2017-10-01,4403.740234,1
2017-10-02,4409.319824,0
2017-10-03,4317.47998,0
…,…,…
2025-01-03,98107.429688,1
2025-01-04,98236.226562,1
2025-01-05,98314.960938,1
2025-01-06,102078.085938,0


In [None]:
df = scores.join(df_price.select(["date","actual"]), on="date", how="inner")

In [None]:
df = df.with_columns(
    (pl.col("score") > 0).cast(pl.Int8).alias("predicted")
)

In [32]:
print(df.head(50))

shape: (50, 7)
┌────────────┬──────────┬──────────┬──────┬───────┬────────┬───────────┐
│ date       ┆ increase ┆ decrease ┆ diff ┆ score ┆ actual ┆ predicted │
│ ---        ┆ ---      ┆ ---      ┆ ---  ┆ ---   ┆ ---    ┆ ---       │
│ date       ┆ i64      ┆ i64      ┆ i64  ┆ f64   ┆ i8     ┆ i8        │
╞════════════╪══════════╪══════════╪══════╪═══════╪════════╪═══════════╡
│ 2017-09-29 ┆ 0        ┆ 1        ┆ -1   ┆ -1.0  ┆ 1      ┆ 0         │
│ 2017-10-31 ┆ 2        ┆ 0        ┆ 2    ┆ 1.0   ┆ 1      ┆ 1         │
│ 2017-11-12 ┆ 1        ┆ 0        ┆ 1    ┆ 1.0   ┆ 1      ┆ 1         │
│ 2017-11-13 ┆ 0        ┆ 0        ┆ 0    ┆ NaN   ┆ 1      ┆ 1         │
│ 2017-11-14 ┆ 0        ┆ 1        ┆ -1   ┆ -1.0  ┆ 1      ┆ 0         │
│ …          ┆ …        ┆ …        ┆ …    ┆ …     ┆ …      ┆ …         │
│ 2018-04-17 ┆ 1        ┆ 0        ┆ 1    ┆ 1.0   ┆ 1      ┆ 1         │
│ 2018-04-19 ┆ 1        ┆ 0        ┆ 1    ┆ 1.0   ┆ 1      ┆ 1         │
│ 2018-04-24 ┆ 1        ┆ 0        ┆

In [None]:
df_clean = df.filter(
    pl.col("actual").is_not_null() &
    pl.col("predicted").is_not_null()
)

pdf = df_clean.select(["actual", "predicted"]).to_pandas()

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

acc  = accuracy_score(pdf.actual, pdf.predicted)
cm   = confusion_matrix(pdf.actual, pdf.predicted)
prec = precision_score(pdf.actual, pdf.predicted)
rec  = recall_score(pdf.actual, pdf.predicted)

print(f"Accuracy       : {acc:.3f}")
print(f"Confusion mat. :\n{cm}")
print(f"Precision      : {prec:.3f}")
print(f"Recall         : {rec:.3f}")


Accuracy       : 0.502
Confusion mat. :
[[122 667]
 [138 689]]
Precision      : 0.508
Recall         : 0.833
