--- 
# Sandbox (Faune)
Tests for each part of this project

---

## Imports

In [1]:
#external libraries
import polars as pl
from datetime import date
import nest_asyncio
import os

#internal libraries
from prompts import process_in_batches, import_data

## Importing data test

In [2]:
# Load the headline table through the project's import_data helper.
# The frame displayed later in this notebook has `date`, `title` and `prompts` columns.
titles = import_data()

## Prompting test

### Basic sentences
Adapted from the Mistral website.

In [3]:
# Trial run on a subset first: keep only headlines dated 2024-01-01 or later
# (described by the author as about half of the dataset).
start_date = date(2024, 1, 1)
titles = titles.filter(pl.col('date') >= start_date)

In [4]:
# Show the frame as the cell output to check the columns and the date range.
titles

date,title,prompts
date,str,str
2024-01-01,"""Orbit Chain Becomes Victim of …","""  Here is a piece of news: …"
2024-01-01,"""Crypto Analyst Says History Sh…","""  Here is a piece of news: …"
2024-01-01,"""4 altcoins to buy under $1 for…","""  Here is a piece of news: …"
2024-01-01,"""Bitcoin Fork Linked to Self-Pr…","""  Here is a piece of news: …"
2024-01-01,"""Cryptocurrency Bitcoin SV's Pr…","""  Here is a piece of news: …"
…,…,…
2025-01-07,"""Bitcoin Drop Triggers $206M Li…","""  Here is a piece of news: …"
2025-01-07,"""Michael Saylor’s Bold Plan: Bu…","""  Here is a piece of news: …"
2025-01-07,"""Bitcoin (BTC) Decoupling From …","""  Here is a piece of news: …"
2025-01-07,"""GoMining Expands to Solana wit…","""  Here is a piece of news: …"


In [5]:
nest_asyncio.apply()
prompts = titles.select('prompts').to_series().to_list()
csv_path = "responses_2024_2025.csv"
first_write = not os.path.exists(csv_path)

async for batch in process_in_batches(prompts, batch_size=200, concurrency=10):
    batch_df = pl.DataFrame({
        "index": [r[0] for r in batch],
        "prompt": [r[1] for r in batch],
        "response": [r[2] for r in batch],
    })

    csv_string = batch_df.write_csv(separator=",", include_header=first_write)
    with open(csv_path, "a", encoding="utf-8") as f:
        f.write(csv_string)

    first_write = False
    print(f"Wrote {len(batch)} rows to {csv_path}")


Processing prompts 0 to 199...
Wrote 200 rows to responses_2024_2025.csv
Processing prompts 200 to 399...
Wrote 200 rows to responses_2024_2025.csv
Processing prompts 400 to 599...
Wrote 200 rows to responses_2024_2025.csv
Processing prompts 600 to 799...
Wrote 200 rows to responses_2024_2025.csv
Processing prompts 800 to 999...
Wrote 200 rows to responses_2024_2025.csv
Processing prompts 1000 to 1199...
Wrote 200 rows to responses_2024_2025.csv
Processing prompts 1200 to 1399...
Wrote 200 rows to responses_2024_2025.csv
Processing prompts 1400 to 1599...
Wrote 200 rows to responses_2024_2025.csv
Processing prompts 1600 to 1799...
Wrote 200 rows to responses_2024_2025.csv
Processing prompts 1800 to 1999...
Wrote 200 rows to responses_2024_2025.csv
Processing prompts 2000 to 2199...
Wrote 200 rows to responses_2024_2025.csv
Processing prompts 2200 to 2399...
Wrote 200 rows to responses_2024_2025.csv
Processing prompts 2400 to 2599...
Wrote 200 rows to responses_2024_2025.csv
Processing 

In [10]:
# Classify every stored response from its first 10 characters only, which is
# where the verdict ("{increase}", "{decrease}") or the "[ERROR]" marker sits.
response_head = pl.col('response').str.slice(0, 10)
response_head_lower = response_head.str.to_lowercase()

responses = (
    pl.read_csv('responses_2024_2025.csv')
    .with_columns(
        # Case-insensitive verdict flags (1 = present, 0 = absent).
        pl.when(response_head_lower.str.contains('increase')).then(1).otherwise(0).alias('increase'),
        pl.when(response_head_lower.str.contains('decrease')).then(1).otherwise(0).alias('decrease'),
        # literal=True is required: as a regex, "[ERROR]" is a character class
        # matching any single 'E', 'R' or 'O', not the literal "[ERROR]" marker.
        pl.when(response_head.str.contains('[ERROR]', literal=True)).then(1).otherwise(0).alias('error'),
    )
)

In [11]:
# Put the classified responses side by side with the headline table and keep
# only the columns needed for the daily analysis.
# NOTE(review): a horizontal concat pairs rows purely by position and the CSV's
# `index` column is not used here; confirm that the CSV rows are in the same
# order and number as `titles` (e.g. no interrupted or repeated runs).
to_analyze = (
    pl.concat([responses, titles], how='horizontal')
    .select('date', 'title', 'increase', 'decrease', 'error')
)

In [19]:
# Daily sentiment score per date:
#   score = (sum increase - sum decrease) / (sum increase + sum decrease)
# NOTE(review): a date with no increase/decrease hits divides 0 by 0; check
# how such a score should be treated downstream.
n_up = pl.col('increase')
n_down = pl.col('decrease')

(
    to_analyze
    .select('date', 'increase', 'decrease')
    .with_columns(diff=n_up - n_down)
    .group_by('date')
    .sum()
    .with_columns(score=pl.col('diff') / (n_up + n_down))
    .sort('date')
)

date,increase,decrease,diff,score
date,i32,i32,i32,f64
2024-01-01,7,3,4,0.4
2024-01-02,10,1,9,0.818182
2024-01-03,7,8,-1,-0.066667
2024-01-04,6,0,6,1.0
2024-01-05,11,2,9,0.692308
…,…,…,…,…
2025-01-03,10,2,8,0.666667
2025-01-04,6,0,6,1.0
2025-01-05,2,1,1,0.333333
2025-01-06,12,1,11,0.846154


In [12]:
# Count error-flagged responses per date, then summarise the distribution of
# those daily counts.
(
    to_analyze
    .select('date', 'error')
    .group_by('date')
    .sum()
    .describe()
)

statistic,date,error
str,str,f64
"""count""","""373""",373.0
"""null_count""","""0""",0.0
"""mean""","""2024-07-05 00:00:00""",14.691689
"""std""",,21.385953
"""min""","""2024-01-01""",0.0
"""25%""","""2024-04-03""",0.0
"""50%""","""2024-07-05""",6.0
"""75%""","""2024-10-06""",24.0
"""max""","""2025-01-07""",174.0


In [9]:
# Peek at the first 10 characters of each stored response to see which
# prefixes occur.
(
    pl.read_csv('responses_2024_2025.csv')
    .select(pl.col('response').str.slice(0, 10))
)

response
str
"""{decrease}"""
"""{increase}"""
"""{increase}"""
"""{increase}"""
"""{uncertain"""
…
"""{decrease}"""
"""{increase:"""
"""[ERROR] AP"""
"""[ERROR] AP"""
