Finance BERT already handles tokenization, so we don't need to lowercase the text; we only collapse and trim whitespace and remove non-printable characters.

In [17]:
import os
from dotenv import load_dotenv

# Pull settings from a local .env file (if one exists) into the environment.
load_dotenv()

# Data locations; each can be overridden through an environment variable of
# the same name, otherwise the default on the right is used.
RAW_DATA_FOLDER = os.environ.get('RAW_DATA_FOLDER', '/data/raw')
CLEAN_DATA_FOLDER = os.environ.get('CLEAN_DATA_FOLDER', '/data/clean')

In [18]:
import polars as pl
from typing import Optional
import re

def light_normalize_text(text: Optional[str]) -> Optional[str]:
    """
    Normalize text by collapsing whitespace and removing non-printable characters.

    Steps, in order:
      1. Every run of whitespace (spaces, tabs, newlines, ...) becomes one space.
      2. Characters for which ``str.isprintable()`` is false are dropped.
      3. Runs of spaces left behind by step 2 are collapsed again.
      4. Leading and trailing whitespace is trimmed.

    Args:
        text: The string to normalize. ``None`` and ``""`` are returned unchanged.

    Returns:
        The normalized string, or the input itself when it is ``None`` or empty.
    """

    if not text:
        return text

    # Collapse every whitespace run into a single space. This must happen
    # before the non-printable filter: tabs and newlines are themselves
    # non-printable, and converting them to spaces first keeps the words
    # around them separated.
    text = re.sub(r'\s+', ' ', text)

    # Remove non-printable characters (control codes, zero-width marks, ...).
    text = ''.join(c for c in text if c.isprintable())

    # A removed character that sat between two spaces ("a \x00 b") leaves a
    # double space behind, so collapse once more.
    text = re.sub(r' {2,}', ' ', text)

    # Trim leading and trailing whitespace.
    return text.strip()

# Load the cleaned news dataset from the clean-data folder.
cryptonews_df = pl.read_parquet(
    f"{CLEAN_DATA_FOLDER}/cryptopanic_news_clean.parquet"
)

# Preview the two free-text columns before normalization.
cryptonews_df.select(["title", "description"]).head(5)

title,description
str,str
"""Crypto Investor Makes Over 3,6…","""An investor has made an incred…"
"""Cardano’s 10% jump sparks hope…","""After a quiet December, 2025 s…"
"""Someone Paid 93 ETH In Fees Fo…","""Data shows someone has today p…"
"""‘Critical’ Polygon bug put $24…","""Polygon was at risk of losing …"
"""Charles Hoskinson reveals ‘con…","""The cryptocurrency market may …"


In [19]:
# Run the light normalizer over both free-text columns; with_columns replaces
# those two columns and carries the remaining columns over untouched.
cryptonews_light_normalized_df = cryptonews_df.with_columns(
    [
        pl.col(column_name).map_elements(light_normalize_text, return_dtype=pl.String)
        for column_name in ("title", "description")
    ]
)

# Preview the same two columns after normalization.
cryptonews_light_normalized_df.select(["title", "description"]).head(5)

title,description
str,str
"""Crypto Investor Makes Over 3,6…","""An investor has made an incred…"
"""Cardano’s 10% jump sparks hope…","""After a quiet December, 2025 s…"
"""Someone Paid 93 ETH In Fees Fo…","""Data shows someone has today p…"
"""‘Critical’ Polygon bug put $24…","""Polygon was at risk of losing …"
"""Charles Hoskinson reveals ‘con…","""The cryptocurrency market may …"


In [20]:
# Persist the normalized frame in the clean-data folder under its own file
# name, so the un-normalized parquet read earlier is not overwritten.
cryptonews_light_normalized_df.write_parquet(f"{CLEAN_DATA_FOLDER}/cryptopanic_news_light_normalized.parquet")