In [1]:
# Word cloud
from collections import Counter
import re

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# nltk.download('stopwords', quiet=True) # run only once
# nltk.download('punkt_tab', quiet=True) # run only once

# General
from tqdm.auto import tqdm
import pandas as pd
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
import warnings

# Run the notebook from the project root so that the relative "data/..." and
# "notebooks/..." paths used by the cells below resolve.
# The root can be overridden with the FINABYSS_ROOT environment variable; when it
# is not set, the original hard-coded location is used.
import os
os.chdir(os.environ.get("FINABYSS_ROOT", "/home/denisalpino/dev/FinABYSS"))

In [2]:
# Silence every warning category for the rest of the notebook (Warning is the base class).
warnings.simplefilter("ignore", Warning)

# Create the output folders for the light and dark image variants.
# makedirs also creates missing parents (os.mkdir raised FileNotFoundError when
# "notebooks/data_preprocessing" was absent) and exist_ok makes re-running the cell safe.
for theme_dir in (
    "notebooks/data_preprocessing/img/light",
    "notebooks/data_preprocessing/img/dark",
):
    os.makedirs(theme_dir, exist_ok=True)

In [3]:
# Load the preprocessed articles, reading only the four columns the charts below use.
df = pd.read_parquet("data/preprocessed/articles.parquet", columns=["title", "source", "assets", "datetime"])

#### **Data Visualization**

#### **Distribution of articles by date (all articles | all after 18.03.2025 | only Nvidia)**

In [4]:
def show_dist_by_dates(ser, title, template="plotly", bin_size_ms=86_400_000) -> go.Figure:
    """Build a histogram figure of timestamps with fixed-width bins.

    Parameters
    ----------
    ser
        Values passed to the histogram as its ``x`` data (the callers in this
        notebook pass a datetime column).
    title
        Figure title text; it is handed to Plotly unchanged, so HTML-like
        markup such as ``<b>...</b>`` may be used.
    template
        Name of the Plotly layout template applied to the figure.
    bin_size_ms
        Width of each histogram bin. On a date axis Plotly interprets the bin
        size in milliseconds, so the default of 86 400 000 is one day.

    Returns
    -------
    go.Figure
        The configured figure; nothing is displayed or written here.
    """
    fig = go.Figure(
        data=[go.Histogram(
            x=ser,
            xbins=dict(
                size=bin_size_ms,
            )
        )]
    )
    fig.update_layout(
        title=dict(text=title, font=dict(size=26)),
        title_x=0.5,  # centre the title horizontally
        xaxis=dict(
            title="<b>Date</b>",
            title_font=dict(size=22),
            tickfont=dict(size=18),
        ),
        yaxis=dict(
            title="<b>Quantity</b>",
            title_font=dict(size=22),
            tickfont=dict(size=18),
            showgrid=False
        ),
        bargap=0.2,
        template=template,
        height=600
    )
    return fig

##### **All articles**

In [5]:
# Histogram over every article, exported twice: first with the function's
# default template, then again after switching the figure to the dark template.
fig = show_dist_by_dates(
    ser=df["datetime"],
    title="<b>Distribution of articles by dates</b>",
)
fig.write_image(
    "notebooks/data_preprocessing/img/light/articles_dist_by_dates.png",
    width=2000,
    height=700,
)
fig.update_layout(template="plotly_dark")
fig.write_image(
    "notebooks/data_preprocessing/img/dark/articles_dist_by_dates.png",
    width=2000,
    height=700,
)

##### **All articles after 18.03.2025**

In [6]:
# Keep only the rows whose timestamp is on or after 2025-03-19 00:00 UTC,
# then export the histogram in the light and dark variants.
tail_mask = df["datetime"] >= "2025-03-19 00:00:00+00:00"
fig = show_dist_by_dates(
    df.loc[tail_mask, "datetime"],
    title="<b>Distribution of articles by dates (tail)</b>",
)
fig.write_image(
    "notebooks/data_preprocessing/img/light/articles_dist_by_dates_tail.png",
    width=2000,
    height=700,
)
fig.update_layout(template="plotly_dark")
fig.write_image(
    "notebooks/data_preprocessing/img/dark/articles_dist_by_dates_tail.png",
    width=2000,
    height=700,
)

##### **Only Nvidia**

In [7]:
# Select the articles whose `assets` entry contains the "NVDA" ticker,
# then export the histogram in the light and dark variants.
nvda_mask = df["assets"].apply(lambda tickers: "NVDA" in tickers)
fig = show_dist_by_dates(
    df.loc[nvda_mask, "datetime"],
    "<b>Distribution of articles related to Nvidia by dates</b>",
)
fig.write_image(
    "notebooks/data_preprocessing/img/light/articles_dist_by_dates_nvidia.png",
    width=2000,
    height=700,
)
fig.update_layout(template="plotly_dark")
fig.write_image(
    "notebooks/data_preprocessing/img/dark/articles_dist_by_dates_nvidia.png",
    width=2000,
    height=700,
)

##### **Popular assets**

In [8]:
# Flatten the per-article ticker collections into one Series and count mentions.
# NOTE(review): relies on every `assets` entry exposing `.size` (e.g. a NumPy array) — confirm.
assets = pd.Series(
    [
        asset
        for article_assets in df.assets.to_numpy()
        if article_assets.size
        for asset in article_assets
    ]
).value_counts()

# value_counts() sorts by frequency, descending, so the head is the top 30.
top_assets = assets.head(30)

fig = px.bar(
    x=top_assets.index,
    y=top_assets.values,
    text=top_assets.values,  # show the counts as bar labels
    template="plotly",
    height=600
)
fig.update_traces(
    texttemplate='%{text}',
    textposition='outside',
    textfont_size=18
)
fig.update_layout(
    # The closing tag was "<b>" instead of "</b>", leaving the bold markup unterminated.
    title=dict(text="<b>Top-30 most popular tickers</b>", font=dict(size=24)),
    title_x=0.5,
    xaxis=dict(
        title="<b>Ticker</b>",
        title_font=dict(size=22),
        tickfont=dict(size=18),
        tickangle=45
    ),
    yaxis=dict(
        title="<b>Frequency</b>",
        title_font=dict(size=22),
        tickfont=dict(size=18),
        showgrid=False
    )
)
# Export the light variant, then switch the template and export the dark one.
fig.write_image("notebooks/data_preprocessing/img/light/top30_tickers.png", height=700, width=2000)
fig.update_layout(template="plotly_dark")
fig.write_image("notebooks/data_preprocessing/img/dark/top30_tickers.png", height=700, width=2000)

##### **Popular sources**

In [9]:
# Count articles per source; value_counts() sorts by frequency, descending.
source = df.source.value_counts()
top_sources = source.head(30)

fig = px.bar(
    x=top_sources.index,
    y=top_sources.values,
    text=top_sources.values,  # show the counts as bar labels
    template="plotly",
    height=600
)
fig.update_traces(
    texttemplate='%{text}',
    textposition='outside',
    textfont_size=18
)
fig.update_layout(
    # The closing tag was "<b>" instead of "</b>", leaving the bold markup unterminated.
    title=dict(text="<b>Top-30 most popular sources of articles</b>", font=dict(size=24)),
    title_x=0.5,
    xaxis=dict(
        title="<b>Source</b>",
        title_font=dict(size=22),
        tickfont=dict(size=18),
        tickangle=45
    ),
    yaxis=dict(
        title="<b>Frequency</b>",
        title_font=dict(size=22),
        tickfont=dict(size=18),
        showgrid=False
    )
)
# Export the light variant, then switch the template and export the dark one.
fig.write_image("notebooks/data_preprocessing/img/light/top30_sources.png", height=700, width=2000)
fig.update_layout(template="plotly_dark")
fig.write_image("notebooks/data_preprocessing/img/dark/top30_sources.png", height=700, width=2000)

#### **Word Cloud**

##### **Load all text**

In [11]:
# Disabled alternative: read the `text` column of the raw dataset with pandas.
# NOTE(review): the trailing "x | y GB" figures look like the author's size measurements — confirm what they refer to.
# df = pd.read_parquet("data/raw/articles.parquet", columns=["text"], use_pyarrow=True) # 15,4 | 8,6 GB
# Rebinds `df` (a pandas frame in the charting cells) to a polars frame that holds
# only the `text` column, capped at 250,000 rows via n_rows.
df = pl.read_parquet("data/preprocessed/articles.parquet", columns=["text"], n_rows=250_000) # 5,9 | 2,0 GB

##### **Prepare variables**

In [17]:
# Settings for the word-cloud pipeline.
LANG = "english"   # name of the NLTK stop-word list to load
MIN_WORD_LEN = 4   # minimum token length kept by the text processing step
TOP_WORDS = 200    # word limit used when counting and when drawing the cloud

# NLTK stop words for LANG plus URL fragments and year tokens.
stop_words = {
    "www", "com", "http", "https",
    "2022", "2023", "2024", "2025",
}.union(stopwords.words(LANG))

# Three-colour map used to paint the words of the cloud.
plotly_dark_palette = mcolors.ListedColormap(
    ["#2e91ff", "#da16ff", "#eb663b"]
)

##### **Process text using batches**

In [18]:
def process_text(text: str) -> list[str]:
    """Turn one article into the list of tokens counted for the word cloud.

    The text is lower-cased, punctuation is replaced by spaces, the result is
    tokenized with ``word_tokenize``, and tokens that are shorter than the
    module-level ``MIN_WORD_LEN`` or present in the module-level
    ``stop_words`` set are discarded.

    Parameters
    ----------
    text
        Raw article text; ``None`` or an empty string is accepted.

    Returns
    -------
    list[str]
        The surviving tokens in their original order (duplicates kept), or an
        empty list for empty input.
    """
    if not text:
        return []
    # Replace punctuation with a space instead of deleting it. Deletion glued
    # neighbouring pieces together ("https://www.site.com" -> "httpswwwsitecom",
    # "company's" -> "companys"), so the URL stop words could never match and
    # fused junk tokens reached the counts.
    text = re.sub(r"[^\w\s]", " ", text.lower())
    tokens = word_tokenize(text)
    return [
        word for word in tokens
        if len(word) >= MIN_WORD_LEN
        and word not in stop_words
    ]

In [19]:
batch_size = 25_000
total_counts = Counter()

# Ceiling division: number of slices iter_slices will yield, so the progress
# bar can show a total instead of an open-ended counter.
n_batches = -(-df.height // batch_size)

for batch in tqdm(df.iter_slices(n_rows=batch_size), total=n_batches, desc="Text batches"):
    batch_frame = (
        batch.lazy()
        # select (rather than with_columns) so the full article text is not
        # repeated for every exploded word.
        .select(
            pl.col("text")
            .map_elements(process_text, return_dtype=pl.List(pl.Utf8))
            .alias("words")
        )
        .explode("words")
        # Exploding an empty token list yields a null row; drop it.
        .filter(pl.col("words").is_not_null())
        .group_by("words")
        .agg(pl.count().alias("count"))
        .collect()
    )
    # Merge the complete per-batch counts. Truncating each batch to its own
    # top TOP_WORDS before merging dropped a word's occurrences in every batch
    # where it fell outside that batch's top list, which skewed the totals.
    # The cloud itself still limits the display via max_words.
    batch_counts = dict(batch_frame.iter_rows())  # rows are (word, count) pairs
    total_counts.update(batch_counts)

Text batches: 0it [00:00, ?it/s]

##### **Dark version**

In [24]:
# Dark variant: build the cloud from the merged word frequencies.
wordcloud = WordCloud(
    background_color="#1a1a1a",
    colormap=plotly_dark_palette,
    width=2000,
    height=1000,
    max_words=TOP_WORDS,
    collocations=False,
)
wordcloud = wordcloud.generate_from_frequencies(total_counts)

# Draw the cloud without axes and save it with a matching dark background.
plt.figure(facecolor="#1a1a1a", figsize=(20, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig(
    "notebooks/data_preprocessing/img/dark/wordcloud.png",
    dpi=300,
    facecolor="#1a1a1a",
    edgecolor="none",
    bbox_inches="tight",
    pad_inches=0,
)
plt.close()

##### **Light version**

In [23]:
# Light variant: same cloud settings as the dark one, but on a white background.
wordcloud = WordCloud(
    width=2000,
    height=1000,
    background_color='white',
    colormap=plotly_dark_palette,
    collocations=False,
    max_words=TOP_WORDS,
).generate_from_frequencies(total_counts)

plt.figure(figsize=(20, 10), facecolor='white')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig(
    "notebooks/data_preprocessing/img/light/wordcloud.png",
    bbox_inches='tight',
    pad_inches=0,
    edgecolor='none',
    dpi=300,
    # Was '#1a1a1a' (copied from the dark cell); the saved figure background
    # must match the white figure and cloud background.
    facecolor='white'
)
plt.close()