In [1]:
import os
import mlflow
import logging
import pandas as pd
from dotenv import load_dotenv

from data import get_clean_data, data_preparation
from model import logistic_regression
from mlops import model_workflow

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root-logger configuration: INFO and above, rendered as
# "[timestamp][level][logger name]: message".
LOG_FORMAT = "[%(asctime)s][%(levelname)s][%(name)s]: %(message)s"
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)

# Logger named after the current module (__name__) for messages emitted here.
logger = logging.getLogger(__name__)

In [None]:
# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()

# Seed passed to data_preparation below and to the training cell.
RANDOM_STATE = 2026
# Name handed to model_workflow as `registered_model_name`.
REGISTERED_MODEL_NAME = "TFIDF_Logistic_Regression"

# Candidate hyper-parameters, grouped by key prefix.
# NOTE(review): keys look like scikit-learn "<step>__<param>" names — confirm
# against the pipeline built by `logistic_regression`.
tfidf_grid = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "tfidf__max_df": [0.8, 0.9],
    "tfidf__min_df": [5, 10],
}
clf_grid = {"clf__max_iter": [100, 500]}

# Merged in the same key order as the original single literal.
param_grid = {**tfidf_grid, **clf_grid}

# data_preparation returns four objects, unpacked here as the train/test
# features and labels.
splits = data_preparation(
    dataset="oliviervha/crypto-news",
    file_name="cryptonews.csv",
    random_state=RANDOM_STATE,
)
X_train, X_test, y_train, y_test = splits

In [None]:
# Arguments for the training workflow, listed in the order they were
# originally passed: run identification, the classifier, the registry name,
# the data splits, the search grid and the seed.
# NOTE(review): what model_workflow does with these (tracking, registration)
# is defined in `mlops` and not visible here — confirm there.
workflow_kwargs = {
    "experiment_name": "Sentiment_Logistic_Regression",
    "run_name_prefix": "logreg_gridsearch",
    "Classifier": logistic_regression,
    "registered_model_name": REGISTERED_MODEL_NAME,
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
    "param_grid": param_grid,
    "random_state": RANDOM_STATE,
}

# Kept as the last expression so the cell still displays the return value.
model_workflow(**workflow_kwargs)

In [18]:
# Load the cleaned crypto-news frame. In the recorded output the `data` logger
# reports the loaded file's shape and the class distribution (count and ratio).
df = get_clean_data("oliviervha/crypto-news", "cryptonews.csv")

[2026-02-01 13:24:22][INFO][data]: File 'cryptonews.csv' has been loaded with shape (31037, 7)
[2026-02-01 13:24:23][INFO][data]: Class distribution (count):
class
positive    13964
neutral     10554
negative     6517
[2026-02-01 13:24:23][INFO][data]: Class distribution (ratio):
class
positive    0.449944
neutral     0.340068
negative    0.209989


In [19]:
# Show the distinct values of the "source" column.
df["source"].unique()

array(['CryptoNews', 'CoinTelegraph', 'CryptoPotato'], dtype=object)

In [20]:
import pandas as pd
import streamlit as st
import plotly.express as px
from data import get_clean_data

In [26]:
# Cleaned crypto-news frame that supplies the bounds and options for the
# sidebar widgets below.
df = get_clean_data("oliviervha/crypto-news", "cryptonews.csv")

# ---- Sidebar: filters ----
st.sidebar.header("Filters")

# Date picker bounded by the earliest and latest dates in the data; the full
# span is preselected. `.date()` is called on min/max, so df["date"] must be
# datetime-like.
min_date = df["date"].min().date()
max_date = df["date"].max().date()
date_range = st.sidebar.date_input(
    "Date range",
    value=(min_date, max_date),
    min_value=min_date,
    max_value=max_date,
)

# Display label -> frequency code; the labels double as the selectbox options.
# NOTE(review): values look like pandas offset aliases — confirm where
# freq_map is consumed.
freq_map = {"Daily": "D", "Weekly": "W", "Monthly": "ME", "Yearly": "YE"}
freq = st.sidebar.selectbox("Time granularity", options=list(freq_map), index=0)

# Source and subject pickers offer the sorted distinct values of each column;
# no default selection is supplied.
sources = st.sidebar.multiselect(
    label="Source",
    options=sorted(df["source"].unique()),
)
subjects = st.sidebar.multiselect(
    label="Subject",
    options=sorted(df["subject"].unique()),
)

# ---- Sidebar: trends ----
st.sidebar.header("Trends")

# Choices for the trend chart; the first entry is selected initially.
trend_options = [
    "Share (Stacked)",
    "Count (Stacked)",
    "Polarity mean (Line)",
    "Subjectivity mean (Line)",
]
trend_metric = st.sidebar.radio("Trend metric", options=trend_options, index=0)

[2026-02-01 13:25:13][INFO][data]: File 'cryptonews.csv' has been loaded with shape (31037, 7)
[2026-02-01 13:25:14][INFO][data]: Class distribution (count):
class
positive    13964
neutral     10554
negative     6517
[2026-02-01 13:25:14][INFO][data]: Class distribution (ratio):
class
positive    0.449944
neutral     0.340068
negative    0.209989
2026-02-01 13:25:14.722 Session state does not function when running a script without `streamlit run`


In [27]:
# Inspect the current value of the "Source" multiselect. The recorded output
# is an empty list; the widgets were created outside `streamlit run`.
sources

[]