In [None]:
import ast
import logging
import kagglehub
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s][%(levelname)s][%(name)s]: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger(__name__)

In [None]:
load_dotenv()
RANDOM_STATE = 2026
REGISTERED_MODEL_NAME = "TFIDF_Logistic_Regression"

param_grid = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "tfidf__max_df": [0.8, 0.9],
    "tfidf__min_df": [5, 10],
    "clf__max_iter": [100, 500]
}

X_train, X_test, y_train, y_test = data_preparation(
    dataset="oliviervha/crypto-news",
    file_name="cryptonews.csv",
    random_state=RANDOM_STATE
)

[2026-02-02 02:48:46][INFO][data]: File 'cryptonews.csv' has been loaded with shape (31037, 7)
[2026-02-02 02:48:46][INFO][data]: Class distribution (count):
class
positive    13964
neutral     10554
negative     6517
[2026-02-02 02:48:46][INFO][data]: Class distribution (ratio):
class
positive    0.449944
neutral     0.340068
negative    0.209989
[2026-02-02 02:48:47][INFO][data]: Data preparation is complete


In [None]:
type(X_test)

26653    EOS Launches EVM Support in Attempt to Revital...
14727    Bitcoin logo imperfection found on original ar...
21816    Robinhood lands steep 60% discount on $170M ex...
30638    Bitcoin, Ethereum & Crypto Co See Mixed Perfor...
27560    WisdomTree Launches Three New Crypto ETPs Trac...
                               ...                        
1602     Sushi to test Bitcoin swaps and Opyn DeFi prot...
18330    Bitcoin miner Canaan scales operations despite...
3381     Australia open to idea of CBDC as future of mo...
15434    Polkadot, Kusama Witness Notable Surge in Deve...
1058     KyberSwap’s $47 Million Hacker to Propose Deal...
Name: title_text, Length: 23276, dtype: object

In [7]:
pd.DataFrame({"title_text": X_test})

Unnamed: 0,title_text
1815,Opyn DeFi protocol founders are leaving crypto...
6704,Ethereum Reddit Token MOONS Soars on Kraken Af...
28416,Satoshi-Era Bitcoin Address With $20 Million W...
20538,Are there too many cryptocurrencies?. Are ther...
15155,Crypto Surges as Fed Recognizes Disinflation -...
...,...
11209,Over 75% of Daily Bitcoin On-Chain Transaction...
15834,Polygon enlists Xternity to migrate multiplaye...
9928,Is it Too Late to Buy Cardano? ADA Price Spike...
23574,BlockFi Provides Details on Potential FTX Purc...


In [None]:
model_workflow(
    experiment_name="Sentiment_Logistic_Regression",
    run_name_prefix="logreg_gridsearch",
    Classifier=logistic_regression,
    registered_model_name=REGISTERED_MODEL_NAME,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    param_grid=param_grid,
    random_state=RANDOM_STATE
)

In [9]:
pd.DataFrame({"title_text": ["Bitcoin price surges after ETF approval"]})

Unnamed: 0,title_text
0,Bitcoin price surges after ETF approval


In [10]:
MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI", "http://localhost:5000")
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

print("MLFLOW_TRACKING_URI =", MLFLOW_TRACKING_URI)


MLFLOW_TRACKING_URI = http://localhost:5000


In [11]:
REGISTERED_MODEL_NAME = "TFIDF_Logistic_Regression"
model_uri = f"models:/{REGISTERED_MODEL_NAME}@production"

model = mlflow.pyfunc.load_model(model_uri)
print("Loaded model from:", model_uri)

Loaded model from: models:/TFIDF_Logistic_Regression@production


In [16]:
model.metadata.run_id

'2eeb74021c95428c98c4d595fa4567b2'

In [17]:
text = "Bitcoin price surges after ETF approval"
X = pd.DataFrame({"title_text": [text]})
X

Unnamed: 0,title_text
0,Bitcoin price surges after ETF approval


In [18]:
pred = model.predict(X)
pred

array(['neutral'], dtype=object)

In [20]:
texts = [
    "Bitcoin price surges after ETF approval",
    "Crypto market crashes amid regulatory concerns",
    "Investors remain cautious as volatility continues"
]
X_batch = pd.DataFrame({"title_text": texts})
X_batch

Unnamed: 0,title_text
0,Bitcoin price surges after ETF approval
1,Crypto market crashes amid regulatory concerns
2,Investors remain cautious as volatility continues


In [19]:
preds = model.predict(X_batch)
list(preds)

['neutral']

In [21]:
model

mlflow.pyfunc.loaded_model:
  artifact_path: /Users/wennanshi/VScodeProjects/Text_Sentiment_Classification/mlartifacts/1/models/m-478e18790b6d4b0f86108a4cc02803b3/artifacts
  flavor: mlflow.sklearn
  run_id: 2eeb74021c95428c98c4d595fa4567b2

In [24]:
preds = model.predict(X_batch)

print("type(preds):", type(preds))
print("preds repr:", repr(preds))
print("preds:", preds)

# 尝试取长度 / shape
try:
    print("len(preds):", len(preds))
except Exception as e:
    print("len(preds) error:", e)

print("shape:", getattr(preds, "shape", None))



type(preds): <class 'numpy.ndarray'>
preds repr: array(['neutral'], dtype=object)
preds: ['neutral']
len(preds): 1
shape: (1,)


In [25]:
import mlflow.sklearn

preds_sk = mlflow.sklearn.load_model(model_uri).predict(X_batch["title_text"])
print(preds_sk)
print("len:", len(preds_sk), "shape:", getattr(preds_sk, "shape", None))


['neutral' 'neutral' 'neutral']
len: 3 shape: (3,)


In [49]:
import mlflow.sklearn
import pandas as pd

model_uri = "models:/TFIDF_Logistic_Regression@production"
sk_model = mlflow.sklearn.load_model(model_uri)

texts = [
    "Bitcoin price surges after ETF approval",
    "Crypto market crashes amid regulatory concerns",
    "Investors remain cautious as volatility continues"
]

preds = sk_model.predict(pd.Series(texts))
print(preds, len(preds))


['neutral' 'neutral' 'neutral'] 3


In [50]:
text = "Bitcoin price surges after ETF approval"
[text]

['Bitcoin price surges after ETF approval']

In [56]:
sk_model.predict(pd.Series([text]))

array(['neutral'], dtype=object)

In [39]:
pd.Series(texts)

0              Bitcoin price surges after ETF approval
1       Crypto market crashes amid regulatory concerns
2    Investors remain cautious as volatility continues
dtype: object

In [31]:
py_model = mlflow.pyfunc.load_model("models:/TFIDF_Logistic_Regression@production")

print("signature:", py_model.metadata.signature)
print("run_id:", py_model.metadata.run_id)



signature: inputs: 
  ['title_text': string (required)]
outputs: 
  [Tensor('object', (-1,))]
params: 
  None

run_id: 2eeb74021c95428c98c4d595fa4567b2
