In [2]:
# src/train.py
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import mlflow
import mlflow.sklearn

# Locate the training CSV relative to the parent of the current working
# directory. NOTE: this depends on where the kernel was started; adjust
# `base_dir` if the notebook is launched from a different folder.
base_dir = Path.cwd().parent
data_path = base_dir.joinpath("data", "train.csv")

# Read the whole training set into a DataFrame; later cells select columns from `df`.
df = pd.read_csv(data_path)


In [3]:
# Tunable settings, defined once so the values used to build the model and the
# values logged to MLflow cannot drift apart.
SEED = 42            # fixes the train/test split so reruns are reproducible
TEST_SIZE = 0.25     # same fraction train_test_split uses when no size is given
MAX_FEATURES = 10000
MAX_ITER = 1000

# Features are the raw comment strings; the target is the "toxic" column.
# NOTE(review): assumes "toxic" holds binary labels, since f1_score below uses
# its default binary averaging — confirm against the dataset.
X = df["comment_text"]
y = df["toxic"]

# Stratify on the label to keep class proportions equal in both partitions;
# random_state pins the shuffle so logged metrics are comparable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=SEED,
)

# Create model: TF-IDF features feeding a logistic-regression classifier.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=MAX_FEATURES)),
    ('clf', LogisticRegression(max_iter=MAX_ITER))
])

# Track experiment: everything inside the `with` block is recorded under a
# single MLflow run in the "toxicity-moderation" experiment.
mlflow.set_experiment("toxicity-moderation")
with mlflow.start_run():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Log the configuration that produced this run, including the split
    # settings, so the run can be reproduced from the tracking record alone.
    mlflow.log_params({
        "vectorizer": "tfidf",
        "classifier": "logreg",
        "max_features": MAX_FEATURES,
        "max_iter": MAX_ITER,
        "test_size": TEST_SIZE,
        "random_state": SEED
    })

    # Hold-out metrics computed on the test partition.
    mlflow.log_metrics({
        "accuracy": accuracy_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred)
    })

    # Persist the fitted pipeline (vectorizer + classifier) with the run.
    mlflow.sklearn.log_model(pipeline, "model")

2025/08/19 19:54:11 INFO mlflow.tracking.fluent: Experiment with name 'toxicity-moderation' does not exist. Creating a new experiment.
