In [20]:
from pathlib import Path
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score

import mlflow
import mlflow.sklearn

In [None]:
# The project root is taken to be the parent of the current working directory;
# adjust this if the notebook is launched from a different folder.
base_dir = Path.cwd().parent

# Training data is read from <project root>/data/train.csv.
data_path = base_dir.joinpath("data", "train.csv")
df = pd.read_csv(data_path)

In [14]:
# Load the training data through a project-relative path instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
# Assumes the working directory is one level below the project root
# (the same assumption the earlier `base_dir` cell makes).
df = pd.read_csv(Path.cwd().parent / "data" / "train.csv")

# Preview the first rows to confirm the expected columns are present.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [15]:
# Class balance of the target column: `toxic` is a binary (0/1) variable.
df["toxic"].value_counts()

toxic
0    144277
1     15294
Name: count, dtype: int64

In [16]:
# Fixed seed so the train/test split (and every metric derived from it) is
# identical on each Restart & Run All.
RANDOM_STATE = 42

# Features are the raw comment text; the target is the binary `toxic` label.
X = df["comment_text"]
y = df["toxic"]

# Stratify on the label so train and test keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=RANDOM_STATE
)

# TF-IDF features (vocabulary capped at 10,000 terms) feeding a
# logistic-regression classifier.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000)),
    ('clf', LogisticRegression(max_iter=1000))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# classification_report returns a pre-formatted string, so print() is needed
# to render it as a table.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.96      0.99      0.98     36069
           1       0.92      0.61      0.73      3824

    accuracy                           0.96     39893
   macro avg       0.94      0.80      0.85     39893
weighted avg       0.96      0.96      0.95     39893



In [18]:
# Select the MLflow experiment that subsequent runs are recorded under.
# Left as the cell's last expression so the experiment details are displayed.
mlflow.set_experiment(experiment_name="toxic_comment_moderation")

2025/08/18 20:49:53 INFO mlflow.tracking.fluent: Experiment with name 'toxic_comment_moderation' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/angel/OneDrive/Desktop/DU/Advanced_Topics/Final_Project_Build_Production_Grade_MLOps_System/models/mlruns/829804759602383550', creation_time=1755571793758, experiment_id='829804759602383550', last_update_time=1755571793758, lifecycle_stage='active', name='toxic_comment_moderation', tags={}>

In [21]:
# Train, log and score the model inside a single MLflow run so that the logged
# artifact, parameters and metrics all describe the same fitted pipeline.
with mlflow.start_run():
    pipeline.fit(X_train, y_train)
    mlflow.sklearn.log_model(pipeline, "model")

    # Read hyperparameters from the pipeline steps rather than repeating
    # literals, so the logged values cannot drift from the trained model.
    tfidf_step = pipeline.named_steps["tfidf"]
    clf_step = pipeline.named_steps["clf"]
    mlflow.log_params({
        "vectorizer": "TFIDF",
        "classifier": "LogisticRegression",
        "max_iter": clf_step.max_iter,
        "max_features": tfidf_step.max_features,
    })

    # Predict with the model fitted in this run instead of relying on a
    # `y_pred` left over from an earlier cell, which may be stale.
    run_pred = pipeline.predict(X_test)
    mlflow.log_metrics({
        "accuracy": accuracy_score(y_test, run_pred),
        "f1": f1_score(y_test, run_pred)
    })

