In [None]:
# Initialize Git first if this directory is not already a repository
# (left disabled here; uncomment the next line to run it)
#!git init

# Initialize DVC in the current working directory
!dvc init

In [None]:
# Put the raw training CSV under DVC tracking
!dvc add data/raw/train.csv

In [None]:
!git add --all
!git commit -m "First commit with setup and DVC files"
#!dvc push -r origin

In [None]:
# Publish the first commit to the Git remote
!git push

In [None]:
# Run the data preparation script (the following cell tracks
# data/prepared/train.csv and data/prepared/test.csv, presumably its outputs)
!python src/prepare.py

In [None]:
!dvc add data/prepared/train.csv data/prepared/test.csv
!git add --all
!git commit -m "Created train and test CSV files"

In [None]:
# Run the training script (the following cell tracks model/model.joblib,
# presumably the artifact it writes)
!python src/train.py

In [None]:
!dvc add model/model.joblib
!git add --all
!git commit -m "First training"

In [None]:
# Run the evaluation script
!python src/evaluate.py

In [None]:
!git add --all
!git commit -m "Evaluate the model accuracy"

In [None]:
# Publish the commits made so far to the Git remote
!git push
# Uploading the DVC-tracked data to the remote named "origin" is left disabled
#!dvc push -r origin

In [None]:
# Start a new branch for the pipeline-based workflow
!git checkout -b reproducible-pipeline
# Drop the per-file .dvc tracking for the prepared splits and the model;
# the same paths are declared as stage outputs by the `dvc stage add` cell below
!dvc remove data/prepared/train.csv.dvc
!dvc remove data/prepared/test.csv.dvc
!dvc remove model/model.joblib.dvc

In [None]:
# Prepare stage
!dvc stage add -n prepare \
  -d src/prepare.py -d data/raw \
  -o data/prepared/train.csv -o data/prepared/test.csv \
  python src/prepare.py

# Train stage
!dvc stage add -n train \
  -d src/train.py \
  -d data/prepared/train.csv \
  -p train.model,train.C,train.max_iter,train.n_estimators \
  -o model/model.joblib \
  python src/train.py

# Evaluate stage
!dvc stage add -n evaluate \
  -d src/evaluate.py -d model/model.joblib \
  -M metrics/accuracy.json \
  python src/evaluate.py

In [None]:
!git add dvc.yaml
!git commit -m "Added DVC pipeline"

In [None]:
# First push of this branch: create it on `origin` and set it as upstream
!git push --set-upstream origin reproducible-pipeline

In [None]:
# Branch off for the random-forest experiment
!git checkout -b random-forest

In [None]:
# Run the pipeline stages declared in dvc.yaml
!dvc repro

In [None]:
# Display the pipeline metrics (metrics/accuracy.json is the file the
# evaluate stage declares with -M)
!dvc metrics show

In [None]:
!git add dvc.yaml dvc.lock metrics/accuracy.json params.yaml
!git commit -m "Try random forest with n_estimators: 100"
!git push --set-upstream origin random-forest

In [None]:
# Branch off for the logistic-regression experiment. No checkout happens
# in between, so this branch starts from the current random-forest branch.
!git checkout -b logistic-regression

In [None]:
from pathlib import Path
import yaml

# Read the current pipeline parameters
params_path = Path("params.yaml")
params = yaml.safe_load(params_path.read_text())

# Select logistic regression and set its hyperparameters in a single update
# (keyword order keeps the same key insertion order: model, C, max_iter)
params["train"].update(model="logistic", C=1.0, max_iter=100)

# Serialize without re-sorting keys, then overwrite params.yaml
updated_yaml = yaml.dump(params, sort_keys=False)
params_path.write_text(updated_yaml)

print("Updated params.yaml to use Logistic Regression")

In [None]:
# Ask DVC which parts of the pipeline are out of date after the
# params.yaml edit above
!dvc status

In [None]:
# Reproduce the pipeline up to the `evaluate` stage; with the train.*
# parameters changed this is expected to re-run training and then evaluation
!dvc repro evaluate

In [None]:
# Show metrics for the workspace and for all Git tags (-T).
# NOTE(review): the visible cells create branches but no tags, so this likely
# prints only the workspace metrics. If the goal is to compare the experiment
# branches, -a/--all-branches may be the intended flag — confirm.
!dvc metrics show -T

In [None]:
!git add dvc.yaml dvc.lock metrics/accuracy.json params.yaml
!git commit -m "Try logistic regression"
!git push --set-upstream origin logistic-regression

In [None]:
# Print the dependency graph of the pipeline stages
!dvc dag