Setup

In [None]:
!pip install pandas matplotlib scikit-learn mlflow

In [None]:
import pandas as pd
import os, json

In [None]:
# Load the salary dataset used by the EDA cells of this notebook.
df = pd.read_csv("./data/salary.csv")
# A bare last expression is rendered with Jupyter's rich table display
# instead of the plain-text dump that print() produces.
df.head()

Exploratory Data Analysis (EDA)

In [None]:
import matplotlib.pyplot as plt
from pathlib import Path

# All EDA outputs are collected under <artifacts>/eda; make sure it exists.
ARTIFACTS_DIR = Path("cap_project_assets/artifacts")
eda_dir = ARTIFACTS_DIR.joinpath("eda")
eda_dir.mkdir(parents=True, exist_ok=True)

# JSON-serialisable snapshot of the dataset: row count, column names and
# the describe() statistics (round-tripped through JSON to get plain dicts).
summary = dict(
    rows=int(df.shape[0]),
    columns=list(df.columns),
    describe=json.loads(df.describe().to_json()),
)

eda_dir.joinpath("summary.json").write_text(json.dumps(summary, indent=2))

# Scatter plot of salary against years of experience.
fig, ax = plt.subplots()
ax.scatter(df["YearsExperience"], df["Salary"])
ax.set_xlabel("Years Experience")
ax.set_ylabel("Salary")
ax.set_title("Salary vs YOE")


Data Preprocessing, MLflow Tracking, and Model Training

In [None]:
# Print the EDA summary dict (rows, columns, describe statistics) as plain text.
print(summary)

In [None]:

# Write the summary dict to <eda_dir>/summary.json as indented JSON,
# overwriting the file if it already exists.
(eda_dir / "summary.json").write_text(json.dumps(summary, indent=2))

In [None]:
import matplotlib.pyplot as plt
import mlflow

# Track the EDA artifacts in a local, file-based MLflow store.
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("capstone_project_demo")

# The context manager ends the run on exit (also when an exception is
# raised), so no explicit mlflow.end_run() is needed after the block.
with mlflow.start_run(run_name="eda") as run:
    # Explicit figure/axes handles keep this plot independent of whichever
    # figure pyplot currently treats as "active".
    fig, ax = plt.subplots()
    ax.scatter(df["YearsExperience"], df["Salary"])
    ax.set_xlabel("Years Experience")
    ax.set_ylabel("Salary")
    ax.set_title("Salary vs YOE")

    scatter_path = eda_dir / "scatter.png"
    print(scatter_path)
    fig.savefig(scatter_path, bbox_inches="tight")
    plt.close(fig)  # release the figure once it has been written to disk

    # Attach both EDA outputs to the run under the "eda" artifact folder.
    mlflow.log_artifact(str(eda_dir / "summary.json"), artifact_path="eda")
    mlflow.log_artifact(str(scatter_path), artifact_path="eda")



In [None]:
import os
from pathlib import Path
import pandas as pd
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Runtime configuration. Each setting is read from the environment variable
# named in the lookup; the second argument is the fallback used when that
# variable is not set.
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "file:./mlruns")
MLFLOW_EXPERIMENT_NAME = os.environ.get("MLFLOW_EXPERIMENT_NAME", "salary-regression")
DATA_PATH = os.environ.get("DATA_PATH", "data/salary.csv")
# Numeric settings arrive as strings and are converted here.
TEST_SIZE = float(os.environ.get("TEST_SIZE", "0.2"))
RANDOM_STATE = int(os.environ.get("RANDOM_STATE", "42"))
# Registration is enabled only by the (case-insensitive) value "true".
REGISTER = os.environ.get("REGISTER_MODEL", "false").lower() == "true"
REGISTERED_NAME = os.environ.get("REGISTERED_MODEL_NAME", "salary-model")

def eval_split(model, Xs, ys, prefix="test"):
    """Score a fitted model on one data split and log the metrics to MLflow.

    Args:
        model: Fitted estimator exposing ``predict``.
        Xs: Features of the split to score.
        ys: True target values of the split.
        prefix: Label prepended to each logged metric name, e.g. ``"train"``
            yields ``train_mae``, ``train_rmse`` and ``train_r2``.

    Returns:
        Dict with the keys ``"mae"``, ``"rmse"`` and ``"r2"``.
    """
    preds = model.predict(Xs)
    mae = mean_absolute_error(ys, preds)
    # mean_squared_error returns the *squared* error; take the root so the
    # value actually is the RMSE its name promises.
    rmse = mean_squared_error(ys, preds) ** 0.5
    r2 = r2_score(ys, preds)
    metrics = {"mae": mae, "rmse": rmse, "r2": r2}
    mlflow.log_metrics({f"{prefix}_{key}": value for key, value in metrics.items()})
    return metrics

def train_and_log(name, model, X_train, X_test, y_train, y_test):
    """Fit one candidate model inside its own MLflow run and log the results.

    The run is named ``train-<name>``. It records the split settings, a
    short whitelist of the estimator's hyper-parameters, train and test
    metrics (via ``eval_split``), and the fitted model itself.

    Args:
        name: Short identifier of the candidate; used in the run name and
            in the model's artifact path.
        model: Unfitted estimator; it is fitted in place on the train split.
        X_train, X_test: Feature frames of the train and test splits.
        y_train, y_test: Matching targets.

    Returns:
        Dict with ``"name"``, ``"run_id"``, ``"model_uri"`` and ``"metrics"``
        (the metrics computed on the test split).
    """
    # Only these hyper-parameters are recorded, to keep the logs small.
    tracked = (
        "n_estimators",
        "max_depth",
        "min_samples_split",
        "min_samples_leaf",
        "random_state",
        "criterion",
    )

    with mlflow.start_run(run_name=f"train-{name}") as run:
        mlflow.log_params({"model_name": name, "test_size": TEST_SIZE, "random_state": RANDOM_STATE})

        if hasattr(model, "get_params"):
            hyper = {
                f"hp_{key}": value
                for key, value in model.get_params().items()
                if key in tracked
            }
            if hyper:
                mlflow.log_params(hyper)

        model.fit(X_train, y_train)
        eval_split(model, X_train, y_train, "train")
        held_out_scores = eval_split(model, X_test, y_test, "test")

        model_info = mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path=f"model_{name}",
            signature=infer_signature(X_train, model.predict(X_train)),
            input_example=X_train.head(3),
            registered_model_name=REGISTERED_NAME if REGISTER else None,
        )
        return dict(
            name=name,
            run_id=run.info.run_id,
            model_uri=model_info.model_uri,
            metrics=held_out_scores,
        )

def main():
    """Train every candidate regressor, pick the best one and export it.

    Steps:
      1. Point MLflow at the configured tracking URI and experiment.
      2. Read the CSV at ``DATA_PATH`` and split ``YearsExperience`` ->
         ``Salary`` into train and test sets.
      3. Train and log each candidate with ``train_and_log``.
      4. Select the candidate with the lowest test ``"rmse"`` metric.
      5. In a separate ``select-best-<name>`` run, reload that model, save
         it to ``artifacts/model`` and log it again as ``model_best``.
    """
    import shutil  # local import: only needed for the export step below

    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

    df = pd.read_csv(DATA_PATH)
    X = df[["YearsExperience"]]
    y = df["Salary"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

    models = {
        "linear": Pipeline([("scaler", StandardScaler()), ("regressor", LinearRegression())]),
        "decision_tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
        "random_forest": RandomForestRegressor(n_estimators=200, random_state=RANDOM_STATE),
    }

    results = [
        train_and_log(name, mdl, X_train, X_test, y_train, y_test)
        for name, mdl in models.items()
    ]

    # min() returns the first minimal entry, i.e. the same winner a stable
    # sort followed by [0] would pick, without sorting the whole list.
    best = min(results, key=lambda r: r["metrics"]["rmse"])

    with mlflow.start_run(run_name=f"select-best-{best['name']}"):
        mlflow.log_params({"selected_model": best["name"]})
        best_model = mlflow.sklearn.load_model(best["model_uri"])
        out_dir = Path("artifacts/model")
        out_dir.parent.mkdir(parents=True, exist_ok=True)
        # save_model refuses to write into an existing, non-empty directory,
        # so drop the export of a previous execution first; otherwise
        # re-running this cell fails here.
        if out_dir.is_dir():
            shutil.rmtree(out_dir)
        mlflow.sklearn.save_model(best_model, path=str(out_dir))
        mlflow.sklearn.log_model(
            sk_model=best_model,
            artifact_path="model_best",
            signature=infer_signature(X_train, best_model.predict(X_train)),
            input_example=X_train.head(3),
            registered_model_name=(REGISTERED_NAME if REGISTER else None)
        )
        print("BEST:", best)
        print("Saved best model to:", out_dir)
        print("Logged best model at artifact path: model_best")

# Run the training pipeline only when this code executes as the main
# program (not when it is imported as a module).
if __name__ == "__main__":
    main()

# End the currently active MLflow run, if there is one.
mlflow.end_run()