<a href="https://colab.research.google.com/github/chaitanyavaleti/Cricsheet_Match_DataAnalytics/blob/main/ML_SmartInsurance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient
from sklearn.model_selection import GridSearchCV

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [7]:
train = pd.read_csv("/content/sample_data/train.csv")


In [8]:
train['Policy Start Date'] = pd.to_datetime(train['Policy Start Date'] )

In [9]:
train['Year'] = train['Policy Start Date'].dt.year

In [10]:
train.drop(['id', 'Policy Start Date'], axis=1, inplace=True)


In [11]:
target = "Premium Amount"

Q1 = train["Premium Amount"].quantile(0.25)
Q3 = train["Premium Amount"].quantile(0.75)
IQR = Q3 - Q1

# Define bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [12]:
# Keep only rows within bounds
train = train[(train["Premium Amount"] >= lower_bound) & (train["Premium Amount"] <= upper_bound)]

In [13]:
train.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Year
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,Poor,No,Weekly,House,2869.0,2023
1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,Average,Yes,Monthly,House,1483.0,2023
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,Good,Yes,Weekly,House,567.0,2023
3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,Poor,Yes,Daily,Apartment,765.0,2024
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,Poor,Yes,Weekly,House,2022.0,2021


In [14]:
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler())
])

# Categorical pipeline: impute → one-hot encode
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

X = train.drop(columns=[target])
y = train[target]

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify column types
num_cols = X.select_dtypes(include=["int64","float64"]).columns
cat_cols = X.select_dtypes(exclude=["int64","float64"]).columns

# Combine numeric + categorical
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_cols),
        ("cat", cat_transformer, cat_cols)
    ]
)

In [16]:
training_columns = X.columns.tolist()
pd.Series(training_columns).to_csv("training_columns.csv", index=False)

In [17]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, max_depth=15, n_jobs=1, random_state=42),
    "XGBoost": xgb.XGBRegressor(n_estimators=500,
      learning_rate=0.01,
      max_depth=8,
      subsample=0.8,
      colsample_bytree=1.0,
      random_state=42)
}

In [18]:
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))


In [19]:
mlflow.set_experiment("Insurance_Premium_Prediction_Pipeline")

best_rmse = float("inf")
best_model = None
best_model_name = None
best_run_id = None

all_results = []

for name, model in models.items():
    with mlflow.start_run(run_name=name) as run:

        # Build pipeline with preprocessing + model
        pipeline = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("model", model)
        ])

        # Train
        pipeline.fit(X_train, y_train)

        # Predict
        y_pred = pipeline.predict(X_val)

        # Infer signature
        signature = infer_signature(X_val, y_pred)

        # Take a small sample as input_example
        input_example = X_val.iloc[:5]

        # Cross-validation RMSE
        cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring="neg_root_mean_squared_error")
        rmse_cv = -cv_scores.mean()

        # Metrics
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        rmsle_val = rmsle(y_val, y_pred)

        # Log parameters & metrics
        mlflow.log_param("model", name)
        if hasattr(model, "get_params"):
            mlflow.log_params(model.get_params())

        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("R2", r2)
        mlflow.log_metric("RMSLE", rmsle_val)
        mlflow.log_metric("RMSE_CV", rmse_cv)

        # Log model (IMPORTANT: log pipeline not raw model)
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            name="model",
            signature=signature,
            input_example=input_example,
        )

        all_results.append({
            "Model": name,
            "RMSE": rmse,
            "RMSLE": rmsle_val,
            "MAE": mae,
            "R2": r2,
            "RMSE_CV": rmse_cv,
            "RunID": run.info.run_id
        })

        print(f"{name}: RMSE={rmse:.2f},  RMSLE={rmsle_val:.4f},   R2={r2:.2f}")

        if rmsle_val < best_rmse:
            best_rmse = rmsle_val
            best_model = model
            best_model_name = name
            best_run_id = run.info.run_id



2025/09/20 05:57:48 INFO mlflow.tracking.fluent: Experiment with name 'Insurance_Premium_Prediction_Pipeline' does not exist. Creating a new experiment.


Linear Regression: RMSE=705.05,  RMSLE=1.1300,   R2=0.01




Decision Tree: RMSE=992.07,  RMSLE=1.4618,   R2=-0.97




Random Forest: RMSE=684.01,  RMSLE=1.0910,   R2=0.06




XGBoost: RMSE=684.66,  RMSLE=1.0940,   R2=0.06


In [None]:
if best_run_id:
    model_uri = f"runs:/{best_run_id}/model"
    result = mlflow.register_model(
        model_uri=model_uri, name="InsurancePremiumPrediction"
    )
    print(f"✅ Best model registered: {best_model_name} with RMSE={best_rmse:.2f} and Run ID={best_run_id}")

     # Promote to Production automatically
    client = MlflowClient()

    # Add tags to indicate Production status
    client.set_model_version_tag(
        name="InsurancePremiumPrediction",
        version=result.version,
        key="stage",
        value="Production"
    )

    client.set_model_version_tag(
        name="InsurancePremiumPrediction",
        version=result.version,
        key="rmse",
        value=str(best_rmse)
    )

    client.set_model_version_tag(
        name="InsurancePremiumPrediction",
        version=result.version,
        key="model_name",
        value=best_model_name
    )


    print(f"Model version {result.version} promoted to PRODUCTION ✅")
else:
   print("❌ No model was registered. Check logs.")

Successfully registered model 'InsurancePremiumPrediction'.


✅ Best model registered: Random Forest with RMSE=1.10 and Run ID=7e64b699a0b94a89a9eff43182b1d09f
Model version 1 promoted to PRODUCTION ✅


Created version '1' of model 'InsurancePremiumPrediction'.


In [1]:
pip install mlflow

Collecting mlflow
  Downloading mlflow-3.4.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.4.0 (from mlflow)
  Downloading mlflow_skinny-3.4.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.4.0 (from mlflow)
  Downloading mlflow_tracing-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting fastmcp<3,>=2.0.0 (from mlflow)
  Downloading fastmcp-2.12.3-py3-none-any.whl.metadata (17 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.4.0->mlflow)
  Downloading databricks_sdk-0.65.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-proto<3,>=1.9.0 (from mlflow-skinny==3.4.0->mlflow)
  Downloading opentelemetry_proto-1.37.0-py3-none-any.w