# Assignment 4: Model Monitoring

In [17]:
!pip install evidently




In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

from evidently import Report
from evidently import Report
from evidently.presets import DataDriftPreset, RegressionPreset


## Import Data and Create Model

In [6]:
data = pd.read_csv("cancer_reg.csv", encoding="latin1")
data.head()

Unnamed: 0,avgAnnCount,avgDeathsPerYear,TARGET_deathRate,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,...,PctPrivateCoverageAlone,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate
0,1397.0,469,164.9,489.8,61898,260131,11.2,499.748204,"(61494.5, 125635]",39.3,...,,41.6,32.9,14.0,81.780529,2.594728,4.821857,1.843479,52.856076,6.118831
1,173.0,70,161.3,411.6,48127,43269,18.6,23.111234,"(48021.6, 51046.4]",33.0,...,53.8,43.6,31.1,15.3,89.228509,0.969102,2.246233,3.741352,45.3725,4.333096
2,102.0,50,174.7,349.7,49348,21026,14.6,47.560164,"(48021.6, 51046.4]",45.0,...,43.5,34.9,42.1,21.1,90.92219,0.739673,0.465898,2.747358,54.444868,3.729488
3,427.0,202,194.8,430.4,44243,75882,17.1,342.637253,"(42724.4, 45201]",42.8,...,40.3,35.0,45.3,25.0,91.744686,0.782626,1.161359,1.362643,51.021514,4.603841
4,57.0,26,144.4,350.1,49955,10321,12.5,0.0,"(48021.6, 51046.4]",48.3,...,43.9,35.1,44.0,22.7,94.104024,0.270192,0.66583,0.492135,54.02746,6.796657


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3047 entries, 0 to 3046
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avgAnnCount              3047 non-null   float64
 1   avgDeathsPerYear         3047 non-null   int64  
 2   TARGET_deathRate         3047 non-null   float64
 3   incidenceRate            3047 non-null   float64
 4   medIncome                3047 non-null   int64  
 5   popEst2015               3047 non-null   int64  
 6   povertyPercent           3047 non-null   float64
 7   studyPerCap              3047 non-null   float64
 8   binnedInc                3047 non-null   object 
 9   MedianAge                3047 non-null   float64
 10  MedianAgeMale            3047 non-null   float64
 11  MedianAgeFemale          3047 non-null   float64
 12  Geography                3047 non-null   object 
 13  AvgHouseholdSize         3047 non-null   float64
 14  PercentMarried          

In [8]:
data = data.drop(columns=[col for col in data.columns if data[col].dtype == 'object'])
data = data.dropna()

In [9]:
target_col = "TARGET_deathRate"
X = data.drop(target_col, axis=1)
y = data[target_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [11]:
y_pred = model.predict(X_test)
print("Original MSE:", mean_squared_error(y_test, y_pred))
print("Original R2:", r2_score(y_test, y_pred))

Original MSE: 541.1440000252096
Original R2: 0.2368715088250124


## Evidently Reports and Intitial Model Metrics

In [27]:
from evidently import Report
from evidently.presets import DataDriftPreset
from sklearn.metrics import mean_squared_error, r2_score

# Add predictions and targets
baseline_data = X_train.copy()
baseline_data["target"] = y_train
baseline_data["prediction"] = model.predict(X_train)

test_data = X_test.copy()
test_data["target"] = y_test
test_data["prediction"] = model.predict(X_test)

# Generate the report
report = Report(metrics=[DataDriftPreset()])
report.run(reference_data=baseline_data, current_data=test_data)

# Show the summary metrics
print(report)  # prints summary
print("MSE:", mean_squared_error(y_test, test_data["prediction"]))
print("R2 :", r2_score(y_test, test_data["prediction"]))


<evidently.core.report.Report object at 0x7f62b5cde2d0>
MSE: 541.1440000252096
R2 : 0.2368715088250124


In [30]:
!evidently ui

[32mINFO[0m:     Started server process [[36m5456[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Uvicorn running on [1mhttp://127.0.0.1:8000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Shutting down
[32mINFO[0m:     Waiting for application shutdown.
[32mINFO[0m:     Application shutdown complete.
[32mINFO[0m:     Finished server process [[36m5456[0m]


## Scenario Modifications and Evaluation

In [31]:
def evaluate_scenario(name, X_modified):
    # Add predictions and target
    data = X_modified.copy()
    data["target"] = y_test
    data["prediction"] = model.predict(X_modified)

    # Generate drift report
    report = Report(metrics=[DataDriftPreset()])
    report.run(reference_data=baseline_data, current_data=data)

    # Print results
    print(f"\n=== SCENARIO {name} ===")
    print(report)
    print("MSE:", mean_squared_error(y_test, data["prediction"]))
    print("R2 :", r2_score(y_test, data["prediction"]))


# Scenario A
X_test_A = X_test.copy()
X_test_A["medIncome"] = X_test_A["medIncome"] - 40000

# Scenario A+B
X_test_AB = X_test_A.copy()
X_test_AB["povertyPercent"] = X_test_AB["povertyPercent"] + 20

# Scenario A+B+C (only if column exists)
X_test_ABC = X_test_AB.copy()
if "avgHouseholdSize" in X_test_ABC.columns:
    X_test_ABC["avgHouseholdSize"] = X_test_ABC["avgHouseholdSize"] + 2

# Run evaluations
evaluate_scenario("A", X_test_A)
evaluate_scenario("A+B", X_test_AB)
evaluate_scenario("A+B+C", X_test_ABC)



=== SCENARIO A ===
<evidently.core.report.Report object at 0x7f62b5603490>
MSE: 572.3218530672266
R2 : 0.1929040843521891

=== SCENARIO A+B ===
<evidently.core.report.Report object at 0x7f62b5d48e10>
MSE: 582.0083268571423
R2 : 0.179244089733825

=== SCENARIO A+B+C ===
<evidently.core.report.Report object at 0x7f62b5738150>
MSE: 582.0083268571423
R2 : 0.179244089733825
