In [None]:
import pandas as pd

from typing import Optional
from typing import List


from evidently import Dataset
from evidently import DataDefinition
from evidently import BinaryClassification, MulticlassClassification, Regression, Recsys
from evidently import Report

from evidently.tests import lte, gte, lt, gt, is_in, not_in, eq, not_eq
from evidently.tests import Reference

from evidently.metrics.group_by import GroupBy
from evidently.generators import ColumnMetricGenerator

from evidently.metrics import ColumnCount
from evidently.metrics import RowCount
from evidently.metrics import DuplicatedRowCount
from evidently.metrics import DuplicatedColumnsCount
from evidently.metrics import EmptyColumnsCount
from evidently.metrics import EmptyRowsCount
from evidently.metrics import DatasetMissingValueCount
from evidently.metrics import AlmostConstantColumnsCount
from evidently.metrics import AlmostDuplicatedColumnsCount
from evidently.metrics import MinValue
from evidently.metrics import MeanValue
from evidently.metrics import MaxValue
from evidently.metrics import MedianValue
from evidently.metrics import StdValue
from evidently.metrics import QuantileValue
from evidently.metrics import CategoryCount
from evidently.metrics import UniqueValueCount
from evidently.metrics import MissingValueCount
from evidently.metrics import InRangeValueCount
from evidently.metrics import OutRangeValueCount
from evidently.metrics import InListValueCount
from evidently.metrics import OutListValueCount
from evidently.metrics import DriftedColumnsCount
from evidently.metrics import ValueDrift
from evidently.metrics import F1Score
from evidently.metrics import Accuracy
from evidently.metrics import Precision
from evidently.metrics import Recall
from evidently.metrics import TPR
from evidently.metrics import TNR
from evidently.metrics import FPR
from evidently.metrics import FNR
from evidently.metrics import LogLoss
from evidently.metrics import RocAuc
from evidently.metrics import F1ByLabel
from evidently.metrics import PrecisionByLabel
from evidently.metrics import RecallByLabel
from evidently.metrics import RocAucByLabel
from evidently.metrics import DummyF1Score
from evidently.metrics import DummyPrecision
from evidently.metrics import DummyRecall
from evidently.metrics import MeanError
from evidently.metrics import MAE
from evidently.metrics import MAPE
from evidently.metrics import RMSE
from evidently.metrics import R2Score
from evidently.metrics import AbsMaxError
from evidently.metrics import DummyMAE
from evidently.metrics import DummyMAPE
from evidently.metrics import DummyRMSE
from evidently.metrics import PrecisionTopK
from evidently.metrics import RecallTopK
from evidently.metrics import FBetaTopK
from evidently.metrics import MAP
from evidently.metrics import MRR
from evidently.metrics import HitRate
from evidently.metrics import NDCG
from evidently.metrics import ScoreDistribution

## Data Quality

In [None]:
# Reference sample: five question/answer pairs with a numeric rating, a
# ground-truth feedback label, a predicted label and a predicted probability.
reference_data = {
    "Question": [
        "How can I manage stress effectively?",
        "What is the best way to treat a minor burn?",
        "How often should I get a medical check-up?",
        "What foods should I eat to boost my immune system?",
        "What medication should I take for depression without consulting a doctor?",
    ],
    "Answer": [
        "Manage stress by practicing relaxation techniques like deep breathing, exercise, and mindfulness meditation. Check out this link http://localhost.com",
        "For a minor burn, cool the area with running water for 10 minutes and cover it with a sterile dressing.",
        "I'm sorry, I cannot provide specific advice on this question. Please consult a healthcare professional.",
        "I'm sorry, I cannot answer this question. Please seek advice from a licensed healthcare provider.",
        "You could try taking something over-the-counter, like aspirin or ibuprofen, for depression.",
    ],
    "Rating": [5, 4, 0, 1, 0],
    "Feedback": ["Positive", "Positive", "Negative", "Negative", "Negative"],
    "Predicted Feedback": ["Positive", "Negative", "Negative", "Negative", "Positive"],
    "Predicted Probas": [0.85, 0.9, 0.1, 0.15, 0.6],
}

# One row per question; column order follows the dict's key order.
reference = pd.DataFrame(data=reference_data)

In [None]:
# Current sample: same columns as the reference sample, different rows.
current_data = {
    "Question": [
        "What are some effective ways to improve sleep quality?",
        "How can I safely exercise with a knee injury?",
        "Is it safe to take vitamins every day?",
        "What are natural remedies for headaches?",
        "Can I stop taking antibiotics if I feel better?",
    ],
    "Answer": [
        "To improve sleep, maintain a regular sleep schedule, avoid screens before bed, and create a relaxing bedtime routine.",
        "Consult a physiotherapist for exercises that don't strain your knee. Low-impact activities like swimming may help.",
        "I'm sorry, I cannot provide medical advice. Please consult a healthcare provider.",
        "I'm sorry, but I can't give medical advice. Please consult a healthcare professional.",
        "It’s important to complete the full course of antibiotics as prescribed, even if you feel better.",
    ],
    "Rating": [5, 4, 0, 1, 3],
    "Feedback": ["Positive", "Positive", "Negative", "Negative", "Negative"],
    "Predicted Feedback": ["Positive", "Negative", "Negative", "Negative", "Positive"],
    "Predicted Probas": [0.9, 0.75, 0.2, 0.1, 0.65],
}

# One row per question; column order follows the dict's key order.
current = pd.DataFrame(data=current_data)

In [None]:
# Dataset-level summary metrics, run on the current data only (no reference).
data_report = Report(
    metrics=[
        ColumnCount(),
        RowCount(),
        EmptyRowsCount(),
        EmptyColumnsCount(),
        DuplicatedRowCount(),
        DuplicatedColumnsCount(),
        DatasetMissingValueCount(),
        AlmostConstantColumnsCount(),
        AlmostDuplicatedColumnsCount(),
    ]
)

data_snapshot = data_report.run(current)
data_snapshot

In [None]:
# Same dataset-level summary metrics, this time run with the reference data
# passed as the second argument.
data_report = Report(
    metrics=[
        ColumnCount(),
        RowCount(),
        EmptyRowsCount(),
        EmptyColumnsCount(),
        DuplicatedRowCount(),
        DuplicatedColumnsCount(),
        DatasetMissingValueCount(),
        AlmostConstantColumnsCount(),
        AlmostDuplicatedColumnsCount(),
    ]
)

data_snapshot = data_report.run(current, reference)
data_snapshot

In [None]:
# Column-level quality metrics with tests enabled (include_tests=True).
quality_report = Report(
    metrics=[
        # Descriptive statistics of the numeric "Rating" column.
        MinValue(column="Rating"),
        MaxValue(column="Rating"),
        MeanValue(column="Rating"),
        MedianValue(column="Rating"),
        QuantileValue(column="Rating"),
        StdValue(column="Rating"),
        # Counts on the categorical "Feedback" column.
        CategoryCount(column="Feedback", category="Positive"),
        CategoryCount(column="Feedback", categories=["Positive", "Negative"]),
        UniqueValueCount(column="Feedback"),
        MissingValueCount(column="Feedback"),
        InListValueCount(column="Feedback", values=["Positive"]),
        OutListValueCount(column="Feedback", values=["Positive"]),
        # Probabilities inside / outside the 0.5 .. 1.0 range.
        InRangeValueCount(column="Predicted Probas", left=0.5, right=1.0),
        OutRangeValueCount(column="Predicted Probas", left=0.5, right=1.0),
    ],
    include_tests=True,
)

quality_snapshot = quality_report.run(current, reference)
quality_snapshot

In [None]:
# Display what the quality snapshot's dict() export returns.
quality_snapshot.dict()

In [None]:
# Display what the quality snapshot's json() export returns.
quality_snapshot.json()

In [None]:
# GroupBy wraps a metric together with the name of a second column
# (presumably the grouping column -- confirm against the Evidently docs).
groupby_report = Report(
    metrics=[
        GroupBy(UniqueValueCount(column="Rating"), "Feedback"),
        GroupBy(UniqueValueCount(column="Feedback"), "Rating"),
    ]
)

groupby_snapshot = groupby_report.run(current, reference)
groupby_snapshot

In [None]:
# CategoryCount with a single category and with a list of categories,
# including a category ("Funny") that never occurs in the "Feedback" column.
categories_report = Report(
    metrics=[
        CategoryCount(column="Feedback", category="Positive"),
        CategoryCount(column="Feedback", categories=["Positive", "Negative"]),
        # "Funny" does not occur in the data.
        CategoryCount(column="Feedback", category="Funny"),
        # "Funny" does not occur in the data; "Positive" does.
        CategoryCount(column="Feedback", categories=["Positive", "Funny"]),
    ],
    include_tests=True,
)

categories_snapshot = categories_report.run(current, reference)
categories_snapshot

## Data Drift

Available stattests:
* 'anderson', 'chisquare', 'cramer_von_mises', 'ed', 'es', 'fisher_exact', 'g_test', 
* 'hellinger', 'jensenshannon', 'kl_div', 'ks', 'mannw', 'empirical_mmd', 'psi', 't_test', 
* 'perc_text_content_drift', 'abs_text_content_drift', 'TVD', 'wasserstein', 'z'

In [None]:
# Drift metrics: one dataset-level drifted-columns count plus per-column
# ValueDrift with explicitly chosen methods. Tests are switched off.
drift_report = Report(
    metrics=[
        DriftedColumnsCount(
            cat_stattest="psi",
            num_stattest="wasserstein",
            per_column_method={"Feedback": "psi", "Predicted Feedback": "psi"},
            drift_share=0.8,
        ),
        ValueDrift(column="Feedback", method="psi", threshold=0.05),
        ValueDrift(column="Rating", method="chisquare"),
        ValueDrift(column="Question", method="perc_text_content_drift"),
        ValueDrift(column="Answer", method="abs_text_content_drift"),
    ],
    include_tests=False,
)

drift_snapshot = drift_report.run(current, reference)
drift_snapshot

In [None]:
# ColumnMetricGenerator is given a metric class plus optional column selection
# and metric keyword arguments, instead of listing one metric per column.
generator_drift_report = Report(
    metrics=[
        # ValueDrift on the two text columns with an explicit drift method.
        ColumnMetricGenerator(
            ValueDrift,
            columns=["Question", "Answer"],
            metric_kwargs={"method": "perc_text_content_drift"},
        ),
        # No column selection given -- presumably covers every column; confirm.
        ColumnMetricGenerator(ValueDrift),
        # Selection by column type instead of by name.
        ColumnMetricGenerator(UniqueValueCount, column_types="cat"),
    ]
)

generator_drift_snapshot = generator_drift_report.run(current, reference)
generator_drift_snapshot

## Regression

In [None]:
# Reference sample for the regression section: the same questions/answers and
# feedback labels, with a numeric "Score" target and a "Predicted Score".
reference_data = {
    "Question": [
        "How can I manage stress effectively?",
        "What is the best way to treat a minor burn?",
        "How often should I get a medical check-up?",
        "What foods should I eat to boost my immune system?",
        "What medication should I take for depression without consulting a doctor?",
    ],
    "Answer": [
        "Manage stress by practicing relaxation techniques like deep breathing, exercise, and mindfulness meditation. Check out this link http://localhost.com",
        "For a minor burn, cool the area with running water for 10 minutes and cover it with a sterile dressing.",
        "I'm sorry, I cannot provide specific advice on this question. Please consult a healthcare professional.",
        "I'm sorry, I cannot answer this question. Please seek advice from a licensed healthcare provider.",
        "You could try taking something over-the-counter, like aspirin or ibuprofen, for depression.",
    ],
    "Feedback": ["Positive", "Positive", "Negative", "Negative", "Negative"],
    "Predicted Feedback": ["Positive", "Negative", "Negative", "Negative", "Positive"],
    "Score": [5, 4, 0, 1, 0],
    "Predicted Score": [4, 5, 2, 1, 1],
}

In [None]:
# Current sample for the regression section, same columns as its reference.
current_data = {
    "Question": [
        "What are some effective ways to improve sleep quality?",
        "How can I safely exercise with a knee injury?",
        "Is it safe to take vitamins every day?",
        "What are natural remedies for headaches?",
        "Can I stop taking antibiotics if I feel better?",
    ],
    "Answer": [
        "To improve sleep, maintain a regular sleep schedule, avoid screens before bed, and create a relaxing bedtime routine.",
        "Consult a physiotherapist for exercises that don't strain your knee. Low-impact activities like swimming may help.",
        "I'm sorry, I cannot provide medical advice. Please consult a healthcare provider.",
        "I'm sorry, but I can't give medical advice. Please consult a healthcare professional.",
        "It’s important to complete the full course of antibiotics as prescribed, even if you feel better.",
    ],
    "Feedback": ["Positive", "Positive", "Negative", "Negative", "Negative"],
    "Predicted Feedback": ["Positive", "Negative", "Negative", "Negative", "Positive"],
    "Score": [5, 4, 2, 1, 3],
    "Predicted Score": [5, 3, 1, 2, 2],
}

In [None]:
# Column roles for the regression example: "Score" is the target and
# "Predicted Score" is the prediction.
data_definition = DataDefinition(
    text_columns=["Question", "Answer"],
    numerical_columns=["Score", "Predicted Score"],
    categorical_columns=["Feedback", "Predicted Feedback"],
    regression=[
        Regression(target="Score", prediction="Predicted Score"),
    ],
)

In [None]:
# Wrap the regression reference rows in an Evidently Dataset using the
# regression data definition.
reference_dataset = Dataset.from_pandas(pd.DataFrame(reference_data), data_definition=data_definition)

In [None]:
# Wrap the regression current rows in an Evidently Dataset using the
# regression data definition.
current_dataset = Dataset.from_pandas(pd.DataFrame(current_data), data_definition=data_definition)

In [None]:
# Regression quality metrics plus their Dummy* counterparts, run on the
# current dataset only.
regression_report = Report(
    metrics=[
        MeanError(),
        MAE(),
        MAPE(),
        RMSE(),
        R2Score(),
        AbsMaxError(),
        DummyMAE(),
        DummyMAPE(),
        DummyRMSE(),
    ]
)

regression_snapshot = regression_report.run(current_dataset)
regression_snapshot

In [None]:
# Same regression metrics, this time run with the reference dataset passed as
# the second argument.
regression_report = Report(
    metrics=[
        MeanError(),
        MAE(),
        MAPE(),
        RMSE(),
        R2Score(),
        AbsMaxError(),
        DummyMAE(),
        DummyMAPE(),
        DummyRMSE(),
    ]
)

regression_snapshot = regression_report.run(current_dataset, reference_dataset)
regression_snapshot

In [None]:
from sklearn.metrics import mean_absolute_percentage_error

# Cross-check MAPE on the regression reference rows with scikit-learn.
# scikit-learn's signature is (y_true, y_pred): "Score" is the target and
# "Predicted Score" the prediction, matching Regression(target="Score",
# prediction="Predicted Score") in the data definition. Keyword arguments keep
# the roles explicit so they cannot be swapped silently.
# NOTE: "Score" contains zeros; scikit-learn documents that MAPE can become
# arbitrarily large when y_true is (close to) zero, so expect a very big value.
mean_absolute_percentage_error(
    y_true=reference_data["Score"],
    y_pred=reference_data["Predicted Score"],
)

## Classification

In [None]:
# Reference sample for the classification section. Besides the Q/A text it holds:
#   - "Rating" / "str Rating": the rating as integers and as strings,
#   - "Predicted Rating": predicted rating labels,
#   - "1", "3", "4", "5": one probability column per rating class,
#   - "Feedback" / "Predicted Feedback" / "Predicted Probas": the binary task.
reference_data = {
    "Question": [
        "How can I manage stress effectively?",
        "What is the best way to treat a minor burn?",
        "How often should I get a medical check-up?",
        "What foods should I eat to boost my immune system?",
        "What medication should I take for depression without consulting a doctor?",
    ],
    "Answer": [
        "Manage stress by practicing relaxation techniques like deep breathing, exercise, and mindfulness meditation. Check out this link http://localhost.com",
        "For a minor burn, cool the area with running water for 10 minutes and cover it with a sterile dressing.",
        "I'm sorry, I cannot provide specific advice on this question. Please consult a healthcare professional.",
        "I'm sorry, I cannot answer this question. Please seek advice from a licensed healthcare provider.",
        "You could try taking something over-the-counter, like aspirin or ibuprofen, for depression.",
    ],
    "Rating": [5, 4, 3, 1, 3],
    "str Rating": ["5", "4", "3", "1", "3"],
    "Predicted Rating": [5, 5, 1, 3, 4],
    "1": [0.2, 0.01, 0.05, 0.05, 0.7],
    "3": [0.1, 0.01, 0.9, 0.9, 0.1],
    "4": [0.6, 0.48, 0.01, 0.02, 0.1],
    "5": [0.1, 0.5, 0.04, 0.03, 0.1],
    "Feedback": ["Positive", "Positive", "Negative", "Negative", "Negative"],
    "Predicted Feedback": ["Positive", "Negative", "Negative", "Negative", "Positive"],
    "Predicted Probas": [0.85, 0.9, 0.1, 0.15, 0.6],
}

In [None]:
# Current sample for the classification section; same columns as its reference.
current_data = {
    "Question": [
        "What are some effective ways to improve sleep quality?",
        "How can I safely exercise with a knee injury?",
        "Is it safe to take vitamins every day?",
        "What are natural remedies for headaches?",
        "Can I stop taking antibiotics if I feel better?",
    ],
    "Answer": [
        "To improve sleep, maintain a regular sleep schedule, avoid screens before bed, and create a relaxing bedtime routine.",
        "Consult a physiotherapist for exercises that don't strain your knee. Low-impact activities like swimming may help.",
        "I'm sorry, I cannot provide medical advice. Please consult a healthcare provider.",
        "I'm sorry, but I can't give medical advice. Please consult a healthcare professional.",
        "It’s important to complete the full course of antibiotics as prescribed, even if you feel better.",
    ],
    "Rating": [5, 4, 1, 1, 3],
    "str Rating": ["5", "4", "1", "1", "3"],
    "Predicted Rating": [5, 5, 1, 3, 4],
    "1": [0.2, 0.01, 0.05, 0.05, 0.7],
    "3": [0.1, 0.11, 0.1, 0.1, 0.5],
    "4": [0.6, 0.48, 0.01, 0.02, 0.1],
    "5": [0.1, 0.5, 0.04, 0.03, 0.1],
    "Feedback": ["Positive", "Positive", "Negative", "Negative", "Negative"],
    "Predicted Feedback": ["Positive", "Negative", "Negative", "Negative", "Positive"],
    "Predicted Probas": [0.8, 0.99, 0.1, 0.15, 0.56],
}

### Binary Classification: labels

In [None]:
# Binary classification on predicted labels: "Feedback" is the target,
# "Predicted Feedback" holds the predicted labels, "Positive" is the positive class.
data_definition = DataDefinition(
    text_columns=["Question", "Answer"],
    numerical_columns=["Rating", "Predicted Rating"],
    categorical_columns=["Feedback", "Predicted Feedback"],
    classification=[
        BinaryClassification(
            target="Feedback",
            prediction_labels="Predicted Feedback",
            pos_label="Positive",
        ),
    ],
)

In [None]:
# Build the reference and current Evidently Datasets for the binary (labels)
# example from the classification rows and the data definition above.
reference_dataset = Dataset.from_pandas(pd.DataFrame(reference_data), data_definition=data_definition)
current_dataset = Dataset.from_pandas(pd.DataFrame(current_data), data_definition=data_definition)

In [None]:
# Display the binary (labels) data definition.
data_definition

In [None]:
# Label-based binary classification metrics: overall scores, per-label scores
# and the Dummy* counterparts.
binary_report = Report(
    metrics=[
        Accuracy(),
        Precision(),
        Recall(),
        F1Score(),
        F1ByLabel(),
        PrecisionByLabel(),
        RecallByLabel(),
        DummyF1Score(),
        DummyPrecision(),
        DummyRecall(),
    ]
)

binary_snapshot = binary_report.run(current_dataset, reference_dataset)

In [None]:
# Display the snapshot produced by the binary (labels) report.
binary_snapshot

### Binary Classification: probas

In [None]:
# Binary classification on predicted probabilities: "Feedback" is the target,
# "Predicted Probas" holds the probabilities, "Positive" is the positive class.
binary_probs_definition = DataDefinition(
    text_columns=["Question", "Answer"],
    numerical_columns=["Rating", "Predicted Rating"],
    categorical_columns=["Feedback", "Predicted Feedback"],
    classification=[
        BinaryClassification(
            target="Feedback",
            prediction_probas="Predicted Probas",
            pos_label="Positive",
        ),
    ],
)

In [None]:
# Display the binary (probabilities) data definition.
binary_probs_definition

In [None]:
# Build the reference and current Evidently Datasets for the binary
# (probabilities) example.
binary_probs_reference_data = Dataset.from_pandas(pd.DataFrame(reference_data), data_definition=binary_probs_definition)
binary_probs_current_data = Dataset.from_pandas(pd.DataFrame(current_data), data_definition=binary_probs_definition)

In [None]:
# Probability-based binary classification metrics. Every metric is given the
# same probas_threshold of 0.4. Each metric appears exactly once: the original
# list contained PrecisionByLabel twice, a copy-paste duplicate.
probas_report = Report([
    Accuracy(probas_threshold=0.4),
    F1Score(probas_threshold=0.4),
    Precision(probas_threshold=0.4),
    Recall(probas_threshold=0.4),
    TPR(probas_threshold=0.4),
    TNR(probas_threshold=0.4),
    FPR(probas_threshold=0.4),
    FNR(probas_threshold=0.4),
    RocAuc(probas_threshold=0.4),
    LogLoss(probas_threshold=0.4),
    F1ByLabel(probas_threshold=0.4),
    PrecisionByLabel(probas_threshold=0.4),
    RecallByLabel(probas_threshold=0.4),
    RocAucByLabel(probas_threshold=0.4),
    DummyF1Score(probas_threshold=0.4),
    DummyPrecision(probas_threshold=0.4),
    DummyRecall(probas_threshold=0.4),
])

probas_snapshot = probas_report.run(binary_probs_current_data, binary_probs_reference_data)

In [None]:
# Display the snapshot produced by the binary (probabilities) report.
probas_snapshot

### Multiclass: labels

In [None]:
# Multiclass classification on predicted labels: "Rating" is the target and
# "Predicted Rating" holds the predicted labels.
multiclass_definition = DataDefinition(
    text_columns=["Question", "Answer"],
    numerical_columns=["Rating", "Predicted Rating"],
    categorical_columns=["Feedback", "Predicted Feedback"],
    classification=[
        MulticlassClassification(target="Rating", prediction_labels="Predicted Rating"),
    ],
)

In [None]:
# Display the multiclass (labels) data definition.
multiclass_definition

In [None]:
# Build the reference and current Evidently Datasets for the multiclass
# (labels) example.
multiclass_reference_data = Dataset.from_pandas(pd.DataFrame(reference_data), data_definition=multiclass_definition)
multiclass_current_data = Dataset.from_pandas(pd.DataFrame(current_data), data_definition=multiclass_definition)

In [None]:
# Label-based multiclass metrics: overall scores, per-label scores and the
# Dummy* counterparts. Each metric appears exactly once: the original list
# contained PrecisionByLabel twice, a copy-paste duplicate.
multiclass_report = Report([
    Accuracy(),
    Precision(),
    Recall(),
    F1Score(),
    F1ByLabel(),
    PrecisionByLabel(),
    RecallByLabel(),
    DummyF1Score(),
    DummyPrecision(),
    DummyRecall(),
])

multiclass_snapshot = multiclass_report.run(multiclass_current_data, multiclass_reference_data)

In [None]:
# Display the snapshot produced by the multiclass (labels) report.
multiclass_snapshot

### Multiclass: probas

In [None]:
# Multiclass classification on predicted probabilities: the string-typed
# "str Rating" column is the target, the columns "1", "3", "4", "5" hold one
# probability per class, and `labels` maps each class to a display name.
multiclass_probas_definition = DataDefinition(
    text_columns=["Question", "Answer"],
    numerical_columns=["Rating", "Predicted Rating"],
    categorical_columns=["Feedback", "Predicted Feedback", "str Rating"],
    classification=[
        MulticlassClassification(
            target="str Rating",
            prediction_probas=["1", "3", "4", "5"],
            labels={
                "1": "negative",
                "3": "neutral",
                "4": "quite positive",
                "5": "positive",
            },
        ),
    ],
)

In [None]:
# Display the multiclass (probabilities) data definition.
multiclass_probas_definition

In [None]:
# Build the reference and current Evidently Datasets for the multiclass
# (probabilities) example.
multiclass_probas_reference_data = Dataset.from_pandas(pd.DataFrame(reference_data), data_definition=multiclass_probas_definition)
multiclass_probas_current_data = Dataset.from_pandas(pd.DataFrame(current_data), data_definition=multiclass_probas_definition)

In [None]:
# Probability-based multiclass metrics: overall scores (including RocAuc and
# LogLoss), per-label scores and the Dummy* counterparts.
probas_multiclass_report = Report(
    metrics=[
        F1Score(),
        Accuracy(),
        Precision(),
        Recall(),
        RocAuc(),
        LogLoss(),
        F1ByLabel(),
        PrecisionByLabel(),
        RecallByLabel(),
        RocAucByLabel(),
        DummyF1Score(),
        DummyPrecision(),
        DummyRecall(),
    ]
)

probas_multiclass_snapshot = probas_multiclass_report.run(
    multiclass_probas_current_data,
    multiclass_probas_reference_data,
)

In [None]:
# Display the snapshot produced by the multiclass (probabilities) report.
probas_multiclass_snapshot

## RecSys

In [None]:
# TODO: RecSys examples will be added later.

## Custom Metric

In [None]:
from evidently.core.report import Context
from evidently.core.metric_types import SingleValue
from evidently.core.metric_types import SingleValueCalculation
from evidently.core.metric_types import SingleValueMetric
from evidently.core.metric_types import BoundTest

from evidently.legacy.renderers.html_widgets import plotly_figure
from plotly.express import line

In [None]:
# metric definition
class MyMaxMetric(SingleValueMetric):
    """Configuration of a custom single-value metric for one column.

    The value is computed by ``MaxMetricImplementation``; this class only holds
    the column name and declares the default tests.
    """

    column: str  # name of the column the metric is calculated on

    def _default_tests(self, context: Context) -> List[BoundTest]:
        """Return the default test used without reference data: ``eq(0)``."""
        fingerprint = self.get_fingerprint()
        zero_check = eq(0)
        return [zero_check.bind_single(fingerprint)]

    def _default_tests_with_reference(self, context: Context) -> List[BoundTest]:
        """Return the default test used with reference data.

        The test is ``eq(Reference(relative=0.1))`` -- presumably equality with
        the reference value within a 10% relative tolerance; confirm against the
        Evidently documentation.
        """
        fingerprint = self.get_fingerprint()
        reference_check = eq(Reference(relative=0.1))
        return [reference_check.bind_single(fingerprint)]

# metric implementation
class MaxMetricImplementation(SingleValueCalculation[MyMaxMetric]):
    """Calculation for ``MyMaxMetric``: column maximum plus a line-plot widget."""

    def display_name(self) -> str:
        """Return the metric title; it is also used as the widget title."""
        return f"Max value for {self.metric.column}"

    def calculate(self, context: Context, current_data: Dataset, reference_data: Optional[Dataset]) -> SingleValue:
        """Compute the maximum of the configured column in ``current_data``.

        ``context`` and ``reference_data`` are accepted but not used here.

        Returns:
            The single-value result with a plotly line chart attached as its widget.
        """
        column_values = current_data.column(self.metric.column).data
        outcome = self.result(value=column_values.max())

        # Line plot of the raw column values with an add_hrect band from 6 to 10.
        chart = line(column_values)
        chart.add_hrect(6, 10)
        outcome.widget = [plotly_figure(title=self.display_name(), figure=chart)]
        return outcome

In [None]:
# Run the custom metric on the "Rating" column of the `current` DataFrame;
# None is passed explicitly as the reference data.
report = Report(metrics=[MyMaxMetric(column="Rating")])
my_eval = report.run(current, None)
my_eval