# How to run calculations over text data?

In [1]:
import pandas as pd
import numpy as np

from sklearn import datasets, ensemble, model_selection

In [2]:
from evidently import ColumnMapping
from evidently.report import Report
from evidently.test_suite import TestSuite

from evidently.metric_preset import DataDriftPreset
from evidently.metric_preset import DataQualityPreset
from evidently.metric_preset import RegressionPreset
from evidently.metric_preset import ClassificationPreset
from evidently.metric_preset import TargetDriftPreset
from evidently.metric_preset import TextOverviewPreset

from evidently.metrics import *

from evidently.test_preset import NoTargetPerformanceTestPreset
from evidently.test_preset import DataStabilityTestPreset
from evidently.test_preset import DataQualityTestPreset
from evidently.test_preset import DataDriftTestPreset

from evidently.tests import *

from evidently.tests.base_test import generate_column_tests
from evidently.metrics.base_metric import generate_column_metrics

In [3]:
import nltk
nltk.download('words')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package words to /Users/emelidral/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/emelidral/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/emelidral/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
#Dataset for Data Quality and Integrity
reviews_data = datasets.fetch_openml(name='Womens-E-Commerce-Clothing-Reviews', version=2, as_frame='auto')
reviews = reviews_data.frame

In [5]:
reviews['prediction'] = reviews['Rating']
reviews_ref = reviews[reviews.Rating > 3].sample(n=5000, replace=True, ignore_index=True, random_state=42) #.dropna()
reviews_cur = reviews[reviews.Rating < 3].sample(n=5000, replace=True, ignore_index=True, random_state=42) #.dropna()

In [6]:
reviews.head()

Unnamed: 0,Unnamed:_0,Clothing_ID,Age,Title,Review_Text,Rating,Recommended_IND,Positive_Feedback_Count,Division_Name,Department_Name,Class_Name,prediction
0,0.0,767.0,33.0,,Absolutely wonderful - silky and sexy and comf...,4.0,1.0,0.0,Initmates,Intimate,Intimates,4.0
1,1.0,1080.0,34.0,,Love this dress! it's sooo pretty. i happene...,5.0,1.0,4.0,General,Dresses,Dresses,5.0
2,2.0,1077.0,60.0,Some major design flaws,I had such high hopes for this dress and reall...,3.0,0.0,0.0,General,Dresses,Dresses,3.0
3,3.0,1049.0,50.0,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5.0,1.0,0.0,General Petite,Bottoms,Pants,5.0
4,4.0,847.0,47.0,Flattering shirt,This shirt is very flattering to all due to th...,5.0,1.0,6.0,General,Tops,Blouses,5.0


In [7]:
column_mapping = ColumnMapping(
    target='Rating',
    numerical_features=['Age', 'Positive_Feedback_Count'],
    categorical_features=['Division_Name', 'Department_Name', 'Class_Name'],
    text_features=['Review_Text', 'Title']
)

In [15]:
report.json()

'{"version": "0.2.5", "timestamp": "2023-02-28 14:58:36.920637", "metrics": [{"metric": "TextDescriptorsDriftMetric", "result": {"number_of_columns": 1, "number_of_drifted_columns": 1, "share_of_drifted_columns": 1.0, "dataset_drift": true, "drift_by_columns": {"Review Text Length": {"column_name": "Review Text Length", "column_type": "num", "stattest_name": "Wasserstein distance (normed)", "drift_score": 0.13706649730383674, "drift_detected": true, "threshold": 0.1, "typical_examples_cur": null, "typical_examples_ref": null, "typical_words_cur": null, "typical_words_ref": null}}}}]}'

# Metric Presest

In [8]:
data_drift_report = Report(metrics=[
    DataDriftPreset(num_stattest='ks', cat_stattest='psi', num_stattest_threshold=0.2, cat_stattest_threshold=0.2),
])

data_drift_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
data_drift_report

In [9]:
data_drift_report.json()

'{"version": "0.2.2", "timestamp": "2023-02-03 14:37:34.889204", "metrics": [{"metric": "DatasetDriftMetric", "result": {"drift_share": 0.5, "number_of_columns": 9, "number_of_drifted_columns": 6, "share_of_drifted_columns": 0.6666666666666666, "dataset_drift": true}}, {"metric": "DataDriftTable", "result": {"number_of_columns": 9, "number_of_drifted_columns": 6, "share_of_drifted_columns": 0.6666666666666666, "dataset_drift": true, "drift_by_columns": {"Rating": {"column_name": "Rating", "column_type": "cat", "stattest_name": "PSI", "drift_score": 17.19035240686541, "drift_detected": true, "threshold": 0.2, "typical_examples_cur": null, "typical_examples_ref": null, "typical_words_cur": null, "typical_words_ref": null}, "prediction": {"column_name": "prediction", "column_type": "cat", "stattest_name": "PSI", "drift_score": 17.19035240686541, "drift_detected": true, "threshold": 0.2, "typical_examples_cur": null, "typical_examples_ref": null, "typical_words_cur": null, "typical_words_r

In [10]:
data_quality_report = Report(metrics=[
    DataQualityPreset()
])

data_quality_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
data_quality_report

In [11]:
data_quality_report.as_dict()

{'metrics': [{'metric': 'DatasetSummaryMetric',
   'result': {'almost_duplicated_threshold': 0.95,
    'current': {'target': 'Rating',
     'prediction': 'prediction',
     'date_column': None,
     'id_column': None,
     'number_of_columns': 12,
     'number_of_rows': 5000,
     'number_of_missing_values': 781,
     'number_of_categorical_columns': 3,
     'number_of_numeric_columns': 2,
     'number_of_text_columns': 2,
     'number_of_datetime_columns': 0,
     'number_of_constant_columns': 0,
     'number_of_almost_constant_columns': 0,
     'number_of_duplicated_columns': 1,
     'number_of_almost_duplicated_columns': 1,
     'number_of_empty_rows': 0,
     'number_of_empty_columns': 0,
     'number_of_duplicated_rows': 2904,
     'nans_by_columns': {'Unnamed:_0': 0,
      'Clothing_ID': 0,
      'Age': 0,
      'Title': 710,
      'Review_Text': 71,
      'Rating': 0,
      'Recommended_IND': 0,
      'Positive_Feedback_Count': 0,
      'Division_Name': 0,
      'Department_Name

In [12]:
data_quality_report.json()

'{"version": "0.2.2", "timestamp": "2023-02-03 14:37:38.265375", "metrics": [{"metric": "DatasetSummaryMetric", "result": {"almost_duplicated_threshold": 0.95, "current": {"target": "Rating", "prediction": "prediction", "date_column": null, "id_column": null, "number_of_columns": 12, "number_of_rows": 5000, "number_of_missing_values": 781, "number_of_categorical_columns": 3, "number_of_numeric_columns": 2, "number_of_text_columns": 2, "number_of_datetime_columns": 0, "number_of_constant_columns": 0, "number_of_almost_constant_columns": 0, "number_of_duplicated_columns": 1, "number_of_almost_duplicated_columns": 1, "number_of_empty_rows": 0, "number_of_empty_columns": 0, "number_of_duplicated_rows": 2904, "nans_by_columns": {"Unnamed:_0": 0, "Clothing_ID": 0, "Age": 0, "Title": 710, "Review_Text": 71, "Rating": 0, "Recommended_IND": 0, "Positive_Feedback_Count": 0, "Division_Name": 0, "Department_Name": 0, "Class_Name": 0, "prediction": 0}, "number_uniques_by_columns": {"Unnamed:_0": 20

In [None]:
target_drift_report = Report(metrics=[
    TargetDriftPreset()
])

target_drift_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
target_drift_report

In [None]:
target_drift_report.json()

In [15]:
regression_report = Report(metrics=[
    RegressionPreset()
])

regression_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=ColumnMapping(
    target='Rating',
    prediction='prediction',
    numerical_features=['Age', 'Positive_Feedback_Count'],
    categorical_features=['Division_Name', 'Department_Name', 'Class_Name'],
    text_features=['Review_Text', 'Title'],
    task='regression'
  )
)
regression_report

ValueError: Prediction type is categorical but task is regression

In [None]:
regression_report.json()

In [None]:
classification_report = Report(metrics=[
    ClassificationPreset()
])
classification_report.run(reference_data=reviews.sample(n=5000, replace=False), current_data=reviews.sample(n=5000, replace=False), column_mapping=ColumnMapping(
    target='Rating',
    prediction='prediction',
    numerical_features=['Age', 'Positive_Feedback_Count'],
    categorical_features=['Division_Name', 'Department_Name', 'Class_Name'],
    text_features=['Review_Text', 'Title'],
    task='classification'
  )
)

classification_report

In [None]:
classification_report.json()

# Text Overview Preset

In [None]:
text_overview_report = Report(metrics=[
    TextOverviewPreset(column_name="Review_Text")
])

text_overview_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
text_overview_report

In [None]:
text_overview_report.json()

# General Metrics

In [9]:
dataset_metrics_with_text_report = Report(metrics=[
    DatasetSummaryMetric(), 
    DatasetMissingValuesMetric(),
    DatasetCorrelationsMetric(), 
    ConflictTargetMetric(),
    ConflictPredictionMetric(),
    DatasetDriftMetric(),
    DataDriftTable(),
    TargetByFeaturesTable(columns=['Review_Text', 'Title']),
    ClassificationQualityByFeatureTable(), 
])

dataset_metrics_with_text_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
dataset_metrics_with_text_report

In [None]:
dataset_metrics_with_text_report.json()

In [11]:
column_metrics_with_text_report = Report(metrics=[
    ColumnSummaryMetric(column_name="Review_Text"),
    ColumnSummaryMetric(column_name="Age"),
    ColumnMissingValuesMetric(column_name="Review_Text"), 
    ColumnRegExpMetric(column_name="Review_Text", reg_exp=r'.*love*.'),
    ColumnDriftMetric(column_name="Review_Text"),
    ColumnSummaryMetric(column_name="Title"),
    ColumnMissingValuesMetric(column_name="Title"), 
    ColumnRegExpMetric(column_name="Title", reg_exp=r".*love*."),
    ColumnDriftMetric(column_name="Title"),
])

column_metrics_with_text_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
column_metrics_with_text_report

In [None]:
column_metrics_with_text_report.json()

# Text Metrics

In [None]:
text_specific_metrics_report = Report(metrics=[
    TextDescriptorsDriftMetric(column_name="Review_Text"),
    TextDescriptorsDistribution(column_name="Review_Text"),
    TextDescriptorsCorrelationMetric(column_name="Review_Text"),
])

text_specific_metrics_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
text_specific_metrics_report

In [None]:
text_specific_metrics_report.json()

# Test Suites

In [None]:
no_target_performance_suite = TestSuite(tests=[
    NoTargetPerformanceTestPreset()
])

no_target_performance_suite.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
no_target_performance_suite


In [None]:
no_target_performance_suite.json()

In [None]:
data_stability_suite = TestSuite(tests=[
    DataStabilityTestPreset()
])

data_stability_suite.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
data_stability_suite

In [None]:
data_stability_suite.json()

In [None]:
data_quality_suite = TestSuite(tests=[
    DataQualityTestPreset()
])

data_quality_suite.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
data_quality_suite

In [None]:
data_quality_suite.json()

In [None]:
data_drift_suite = TestSuite(tests=[
    DataDriftTestPreset()
])

data_drift_suite.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
data_drift_suite

In [None]:
data_drift_suite.json()

# Tests with Text

In [None]:
dataset_tests_with_text_columns_suite = TestSuite(tests=[
    TestNumberOfRows(),
    TestNumberOfColumns(),
    TestNumberOfMissingValues(),
    TestShareOfMissingValues(),
    TestNumberOfColumnsWithMissingValues(),
    TestShareOfColumnsWithMissingValues(),
    TestNumberOfRowsWithMissingValues(),
    TestShareOfRowsWithMissingValues(),
    TestNumberOfDifferentMissingValues(),
    TestNumberOfConstantColumns(),
    TestNumberOfEmptyRows(),
    TestNumberOfEmptyColumns(),
    TestNumberOfDuplicatedRows(),
    TestNumberOfDuplicatedColumns(),
    TestColumnsType(),
    TestConflictTarget(),
    TestConflictPrediction(),
    TestHighlyCorrelatedColumns(),
    TestTargetFeaturesCorrelations(),
    TestPredictionFeaturesCorrelations(),
    TestCorrelationChanges(),
    TestNumberOfDriftedColumns(),
    TestShareOfDriftedColumns(),
])

dataset_tests_with_text_columns_suite.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
dataset_tests_with_text_columns_suite

In [None]:
dataset_tests_with_text_columns_suite.json()

In [None]:
text_column_test_suite = TestSuite(tests=[
    TestColumnNumberOfMissingValues(column_name='Review_Text'),
    TestColumnShareOfMissingValues(column_name='Review_Text'),
    TestColumnNumberOfDifferentMissingValues(column_name='Review_Text'),
    TestColumnRegExp(column_name='Review_Text', reg_exp='.*love*.'),
    TestColumnDrift(column_name='Review_Text'),
    # the following tests will be adopted to text data later:
    TestColumnAllConstantValues(column_name='Review_Text'),
    TestColumnAllUniqueValues(column_name='Review_Text'),
    TestNumberOfUniqueValues(column_name='Review_Text'),
    TestUniqueValuesShare(column_name='Review_Text'),
    TestMostCommonValueShare(column_name='Review_Text'),
    
])

text_column_test_suite.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
text_column_test_suite

# Generators

In [None]:
suite = TestSuite(tests=[generate_column_tests(TestColumnShareOfMissingValues)]) 
suite.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping) 
suite.show()

In [None]:
suite = TestSuite(tests=[generate_column_tests(TestColumnShareOfMissingValues, columns="text")]) 
suite.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping) 
suite.show()

In [None]:
report = Report(
    metrics=[generate_column_metrics(ColumnDriftMetric, columns="text")]
)
report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
report