# How to get report data in CSV format?

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml

from evidently.report import Report
from evidently.metrics.base_metric import generate_column_metrics
from evidently.metrics import ColumnSummaryMetric, ColumnMissingValuesMetric

from evidently.test_suite import TestSuite
from evidently.tests import TestNumberOfColumns
from evidently.tests import TestColumnsType
from evidently.tests import TestNumberOfEmptyRows
from evidently.tests import TestNumberOfEmptyColumns
from evidently.tests import TestNumberOfDuplicatedRows
from evidently.tests import TestNumberOfDuplicatedColumns

from evidently import ColumnMapping

In [2]:
# Fetch the OpenML "adult" dataset (version 2) and split it by row position:
# rows 0..9999 become the reference set, rows 10000..19999 the current set.
data = fetch_openml(name='adult', version=2, as_frame='auto')
adult_frame = data.frame
reference = adult_frame.iloc[:10000]
current = adult_frame.iloc[10000:20000]

# Column roles handed to Evidently: the target plus the numerical and
# categorical feature lists.
numerical_features = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
categorical_features = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]
columns = ColumnMapping(
    target='class',
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)

## Column summary to CSV through a pandas DataFrame

In [3]:
# Report made of ColumnSummaryMetric instances generated with columns='all'.
summary_metrics = [generate_column_metrics(ColumnSummaryMetric, columns='all')]
column_summary = Report(metrics=summary_metrics)

# Run the report on the current slice, with the reference slice and the
# column mapping defined earlier.
column_summary.run(
    reference_data=reference,
    current_data=current,
    column_mapping=columns,
)

In [4]:
# Export the computed report as a dictionary; its 'metrics' list is what gets
# flattened into table rows next.
column_summary_dict = column_summary.as_dict()

In [5]:
# Flatten the report into {column name -> row}. Every row holds the reference
# characteristics under "ref_"-prefixed keys followed by the current
# characteristics under "cur_"-prefixed keys.
data = {}
for metric in column_summary_dict['metrics']:
    payload = metric['result']
    row = {f"ref_{name}": value for name, value in payload['reference_characteristics'].items()}
    row.update({f"cur_{name}": value for name, value in payload['current_characteristics'].items()})
    data[payload['column_name']] = row
    

In [6]:
# orient='index' turns each outer key of `data` (a dataset column name) into a
# row label and each inner key (ref_* / cur_*) into a table column.
column_summary_frame = pd.DataFrame.from_dict(data, orient='index')

In [7]:
# Display the column summary table (last expression of the cell is rendered).
column_summary_frame

Unnamed: 0,ref_number_of_rows,ref_count,ref_unique,ref_unique_percentage,ref_most_common,ref_most_common_percentage,ref_missing,ref_missing_percentage,ref_new_in_current_values_count,ref_unused_in_current_values_count,...,ref_infinite_percentage,cur_mean,cur_std,cur_min,cur_p25,cur_p50,cur_p75,cur_max,cur_infinite_count,cur_infinite_percentage
workclass,10000,9397,8,0.08,Private,68.43,603,6.03,,,...,,,,,,,,,,
education,10000,10000,16,0.16,HS-grad,32.38,0,0.0,,,...,,,,,,,,,,
marital-status,10000,10000,7,0.07,Married-civ-spouse,45.01,0,0.0,,,...,,,,,,,,,,
occupation,10000,9396,14,0.14,Prof-specialty,12.69,604,6.04,,,...,,,,,,,,,,
relationship,10000,10000,6,0.06,Husband,39.72,0,0.0,,,...,,,,,,,,,,
race,10000,10000,5,0.05,White,85.79,0,0.0,,,...,,,,,,,,,,
sex,10000,10000,2,0.02,Male,66.74,0,0.0,,,...,,,,,,,,,,
native-country,10000,9829,40,0.4,United-States,90.28,171,1.71,,,...,,,,,,,,,,
age,10000,10000,71,0.71,23.0,2.87,0,0.0,,,...,0.0,38.8,13.72,17.0,28.0,37.0,48.0,90.0,0.0,0.0
capital-gain,10000,10000,105,1.05,0.0,91.78,0,0.0,,,...,0.0,1000.19,6907.17,0.0,0.0,0.0,0.0,99999.0,0.0,0.0


In [8]:
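# Disabled so that running the notebook does not write a file; uncomment the next line to export the table.
#column_summary_frame.to_csv('column_summary_frame.csv', header=True, sep=',', index=True, index_label='column')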

## ColumnMissingValuesMetric to CSV through a pandas DataFrame

In [9]:
# Report made of ColumnMissingValuesMetric instances generated with columns='all'.
missing_value_metrics = [generate_column_metrics(ColumnMissingValuesMetric, columns='all')]
missing_values = Report(metrics=missing_value_metrics)

# Run the report on the current slice, with the reference slice and the
# column mapping defined earlier.
missing_values.run(
    reference_data=reference,
    current_data=current,
    column_mapping=columns,
)

In [10]:
# Export the computed report as a dictionary.
missing_values_dict = missing_values.as_dict()

# Flatten the report into {column name -> row}. Every row holds the
# 'reference' statistics under "ref_"-prefixed keys followed by the 'current'
# statistics under "cur_"-prefixed keys.
data = {}
for metric in missing_values_dict['metrics']:
    payload = metric['result']
    row = {f"ref_{name}": value for name, value in payload['reference'].items()}
    row.update({f"cur_{name}": value for name, value in payload['current'].items()})
    data[payload['column_name']] = row

In [11]:
# orient='index' turns each outer key of `data` (a dataset column name) into a
# row label and each inner key (ref_* / cur_*) into a table column.
missing_values_frame = pd.DataFrame.from_dict(data, orient='index')

In [12]:
# Display the missing-values table (last expression of the cell is rendered).
missing_values_frame

Unnamed: 0,ref_number_of_rows,ref_different_missing_values,ref_number_of_different_missing_values,ref_number_of_missing_values,ref_share_of_missing_values,cur_number_of_rows,cur_different_missing_values,cur_number_of_different_missing_values,cur_number_of_missing_values,cur_share_of_missing_values
workclass,10000,"{None: 603, '': 0, -inf: 0, inf: 0}",1,603,0.0603,10000,"{None: 595, '': 0, -inf: 0, inf: 0}",1,595,0.0595
education,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0
marital-status,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0
occupation,10000,"{None: 604, '': 0, -inf: 0, inf: 0}",1,604,0.0604,10000,"{None: 597, '': 0, -inf: 0, inf: 0}",1,597,0.0597
relationship,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0
race,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0
sex,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0
native-country,10000,"{None: 171, '': 0, -inf: 0, inf: 0}",1,171,0.0171,10000,"{None: 177, '': 0, -inf: 0, inf: 0}",1,177,0.0177
age,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0
capital-gain,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0


In [13]:
# Write the missing-values table to 'missing_values_frame.csv'. The frame's
# index (the dataset column names) is written as the first CSV column, labelled
# "column".
# `header` is passed as a real boolean: pandas documents it as a bool or a list
# of column aliases, and the string 'True' is neither (any non-empty string,
# including 'False', is simply truthy).
missing_values_frame.to_csv('missing_values_frame.csv', header=True, sep=',', index=True, index_label='column')

## Several column-based metrics in CSV

In [14]:
# Combine both per-column tables side by side, matching rows on the index
# (the dataset column name). Column names present in both tables get the
# default "_x" (summary) / "_y" (missing values) suffixes.
column_metrics_frame = column_summary_frame.merge(
    missing_values_frame,
    left_index=True,
    right_index=True,
)

In [15]:
# Display the merged table (last expression of the cell is rendered).
column_metrics_frame

Unnamed: 0,ref_number_of_rows_x,ref_count,ref_unique,ref_unique_percentage,ref_most_common,ref_most_common_percentage,ref_missing,ref_missing_percentage,ref_new_in_current_values_count,ref_unused_in_current_values_count,...,ref_number_of_rows_y,ref_different_missing_values,ref_number_of_different_missing_values,ref_number_of_missing_values,ref_share_of_missing_values,cur_number_of_rows_y,cur_different_missing_values,cur_number_of_different_missing_values,cur_number_of_missing_values,cur_share_of_missing_values
workclass,10000,9397,8,0.08,Private,68.43,603,6.03,,,...,10000,"{None: 603, '': 0, -inf: 0, inf: 0}",1,603,0.0603,10000,"{None: 595, '': 0, -inf: 0, inf: 0}",1,595,0.0595
education,10000,10000,16,0.16,HS-grad,32.38,0,0.0,,,...,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0
marital-status,10000,10000,7,0.07,Married-civ-spouse,45.01,0,0.0,,,...,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0
occupation,10000,9396,14,0.14,Prof-specialty,12.69,604,6.04,,,...,10000,"{None: 604, '': 0, -inf: 0, inf: 0}",1,604,0.0604,10000,"{None: 597, '': 0, -inf: 0, inf: 0}",1,597,0.0597
relationship,10000,10000,6,0.06,Husband,39.72,0,0.0,,,...,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0
race,10000,10000,5,0.05,White,85.79,0,0.0,,,...,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0
sex,10000,10000,2,0.02,Male,66.74,0,0.0,,,...,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0
native-country,10000,9829,40,0.4,United-States,90.28,171,1.71,,,...,10000,"{None: 171, '': 0, -inf: 0, inf: 0}",1,171,0.0171,10000,"{None: 177, '': 0, -inf: 0, inf: 0}",1,177,0.0177
age,10000,10000,71,0.71,23.0,2.87,0,0.0,,,...,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0
capital-gain,10000,10000,105,1.05,0.0,91.78,0,0.0,,,...,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0,10000,"{'': 0, -inf: 0, None: 0, inf: 0}",0,0,0.0


In [16]:
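# Disabled so that running the notebook does not write a file; uncomment the next line to export the table.
#column_metrics_frame.to_csv('column_metrics_frame.csv', header=True, sep=',', index=True, index_label='column')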

## Test results in CSV format

In [17]:
# Dataset-level checks to run: column count and types, empty rows/columns,
# duplicated rows/columns.
dataset_checks = [
    TestNumberOfColumns(),
    TestColumnsType(),
    TestNumberOfEmptyRows(),
    TestNumberOfEmptyColumns(),
    TestNumberOfDuplicatedRows(),
    TestNumberOfDuplicatedColumns(),
]
dataset_tests = TestSuite(tests=dataset_checks)

# Run the suite on the current slice, with the reference slice and the
# column mapping defined earlier.
dataset_tests.run(
    reference_data=reference,
    current_data=current,
    column_mapping=columns,
)

In [18]:
# Export the executed test suite as a dictionary.
dataset_tests_dict = dataset_tests.as_dict()

# One record per test, keeping only its name, group and status.
data = [
    {
        'test': outcome['name'],
        'group': outcome['group'],
        'status': outcome['status'],
    }
    for outcome in dataset_tests_dict['tests']
]

In [19]:
# Each record in `data` becomes one row; the record keys ('test', 'group',
# 'status') become the columns.
dataset_tests_frame = pd.DataFrame.from_records(data)

In [20]:
# Display the test results table (last expression of the cell is rendered).
dataset_tests_frame

Unnamed: 0,test,group,status
0,Number of Columns,data_integrity,SUCCESS
1,Column Types,data_integrity,SUCCESS
2,Number of Empty Rows,data_integrity,SUCCESS
3,Number of Empty Columns,data_integrity,SUCCESS
4,Number of Duplicate Rows,data_integrity,SUCCESS
5,Number of Duplicate Columns,data_integrity,SUCCESS


In [21]:
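# Disabled so that running the notebook does not write a file; uncomment the next line to export the table.
#dataset_tests_frame.to_csv('dataset_tests_frame.csv', header=True, sep=',', index=True, index_label='index')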