# Demonstration of the profiling capabilities of whylogs

In [1]:
import pandas as pd
import numpy as np

# Read the Iris dataset from a gist URL (pinned to a specific revision hash)
# into a pandas DataFrame and preview the first rows.
iris_url = (
    "https://gist.githubusercontent.com/netj/8836201/raw/"
    "6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv"
)
iris_df = pd.read_csv(iris_url)
iris_df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


We split this into two random parts: the "baseline" (100 rows) and the "update" (the remaining 50 rows, called `target` in the code below).

In [2]:
# Draw 100 rows for the baseline; the fixed random_state makes the draw repeatable.
baseline_df = iris_df.sample(n=100, random_state=123)

# Every row that was not drawn into the baseline becomes the target.
remaining_index = iris_df.index.difference(baseline_df.index)
target_df = iris_df.loc[remaining_index, :]

# Generate whylogs profiles

In [5]:
import whylogs as why

# Profile the baseline sample with whylogs.
base_profile = why.log(pandas=baseline_df)
# Display the profile summary transposed: one column per dataset column,
# one row per metric (counts, inferred types, cardinality estimates).
base_profile.view().to_pandas().T

column,sepal.length,sepal.width,petal.length,variety,petal.width
counts/n,100,100,100,100,100
counts/null,0,0,0,0,0
types/integral,0,0,0,0,0
types/fractional,100,100,100,0,100
types/boolean,0,0,0,0,0
types/string,0,0,0,100,0
types/object,0,0,0,0,0
cardinality/est,31.000002,22.000001,40.000004,3.0,22.000001
cardinality/upper_1,31.00155,22.0011,40.002001,3.00015,22.0011
cardinality/lower_1,31.0,22.0,40.0,3.0,22.0


In [6]:
# Profile the target rows (those not drawn into the baseline) the same way.
target_profile = why.log(pandas=target_df)
# Display the profile summary transposed: one column per dataset column,
# one row per metric.
target_profile.view().to_pandas().T

column,sepal.length,sepal.width,petal.length,variety,petal.width
counts/n,50,50,50,50,50
counts/null,0,0,0,0,0
types/integral,0,0,0,0,0
types/fractional,50,50,50,0,50
types/boolean,0,0,0,0,0
types/string,0,0,0,50,0
types/object,0,0,0,0,0
cardinality/est,25.000001,16.000001,28.000002,3.0,18.000001
cardinality/upper_1,25.00125,16.000799,28.0014,3.00015,18.000899
cardinality/lower_1,25.0,16.0,28.0,3.0,18.0


# Compare the baseline profile and the update (target) profile

In [7]:
from whylogs.viz import NotebookProfileVisualizer

# Hand both profile views to the visualizer: the rows left out of the baseline
# are the target, and the baseline sample is the reference.
visualization = NotebookProfileVisualizer()
visualization.set_profiles(
    target_profile_view=target_profile.view(),
    reference_profile_view=base_profile.view(),
)
# Last expression of the cell, so the report is shown as the cell output.
visualization.summary_drift_report()

# Profile a Spark DataFrame

In [8]:
from pyspark.sql.session import SparkSession
# Obtain a Spark session for the application named "profile".
spark = (
    SparkSession.builder
    .appName("profile")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/06/27 12:24:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/27 12:24:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [9]:
# Replace the dots in the column names with underscores
# (e.g. "sepal.length" -> "sepal_length") ahead of the Spark conversion.
# NOTE(review): presumably needed because Spark treats dots in column names
# as nested-field access — confirm.
new_names = {col: col.replace(".", "_") for col in iris_df.columns}
iris_df = iris_df.rename(columns=new_names)

In [10]:
# Build a Spark DataFrame from the pandas DataFrame (columns already renamed with underscores).
iris_df_spark = spark.createDataFrame(iris_df)

In [11]:
# Peek at the first five rows of the Spark DataFrame to check the conversion.
iris_df_spark.head(5)

                                                                                

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, variety='Setosa'),
 Row(sepal_length=4.9, sepal_width=3.0, petal_length=1.4, petal_width=0.2, variety='Setosa'),
 Row(sepal_length=4.7, sepal_width=3.2, petal_length=1.3, petal_width=0.2, variety='Setosa'),
 Row(sepal_length=4.6, sepal_width=3.1, petal_length=1.5, petal_width=0.2, variety='Setosa'),
 Row(sepal_length=5.0, sepal_width=3.6, petal_length=1.4, petal_width=0.2, variety='Setosa')]

In [12]:
from whylogs.api.pyspark.experimental import collect_dataset_profile_view
# Profile the whole Spark DataFrame; the resulting view is summarised with .to_pandas() in the next cell.
iris_profile_view = collect_dataset_profile_view(input_df=iris_df_spark)

  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res_df.append(df_temp)
  res_df = res

                                                                                

In [13]:
# Display the Spark-based profile summary transposed: one column per dataset
# column, one row per metric.
iris_profile_view.to_pandas().T

column,petal_length,petal_width,sepal_length,sepal_width,variety
counts/n,150,150,150,150,150
counts/null,0,0,0,0,0
types/integral,0,0,0,0,0
types/fractional,150,150,150,150,0
types/boolean,0,0,0,0,0
types/string,0,0,0,0,150
types/object,0,0,0,0,0
cardinality/est,43.000004,22.000001,35.000003,23.000001,3.0
cardinality/upper_1,43.002151,22.0011,35.00175,23.00115,3.00015
cardinality/lower_1,43.0,22.0,35.0,23.0,3.0
