# Generate Data

In [1]:
from datetime import datetime
import numpy as np
import pandas as pd
from mlchecks import Dataset
from mlchecks.checks.integrity.rare_format_detection import RareFormatDetection


In [2]:
# Show every column when a DataFrame is rendered (None = no column limit).
pd.options.display.max_columns = None

In [3]:
# Seed NumPy's global RNG so the random e-mail addresses generated below
# are the same on every run.
SEED = 42
np.random.seed(SEED)

In [4]:
# Build a 200-row demo frame with two string columns:
#   * 'date'  - consecutive days formatted as YYYY-MM-DD
#   * 'email' - 6 to 8 random lowercase letters followed by '@gmail.com'
n_rows = 200

# A fixed start date is used instead of datetime.today() so that the dates,
# and therefore every output recorded in this notebook, are identical on each
# Restart & Run All.
datelist = pd.date_range(start='2021-10-23', periods=n_rows, freq='D')
s_date = pd.Series([d.strftime('%Y-%m-%d') for d in datelist], name='date')

letters = list('abcdefghijklmnopqrstuvwxyz')
# The local-part length is drawn first (6/7/8 chars with p = 0.2/0.5/0.3), then
# that many letters. The explicit uniform `p` is kept on purpose: dropping it
# may change the sequence of random draws and hence the recorded outputs.
emaillist = [
    ''.join(np.random.choice(a=letters, p=[1/26]*26, size=np.random.choice(a=[6, 7, 8], p=[0.2, 0.5, 0.3]))) + '@gmail.com'
    for _ in range(n_rows)
]
s_email = pd.Series(emaillist, name='email')

# Place the two named Series side by side; the Series names become the columns.
df = pd.concat([s_date, s_email], axis=1)

In [5]:
# Preview the first five generated rows.
df.head(n=5)

Unnamed: 0,date,email
0,2021-10-23,ytpeebw@gmail.com
1,2021-10-24,sazvfee@gmail.com
2,2021-10-25,nlhpdhj@gmail.com
3,2021-10-26,ufnpbpe@gmail.com
4,2021-10-27,yzvhcr@gmail.com


# Run Check - Find no rare formats

In [6]:
# Baseline run on the untouched data: every value follows a common format,
# so no result table is shown beneath this cell.
check = RareFormatDetection()
ds = Dataset(df)

check.run(dataset=ds)

# Induce changes to data

In [7]:
# Rewrite the first three dates (labels 0..2, .loc slices are inclusive) with an
# abbreviated month name, e.g. 2021-10-23 -> 2021-Oct-23.
# A single df.loc[rows, column] assignment is used instead of the chained
# df[column].loc[rows] form: the chained form may write to a temporary copy
# (SettingWithCopyWarning) and leaves df unchanged under pandas Copy-on-Write.
df.loc[0:2, 'date'] = [
    datetime.strptime(d, '%Y-%m-%d').strftime('%Y-%b-%d')
    for d in df.loc[0:2, 'date']
]

# Break the e-mail format in the first two rows (trailing digit / truncated TLD).
df.loc[[0, 1], 'email'] = ['myname@gmail.com1', 'myname@gmail.co']

In [8]:
# Preview the first five rows again to confirm the altered dates and e-mails.
df.head(n=5)

Unnamed: 0,date,email
0,2021-Oct-23,myname@gmail.com1
1,2021-Oct-24,myname@gmail.co
2,2021-Oct-25,nlhpdhj@gmail.com
3,2021-10-26,ufnpbpe@gmail.com
4,2021-10-27,yzvhcr@gmail.com


# Run Check - Find rare formats

In [9]:
# Wrap the modified frame in a fresh Dataset and run a new check instance;
# the tables below list the values whose format is rare in each column.
check = RareFormatDetection()
ds = Dataset(df)

check.run(dataset=ds)

Unnamed: 0,digits and letters format (case sensitive)
ratio of rare patterns to common patterns,1.52%
common formats,['0000-00-00']
examples for values in common formats,['2021-10-26']
values in rare formats,"['2021-Oct-23', '2021-Oct-24', '2021-Oct-25']"


Unnamed: 0,digits and letters format (case sensitive)
ratio of rare patterns to common patterns,1.01%
common formats,"['xxxxxxx@xxxxx.xxx', 'xxxxxxxx@xxxxx.xxx', 'xxxxxx@xxxxx.xxx']"
examples for values in common formats,"['nlhpdhj@gmail.com', 'cfbikhvj@gmail.com', 'yzvhcr@gmail.com']"
values in rare formats,"['myname@gmail.com1', 'myname@gmail.co']"


In [10]:
# Run the check once more, this time keeping the returned result object in `x`
# (instead of letting the notebook render it) so it can be inspected.
x = check.run(dataset=ds)

In [11]:
# Raw value behind the rendered tables: as the output below shows, it is a dict
# keyed by column name ('date', 'email'), each entry holding that column's table.
x.value

{'date':                                           digits and letters format (case sensitive)
 ratio of rare patterns to common patterns                                      1.52%
 common formats                                                          [0000-00-00]
 examples for values in common formats                                   [2021-10-26]
 values in rare formats                       [2021-Oct-23, 2021-Oct-24, 2021-Oct-25],
 'email':                                                   digits and letters format (case sensitive)
 ratio of rare patterns to common patterns                                              1.01%
 common formats                             [xxxxxxx@xxxxx.xxx, xxxxxxxx@xxxxx.xxx, xxxxxx...
 examples for values in common formats      [nlhpdhj@gmail.com, cfbikhvj@gmail.com, yzvhcr...
 values in rare formats                                  [myname@gmail.com1, myname@gmail.co]}