# Generate Data

In [1]:
from datetime import datetime
import numpy as np
import pandas as pd
from mlchecks import Dataset, validate_dataset, validate_dataset_or_dataframe
from mlchecks.checks.integrity.rare_format_detection import RareFormatDetection


In [2]:
# Show every column when a DataFrame is rendered (None lifts the column cap).
pd.options.display.max_columns = None

In [3]:
# Seed NumPy's global RNG so the randomly generated e-mail addresses below are
# identical on every Restart & Run All.
np.random.seed(42)

In [4]:
# Build a 200-row demo frame with two string columns, each holding a few values
# whose format deviates from the rest of the column.

# Fixed start date (instead of "today") so the dates are the same on every run.
datelist = pd.date_range(start='2021-10-17', periods=200, freq='D')

# First 197 dates use the numeric-month format; the last 3 use the abbreviated
# month name and are therefore the "rare" format in this column.
s_date = pd.Series(
    list(datelist[:197].strftime('%Y-%m-%d')) + list(datelist[197:].strftime('%Y-%b-%d')),
    name='date',
)

# 198 random lowercase addresses with a 6-8 letter local part.
alphabet = list('abcdefghijklmnopqrstuvwxyz')
emaillist = []
for _ in range(198):
    # Draw the length first, then the letters: the same order in which the
    # original nested call evaluated, so the sequence of random draws is kept.
    length = np.random.choice(a=[6, 7, 8], p=[0.2, 0.5, 0.3])
    letters = np.random.choice(a=alphabet, p=[1/26]*26, size=length)
    emaillist.append(''.join(letters) + '@gmail.com')

# Two hand-written addresses with a malformed top-level domain.
emaillist.extend(['myname@gmail.com1', 'myname@gmail.co'])
s_email = pd.Series(emaillist, name='email')

# Place the two named Series side by side as the columns 'date' and 'email'.
df = pd.concat([s_date, s_email], axis=1)

In [5]:
# Preview the generated frame; per the construction above, the off-format dates
# sit at positions 197-199 and the two malformed e-mails in the last two rows.
df

Unnamed: 0,date,email
0,2021-10-17,ytpeebw@gmail.com
1,2021-10-18,sazvfee@gmail.com
2,2021-10-19,nlhpdhj@gmail.com
3,2021-10-20,ufnpbpe@gmail.com
4,2021-10-21,yzvhcr@gmail.com
...,...,...
195,2022-04-30,zdsklss@gmail.com
196,2022-05-01,dcsphccx@gmail.com
197,2022-May-02,ifjbnb@gmail.com
198,2022-May-03,myname@gmail.com1


# Run Check

In [6]:
# Wrap the raw DataFrame in an mlchecks Dataset, the object handed to the check below.
ds = Dataset(df)
# Default-configured check: no arguments are passed to the constructor.
check = RareFormatDetection()

# Last expression of the cell, so the returned result is rendered as the cell output.
check.run(dataset=ds)


  self._features = [x for x in df.columns if x not in {label, index, date}]
  self._cat_features = self.infer_categorical_features()


Unnamed: 0,digits only format (ignoring letters),sequences of digits only format (ignoring letters),sequences of letters only format (ignoring letters),digits and letters format,digits and letters format (case sensitive),any sequence format
ratio of rare patterns to common patterns,1.52%,1.52%,1.52%,1.52%,1.52%,1.52%
common formats,[0000-00-00],[____-__-__],[--],[0000-00-00],[0000-00-00],[0000-00-00]
examples for values in common formats,[2021-10-18],[2021-10-18],[2021-10-18],[2021-10-18],[2021-10-18],[2021-10-18]
values in rare formats,"[2022-May-02, 2022-May-03, 2022-May-04]","[2022-May-02, 2022-May-03, 2022-May-04]","[2022-May-02, 2022-May-03, 2022-May-04]","[2022-May-02, 2022-May-03, 2022-May-04]","[2022-May-02, 2022-May-03, 2022-May-04]","[2022-May-02, 2022-May-03, 2022-May-04]"

Unnamed: 0,digits only format (ignoring letters),sequences of digits only format (ignoring letters),digits and letters format,digits and letters format (case sensitive),any sequence format
ratio of rare patterns to common patterns,0.50%,0.50%,1.01%,1.01%,0.50%
common formats,[@.],[@.],"[XXXXXXX@XXXXX.XXX, XXXXXXXX@XXXXX.XXX, XXXXXX@XXXXX.XXX]","[xxxxxxx@xxxxx.xxx, xxxxxxxx@xxxxx.xxx, xxxxxx@xxxxx.xxx]",[SEQ@SEQ.SEQ]
examples for values in common formats,[sazvfee@gmail.com],[sazvfee@gmail.com],"[sazvfee@gmail.com, qxmdstou@gmail.com, vssubj@gmail.com]","[sazvfee@gmail.com, qxmdstou@gmail.com, vssubj@gmail.com]",[sazvfee@gmail.com]
values in rare formats,[myname@gmail.com1],[myname@gmail.com1],"[myname@gmail.com1, myname@gmail.co]","[myname@gmail.com1, myname@gmail.co]",[myname@gmail.com1]
