# Generate Data

In [1]:
from datetime import datetime
import numpy as np
import pandas as pd
from mlchecks import Dataset
from mlchecks.checks.integrity.rare_format_detection import RareFormatDetection


In [2]:
# Show every column when a DataFrame is displayed (no limit on the column count).
pd.options.display.max_columns = None

In [3]:
# Seed NumPy's global random generator so the random draws made through
# np.random in later cells are repeatable.
SEED = 42
np.random.seed(SEED)

In [4]:
# Build a small synthetic frame with one date column and one e-mail column.
# A fixed start date is used instead of datetime.today() so that re-running the
# notebook produces the same dates every time.
START_DATE = '2021-11-04'
N_ROWS = 200

datelist = pd.date_range(start=START_DATE, periods=N_ROWS, freq='D', normalize=True)
s_date = pd.Series([d.strftime('%Y-%m-%d') for d in datelist], name='date')

# Each address is 6-8 random lowercase letters followed by '@gmail.com'.
# The length is drawn before the letters and the explicit uniform `p` is kept,
# so the order and kind of random draws are unchanged from the one-line version.
alphabet = list('abcdefghijklmnopqrstuvwxyz')
emaillist = []
for _ in range(N_ROWS):
    n_letters = np.random.choice(a=[6, 7, 8], p=[0.2, 0.5, 0.3])
    letters = np.random.choice(a=alphabet, p=[1/26]*26, size=n_letters)
    emaillist.append(''.join(letters) + '@gmail.com')
s_email = pd.Series(emaillist, name='email')

# Put the two named Series side by side as the columns 'date' and 'email'.
df = pd.concat([s_date, s_email], axis=1)

In [5]:
# Preview the first five rows of the generated frame.
df.head(n=5)

Unnamed: 0,date,email
0,2021-11-04,ytpeebw@gmail.com
1,2021-11-05,sazvfee@gmail.com
2,2021-11-06,nlhpdhj@gmail.com
3,2021-11-07,ufnpbpe@gmail.com
4,2021-11-08,yzvhcr@gmail.com


# Run Check - Find no rare formats

In [6]:
# Create the check and wrap the untouched frame in a Dataset.
check = RareFormatDetection()
ds = Dataset(df)

# Run the check; its result is this cell's displayed output.
check.run(dataset=ds)

# Induce changes to data

In [7]:
# Change dates to different format:
df['date'].loc[0:2] = [datetime.strptime(d, '%Y-%m-%d').strftime('%Y-%b-%d') for d in df['date'].loc[0:2]]

# Change emails to have mistakes in format
df['email'].loc[[0,1]] = ['myname@gmail.com1', 'myname@gmail.co']

In [8]:
# Preview the first five rows to see the modified values.
df.head(n=5)

Unnamed: 0,date,email
0,2021-Nov-04,myname@gmail.com1
1,2021-Nov-05,myname@gmail.co
2,2021-Nov-06,nlhpdhj@gmail.com
3,2021-11-07,ufnpbpe@gmail.com
4,2021-11-08,yzvhcr@gmail.com


# Run Check - Find rare formats

In [9]:
# Create a fresh check and wrap the modified frame in a new Dataset.
check = RareFormatDetection()
ds = Dataset(df)

# Run the check; its result is this cell's displayed output.
check.run(dataset=ds)

Unnamed: 0,digits and letters format (case sensitive)
ratio of rare patterns (out of all patterns,1.50% (3)
common formats,['2020-00-00']
examples for values in common formats,['2021-11-07']
values in rare formats,"['2021-Nov-04', '2021-Nov-05', '2021-Nov-06']"


Unnamed: 0,digits and letters format (case sensitive)
ratio of rare patterns (out of all patterns,1.00% (2)
common formats,"['xxxxxxx@gmail.com', 'xxxxxxxx@gmail.com', 'xxxxxx@gmail.com']"
examples for values in common formats,"['nlhpdhj@gmail.com', 'cfbikhvj@gmail.com', 'yzvhcr@gmail.com']"
values in rare formats,"['myname@gmail.com1', 'myname@gmail.co']"


# Test - to be removed!

In [10]:
from urllib.request import urlopen

In [11]:
# Download the attribute description file. The `with` block closes the HTTP
# response once it has been read, and each line is decoded only once.
with urlopen('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names') as name_data:
    decoded_lines = [raw.decode("utf-8") for raw in name_data]

# Keep the lines that contain ':' and do not contain '|'.
lines = [l for l in decoded_lines if ':' in l and '|' not in l]

# The text before the first ':' on each kept line is used as a column name.
features = [l.split(':')[0] for l in lines]
label_name = 'income'

# Columns whose description line does not mention 'continuous'.
cat_features = [l.split(':')[0] for l in lines if 'continuous' not in l]

# Read both splits with the same column names; the label column comes last.
train_df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
                       names=features + [label_name])
# The first line of adult.test is skipped (skiprows=1).
val_df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
                      names=features + [label_name], skiprows=1)

In [12]:
# Create a fresh check and wrap the training frame in a Dataset.
check = RareFormatDetection()
ds = Dataset(train_df)

# Run the check; its result is this cell's displayed output.
check.run(dataset=ds)

In [13]:
# Count how often each distinct value occurs in the 'capital-loss' column.
train_df.loc[:, 'capital-loss'].value_counts()

0       31042
1902      202
1977      168
1887      159
1848       51
        ...  
2080        1
1539        1
1844        1
2489        1
1411        1
Name: capital-loss, Length: 92, dtype: int64

In [14]:
# Tiny example frame: 'x' has repeated values, 'y' counts 0..4.
# Note: this rebinds the name `df`.
df = pd.DataFrame({'x': [3, 3, 5, 7, 7], 'y': list(range(5))})

# Keep the first row of each 'x' group.
df.groupby('x').head(n=1)

Unnamed: 0,x,y
0,3,0
2,5,2
3,7,3
