In [None]:
# hide
import numpy as np
import pandas as pd

from forma.judge import FormatJudge
from forma.detector import FormatDetector
from forma.utils import PatternGenerator

# Forma

> Automatic format error detection on tabular data.

Forma is an open-source library, written in Python, that enables automatic, domain-agnostic detection of format errors in tabular data. The library is a by-product of the research project [BigDataStack](https://bigdatastack.eu/).

## Install

Run `pip install forma` to install the library in your environment.

## How to use

We will work with the popular [MovieLens](https://grouplens.org/datasets/movielens/) dataset.

In [None]:
# local
# Load the ratings file. Column names are supplied explicitly, and the
# multi-character '::' separator requires pandas' Python parsing engine.
col_names = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_df = pd.read_csv(
    '../data/ratings.dat',
    sep='::',
    names=col_names,
    engine='python',
)

In [None]:
# local
# Preview the first five rows of the raw ratings.
ratings_df.head(n=5)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


Let us introduce a few artificial format errors.

In [None]:
# local
# Work on a string-typed version of the data so malformed values can be
# injected. `astype` already returns a new DataFrame, so `ratings_df` is
# left untouched and no extra `.copy()` is needed.
dirty_df = ratings_df.astype('str')

# Inject each error with a single `.loc[row, column]` assignment.
# Chained indexing (`dirty_df.iloc[3]['timestamp'] = ...`) assigns to an
# intermediate row Series that may be a copy, in which case `dirty_df`
# silently stays unchanged (and always does under pandas Copy-on-Write).
# The frame uses the default integer index (see the preview above), so the
# labels 2, 3 and 4 address the same rows as the original positions.
dirty_df.loc[3, 'timestamp'] = '9783000275'  # one digit too many
dirty_df.loc[2, 'movie_id'] = '914.'         # stray trailing dot
dirty_df.loc[4, 'rating'] = '10'             # two digits instead of one

Initialize the detector, fit and detect. The returned result is a pandas DataFrame with an extra column `p`, which records the probability of a format error being present in the row. We see that the probability is higher for the rows where we introduced artificial mistakes.

In [None]:
# local
# Create the detector.
detector = FormatDetector()

# Build one pattern generator per column; every column uses the same
# configuration, but each gets its own PatternGenerator instance.
generators = {
    column: PatternGenerator(other='leaf')
    for column in ('user_id', 'movie_id', 'rating', 'timestamp')
}

# Fit the detector on the data containing the injected errors.
detector.fit(dirty_df, generator=generators, n=3)

# Compute the error probabilities. `np.mean` is passed as the reduction;
# presumably it aggregates per-column scores into the row-level `p`
# (TODO confirm against FormatDetector.detect).
assessed_df = detector.detect(reduction=np.mean)

# Display the first rows together with the probability column `p`.
assessed_df.head()

100%|██████████| 4/4 [00:00<00:00, 158.06it/s]


Unnamed: 0,user_id,movie_id,rating,timestamp,p
0,1,1193.0,5,978300760,0.0675
1,1,661.0,3,978302109,0.1975
2,1,914.0,3,978301968,0.24413
3,1,3408.0,4,9783000275,0.3125
4,1,2355.0,10,978824291,0.3125
