In [None]:
# default_exp detector

In [None]:
# hide
from nbdev.showdoc import *

In [None]:
# export
import numpy as np
import pandas as pd

from typing import Any, Dict, Callable
from tqdm import tqdm

from forma.judge import FormatJudge
from forma.utils import PatternGenerator

# Detector

> Detects format errors in a tabular dataset.

In [None]:
# export
class FormatDetector:
    """Score the rows of a DataFrame using one ``FormatJudge`` per column.

    ``fit`` trains a ``FormatJudge`` on the values of every non-skipped
    column; ``detect`` then evaluates each cell with its column's judge and
    reduces the per-column scores of a row into a single value stored in a
    new ``'p'`` column.
    """

    def __init__(self, skip: "list | None" = None):
        """Create a detector.

        Args:
            skip: Column names that are neither fitted nor scored.
                ``None`` (the default) means no column is skipped.
        """
        # Normalise ``None`` to an empty list: ``fit`` and ``detect`` test
        # membership with ``in``, which raises TypeError on ``None``.
        self.skip = skip if skip is not None else []

    def _active_columns(self) -> list:
        """Return the columns of the fitted frame that are not skipped."""
        # Tolerate ``skip`` having been reset to ``None`` after construction.
        skipped = self.skip if self.skip is not None else ()
        return [col for col in self.df.columns if col not in skipped]

    def fit(self, df: pd.DataFrame,
            generator: "PatternGenerator | Dict[str, PatternGenerator]",
            n: int = 3, dim: int = 1):
        """Fit one ``FormatJudge`` per non-skipped column of ``df``.

        Args:
            df: Frame to learn from; it is stored on the instance and reused
                by ``detect``.
            generator: Either a single ``PatternGenerator`` shared by every
                column, or a mapping from column name to the generator for
                that column. Columns missing from the mapping fall back to a
                default-constructed ``PatternGenerator()``.
            n: Forwarded unchanged to ``FormatJudge``.
            dim: Forwarded unchanged to ``FormatJudge``.
        """
        self.judges = {}
        self.df = df

        shared_generator = isinstance(generator, PatternGenerator)
        columns = self._active_columns()
        # Size the bar by the columns actually processed so it reaches 100%
        # even when some columns are skipped.
        with tqdm(total=len(columns)) as pbar:
            for col in columns:
                if shared_generator:
                    gen = generator
                else:
                    gen = generator.get(col, PatternGenerator())
                format_judge = FormatJudge(gen, n, dim)
                format_judge.fit(self.df[col].tolist())
                self.judges[col] = format_judge
                pbar.update(1)

    def detect(self, reduction: Callable = np.min, softmax: bool = True) -> pd.DataFrame:
        """Score every row of the fitted frame.

        For each row, the judge of every non-skipped column is called on the
        cell value and the mean of its output is taken as that column's
        score. The scores of a row are optionally passed through a softmax
        and then collapsed with ``reduction``.

        Args:
            reduction: Callable that maps the sequence of per-column scores
                of a row to a scalar. With ``np.ptp`` the reduced value is
                stored as is; with any other callable ``1 - reduced`` is
                stored. When ``softmax`` is true the callable receives a
                NumPy array, otherwise a list.
            softmax: Whether to softmax-normalise the per-column scores of a
                row before reducing them.

        Returns:
            A copy of the fitted DataFrame with an additional ``'p'`` column
            holding the score of each row.
        """
        columns = self._active_columns()
        # The peak-to-peak range is used directly; every other reduction is
        # flipped with ``1 - value``.
        keep_as_is = reduction is np.ptp
        scores = []

        with tqdm(total=len(self.df)) as pbar:
            for _, row in self.df.iterrows():
                tuple_score = [np.mean(self.judges[col](row[col])) for col in columns]
                if softmax:
                    exp_scores = np.exp(tuple_score)
                    # Compute the normaliser once instead of once per element.
                    tuple_score = exp_scores / exp_scores.sum()
                reduced = reduction(tuple_score)
                scores.append(reduced if keep_as_is else 1 - reduced)
                pbar.update(1)

        assessed_df = self.df.copy()
        assessed_df['p'] = scores
        return assessed_df

In [None]:
# Two example per-column scores: one very low, one very high.
a = [0.02, 0.99]

In [None]:
# Peak-to-peak range (max - min) of the raw scores.
np.ptp(a)

0.97

In [None]:
# Element-wise exponential of the scores (the softmax numerators).
b = np.exp(a)

In [None]:
# Divide each exponential by their sum so the values add up to 1 (softmax).
c = [value / sum(b) for value in b]

In [None]:
# Standard deviation of the softmax-normalised scores.
d = np.std(c)

In [None]:
# Display the value of d.
d

0.2251194977898231