In [None]:
# default_exp detector

In [None]:
# hide
from nbdev.showdoc import *

In [None]:
# export
from typing import Any, Dict, Callable, Union

import numpy as np
import pandas as pd
from tqdm import tqdm

from forma.judge import FormatJudge
from forma.utils import PatternGenerator

# Detector

> Detects format errors on a tabular dataset.

In [None]:
# export
class FormatDetector:
    """Score every row of a DataFrame for format errors, column by column.

    Usage: call `fit` to train one `FormatJudge` per column, then `detect` to
    obtain a copy of the fitted DataFrame with an extra ``p`` column holding
    one score per row.
    """

    def __init__(self):
        # Both attributes are (re)populated by `fit`. They are declared here
        # so that `detect` can report a clear error when called too early.
        self.judges = {}
        self.df = None

    def fit(self, df: pd.DataFrame,
            generator: Union[PatternGenerator, Dict[str, PatternGenerator]],
            n: int = 3, dim: int = 1):
        """Fit one `FormatJudge` per column of `df`.

        Args:
            df: The dataset to learn formats from. It is stored on the
                instance and scored later by `detect`.
            generator: Either a single `PatternGenerator` shared by every
                column of `df`, or a mapping from column name to the
                `PatternGenerator` to use for that column. With a mapping,
                only the listed columns are fitted.
            n: Forwarded unchanged as the second argument of `FormatJudge`.
            dim: Forwarded unchanged as the third argument of `FormatJudge`.

        Raises:
            KeyError: If a column named in the `generator` mapping is not
                present in `df`.
        """
        # Normalise both accepted shapes to "column -> generator" so the
        # fitting loop exists only once.
        if isinstance(generator, PatternGenerator):
            generators = {col: generator for col in df.columns}
        else:
            generators = generator

        self.df = df
        self.judges = {}
        # The bar counts the columns that are actually fitted, so it also
        # completes when the mapping covers only a subset of the columns.
        with tqdm(total=len(generators)) as pbar:
            for col, gen in generators.items():
                format_judge = FormatJudge(gen, n, dim)
                format_judge.fit(self.df[col].tolist())
                self.judges[col] = format_judge
                pbar.update(1)

    def detect(self, reduction: Callable = np.min) -> pd.DataFrame:
        """Score each row of the fitted DataFrame.

        For every row, each fitted column's judge is called on that row's
        value and the result is averaged with `np.mean`. The per-column
        averages are combined with `reduction`, and the row score is
        ``1 - reduction(per_column_averages)``.

        Args:
            reduction: Callable that collapses the list of per-column
                averages of one row into a single number. Defaults to
                `np.min`.

        Returns:
            A copy of the fitted DataFrame with an added ``p`` column
            containing the row scores.

        Raises:
            RuntimeError: If called before `fit`.
        """
        if self.df is None:
            raise RuntimeError("FormatDetector.detect() called before fit().")

        # Read values the same way `fit` does (`Series.tolist()`), instead of
        # `DataFrame.iterrows()`: iterrows builds one Series per row and does
        # not preserve column dtypes, so a judge could be asked to score a
        # value typed differently from the ones it was fitted on.
        # Only columns that have a judge are scored (a per-column generator
        # mapping may cover a subset), in the DataFrame's column order.
        scored = [(self.judges[col], self.df[col].tolist())
                  for col in self.df.columns if col in self.judges]

        scores = []
        with tqdm(total=len(self.df)) as pbar:
            # Indexed by row position so that `reduction` is still invoked
            # once per row even when no column has a judge.
            for row_pos in range(len(self.df)):
                tuple_score = [np.mean(judge(values[row_pos]))
                               for judge, values in scored]
                scores.append(1 - reduction(tuple_score))
                pbar.update(1)

        assessed_df = self.df.copy()
        assessed_df['p'] = scores
        return assessed_df