-
Notifications
You must be signed in to change notification settings - Fork 699
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
UPDATE: identifier_column code refactored
- Loading branch information
1 parent
49213e7
commit ce2fb90
Showing
6 changed files
with
107 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
49 changes: 49 additions & 0 deletions
49
cleanlab/datalab/internal/issue_manager/identifier_column.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import numpy as np | ||
import pandas as pd | ||
from typing import Tuple, Optional | ||
|
||
from cleanlab.datalab.internal.issue_manager import IssueManager | ||
|
||
|
||
class IdentifierIssueManager(IssueManager):
    """Issue manager that flags identifier-like columns in tabular features.

    A column is treated as an identifier when it holds one distinct whole
    number per row and those numbers form an unbroken run of consecutive
    integers (for example ``0..N-1`` or ``1001..1000+N``), in any row order.

    Parameters
    ----------
    datalab :
        The Datalab instance that this issue manager searches for issues in.
    """

    description: str = "Identifies columns with sequential integers."
    issue_name: str = "identifier_column"
    verbosity_levels = {
        0: [],
        1: [],
        2: [],
    }

    @staticmethod
    def _integer_like_values(column: np.ndarray) -> Optional[np.ndarray]:
        """Return ``column`` as a numeric array if every entry is a whole number.

        Returns ``None`` for columns that cannot be an integer sequence:
        non-numeric data, booleans, NaN/inf, or values with a fractional part.
        """
        kind = column.dtype.kind
        if kind in "iu":
            # Native integer dtypes are whole numbers by construction.
            return column
        if kind == "O":
            # Mixed-type tables arrive as object arrays; only accept columns made
            # purely of real numbers (bool is excluded although it subclasses int).
            all_numbers = all(
                isinstance(value, (int, float, np.integer, np.floating))
                and not isinstance(value, bool)
                for value in column
            )
            if not all_numbers:
                return None
            try:
                column = column.astype(np.float64)
            except (OverflowError, TypeError, ValueError):
                return None
        elif kind != "f":
            # Strings, booleans, complex numbers, datetimes, ... are not integers.
            return None

        if not np.all(np.isfinite(column)):
            return None
        if np.any(column != np.floor(column)):
            return None
        return column

    @staticmethod
    def _identifier_column(features: np.ndarray) -> Tuple[int, Optional[int]]:
        """Locate the first column made of consecutive sequential integers.

        Parameters
        ----------
        features :
            2-D array of shape ``(num_rows, num_columns)``.

        Returns
        -------
        Tuple[int, Optional[int]]
            ``(0, column_index)`` for the first identifier-like column, or
            ``(1, None)`` when no column qualifies.

        Raises
        ------
        ValueError
            If ``features`` is not 2-dimensional.
        """
        features = np.asarray(features)
        if features.ndim != 2:
            raise ValueError(
                f"features must be a 2-D array, got an array with {features.ndim} dimension(s)"
            )
        num_rows, num_columns = features.shape

        # With fewer than two rows there is no sequence to speak of; without this
        # guard an empty array would crash and any single value would "match".
        if num_rows < 2:
            return 1, None

        for column_index in range(num_columns):
            values = IdentifierIssueManager._integer_like_values(features[:, column_index])
            if values is None:
                continue

            unique_values = np.unique(values)  # sorted ascending
            if unique_values.size != num_rows:
                continue  # duplicated values cannot identify rows

            # N distinct integers are consecutive exactly when max - min == N - 1.
            # Python ints avoid overflow on narrow dtypes such as int8.
            span = int(unique_values[-1]) - int(unique_values[0])
            if span == num_rows - 1:
                return 0, column_index  # Found a column that meets the condition

        return 1, None  # No such column found

    def find_issues(self, features: Optional[np.ndarray] = None) -> None:
        """Check ``features`` for an identifier column and record the outcome.

        Sets ``self.summary`` via ``make_summary`` with score ``0`` when an
        identifier column is found and ``1`` otherwise, and stores the index of
        that column (or an empty list) under ``self.info["identifier_column"]``.

        Raises
        ------
        ValueError
            If ``features`` is ``None`` or not 2-dimensional.
        """
        if features is None:
            raise ValueError("Features must be provided to check for issues")

        score, column_index = self._identifier_column(features)

        # NOTE(review): ``self.issues`` is not populated here -- confirm whether
        # the base class or its callers expect a per-example issues table.
        self.summary = self.make_summary(score=score)

        self.info = {"identifier_column": [column_index] if column_index is not None else []}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
from cleanlab.datalab.internal.issue_manager.identifier_column import IdentifierIssueManager | ||
|
||
SEED = 42 | ||
|
||
|
||
class TestIdentifierIssueManager:
    """Tests for ``IdentifierIssueManager`` on small seeded feature matrices."""

    @pytest.fixture
    def embeddings(self):
        """Seeded 4x3 matrix of random floats with no identifier column."""
        np.random.seed(SEED)
        return np.random.random((4, 3))

    @pytest.fixture
    def embeddings_with_id(self):
        """Same seeded 4x3 matrix, with column 0 overwritten by 0, 1, 2, 3."""
        np.random.seed(SEED)
        features = np.random.random((4, 3))
        features[:, 0] = np.arange(4)
        return features

    @pytest.fixture
    def issue_manager(self, lab):
        """Manager under test, bound to the ``lab`` fixture.

        ``lab`` is not defined in this file -- presumably supplied by a shared
        conftest; verify.
        """
        manager = IdentifierIssueManager(datalab=lab)
        return manager

    def test_init(self, lab, issue_manager):
        """The manager keeps a reference to the Datalab it was built with."""
        assert issue_manager.datalab == lab

    def test_find_issues(self, issue_manager, embeddings):
        """Without an identifier column the score is 1 and the info list is empty."""
        np.random.seed(SEED)
        issue_manager.find_issues(features=embeddings)

        summary = issue_manager.summary
        info = issue_manager.info

        assert summary["issue_type"][0] == "identifier_column"
        assert summary["score"][0] == 1
        assert info.get("identifier_column") == []

    def test_find_issues_with_id(self, issue_manager, embeddings_with_id):
        """With an identifier in column 0 the score is 0 and index 0 is reported."""
        np.random.seed(SEED)
        issue_manager.find_issues(features=embeddings_with_id)

        summary = issue_manager.summary
        info = issue_manager.info

        assert summary["issue_type"][0] == "identifier_column"
        assert summary["score"][0] == 0
        assert info.get("identifier_column") == [0]