Skip to content

Commit

Permalink
UPDATE: identifier_column code refactored
Browse files Browse the repository at this point in the history
  • Loading branch information
01PrathamS committed Jan 18, 2024
1 parent 49213e7 commit ce2fb90
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 1 deletion.
5 changes: 4 additions & 1 deletion cleanlab/datalab/internal/data_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,10 @@ def collect_issues_from_issue_manager(self, issue_manager: IssueManager) -> None
self.issue_summary["issue_type"] != issue_manager.issue_name
]
issue_column_name: str = f"is_{issue_manager.issue_name}_issue"
num_issues: int = int(issue_manager.issues[issue_column_name].sum())
if not issue_manager.issues.empty:
num_issues: int = int(issue_manager.issues[issue_column_name].sum())
else:
num_issues = 1 if (issue_manager.summary.score).any() == 0 else 0
self.issue_summary = pd.concat(
[
self.issue_summary,
Expand Down
1 change: 1 addition & 0 deletions cleanlab/datalab/internal/issue_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
"data_valuation": ["knn_graph"],
"class_imbalance": [],
"null": ["features"],
"identifier_column": ["features"],
}
_REGRESSION_ARGS_DICT = {"label": ["features", "predictions"]}

Expand Down
1 change: 1 addition & 0 deletions cleanlab/datalab/internal/issue_manager/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
from .underperforming_group import UnderperformingGroupIssueManager
from .data_valuation import DataValuationIssueManager
from .null import NullIssueManager
from .identifier_column import IdentifierIssueManager
49 changes: 49 additions & 0 deletions cleanlab/datalab/internal/issue_manager/identifier_column.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import numpy as np
import pandas as pd
from typing import Tuple, Optional

from cleanlab.datalab.internal.issue_manager import IssueManager


class IdentifierIssueManager(IssueManager):
    """An issue manager that keeps track of issues related to identifier
    columns in tabular datasets.

    A column counts as an identifier column when it holds exactly one distinct
    integer per row and those integers are consecutive (in any row order),
    e.g. a running row index such as ``0, 1, 2, ...``.

    Parameters
    ----------
    datalab :
        The Datalab instance that this issue manager searches for issues in.
    """

    description: str = "Identifies columns with sequential integers."
    issue_name: str = "identifier_column"
    verbosity_levels = {
        0: [],
        1: [],
        2: [],
    }

    @staticmethod
    def _identifier_column(features: np.ndarray) -> Tuple[int, Optional[int]]:
        """Locate the first column made up of consecutive integers.

        Parameters
        ----------
        features :
            2-D array of shape ``(num_rows, num_columns)``.

        Returns
        -------
        score, column_index :
            ``(0, i)`` where ``i`` is the index of the first column whose
            values are ``num_rows`` distinct consecutive integers, or
            ``(1, None)`` when no column qualifies (including when
            ``features`` has no rows).
        """
        features = np.asarray(features)
        num_rows, num_columns = features.shape

        # With no rows there is nothing that could form a sequence.
        if num_rows == 0:
            return 1, None

        for i in range(num_columns):
            column = features[:, i]

            # Mixed-type tables arrive as object arrays; let NumPy re-infer the
            # dtype of this single column so integer columns are still seen.
            if column.dtype == object:
                try:
                    column = np.asarray(column.tolist())
                except (TypeError, ValueError):
                    continue

            if np.issubdtype(column.dtype, np.floating):
                # Floats only qualify when every value is a finite whole
                # number; this rejects NaN/inf and sequences like 0.5, 1.5, ...
                if not np.all(np.isfinite(column)) or not np.all(column == np.floor(column)):
                    continue
            elif not np.issubdtype(column.dtype, np.integer):
                # Strings, booleans and other non-numeric data are never IDs here.
                continue

            unique_values = np.unique(column)

            # num_rows distinct integers spanning a range of num_rows - 1 are
            # necessarily consecutive.
            if (
                unique_values.size == num_rows
                and unique_values[-1] - unique_values[0] == num_rows - 1
            ):
                return 0, i  # Found a column that meets the condition

        return 1, None  # No such column found

    def find_issues(self, features: Optional[np.ndarray] = None) -> None:
        """Check ``features`` for an identifier column and record the result.

        Stores the summary built by ``make_summary`` in ``self.summary``
        (score ``0`` when an identifier column was found, ``1`` otherwise) and
        sets ``self.info["identifier_column"]`` to a list holding the index of
        the detected column, or an empty list when none was found.

        Parameters
        ----------
        features :
            2-D array of feature values to inspect.

        Raises
        ------
        ValueError
            If ``features`` is ``None``.
        """
        if features is None:
            raise ValueError("Features must be provided to check for issues")

        score, column_index = self._identifier_column(features)

        self.summary = self.make_summary(score=score)

        self.info = {"identifier_column": [column_index] if column_index is not None else []}
2 changes: 2 additions & 0 deletions cleanlab/datalab/internal/issue_manager_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
DataValuationIssueManager,
OutlierIssueManager,
NullIssueManager,
IdentifierIssueManager,
)
from cleanlab.datalab.internal.issue_manager.regression import RegressionLabelIssueManager

Expand All @@ -65,6 +66,7 @@
"underperforming_group": UnderperformingGroupIssueManager,
"data_valuation": DataValuationIssueManager,
"null": NullIssueManager,
"identifier_column": IdentifierIssueManager,
},
"regression": {"label": RegressionLabelIssueManager},
}
Expand Down
50 changes: 50 additions & 0 deletions tests/datalab/issue_manager/test_identifier_column.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import numpy as np
import pytest

from cleanlab.datalab.internal.issue_manager.identifier_column import IdentifierIssueManager

SEED = 42


class TestIdentifierIssueManager:
    """Tests for ``IdentifierIssueManager`` on small seeded feature matrices."""

    @pytest.fixture
    def embeddings(self):
        """Seeded random 4x3 matrix."""
        np.random.seed(SEED)
        return np.random.random((4, 3))

    @pytest.fixture
    def embeddings_with_id(self):
        """Same seeded 4x3 matrix, with column 0 overwritten by 0..3."""
        np.random.seed(SEED)
        features = np.random.random((4, 3))
        features[:, 0] = np.arange(4)
        return features

    @pytest.fixture
    def issue_manager(self, lab):
        """Issue manager under test, bound to the ``lab`` fixture."""
        manager = IdentifierIssueManager(datalab=lab)
        return manager

    def test_init(self, lab, issue_manager):
        """The manager keeps a reference to the Datalab it was built with."""
        assert issue_manager.datalab == lab

    def test_find_issues(self, issue_manager, embeddings):
        """Random features: score 1 and an empty identifier list."""
        np.random.seed(SEED)
        issue_manager.find_issues(features=embeddings)

        summary = issue_manager.summary
        info = issue_manager.info

        assert summary["issue_type"][0] == "identifier_column"
        assert summary["score"][0] == 1
        assert info.get("identifier_column") == []

    def test_find_issues_with_id(self, issue_manager, embeddings_with_id):
        """Sequential first column: score 0 and column index 0 reported."""
        np.random.seed(SEED)
        issue_manager.find_issues(features=embeddings_with_id)

        summary = issue_manager.summary
        info = issue_manager.info

        assert summary["issue_type"][0] == "identifier_column"
        assert summary["score"][0] == 0
        assert info.get("identifier_column") == [0]

0 comments on commit ce2fb90

Please sign in to comment.