Noam/bugfix/nlp various fixes
* don't lock down version

* avoid warning that is now always raised

* fix special char print

* avoid various edge cases in weak segments

* make plot prettier when there are many words

* fix property sampling

* update certifi allowed license
noamzbr committed May 8, 2023
1 parent 0e011b0 commit f9bf86d
Showing 8 changed files with 19 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -128,7 +128,7 @@ jobs:
with:
requirements: 'requirements-all.txt'
fail: 'Copyleft,Other,Error'
-exclude: '(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2022\.12\.7|tqdm.*4\.65\.0|webencodings.*0\.5\.1|torch.*1\.10\.2.*|torchvision.*0\.11\.3.*|terminado.*0\.15\.0.*|urllib3.*1\.26\.11.*|imageio.*2\.20\.0.*|jsonschema.*4\.8\.0.*|qudida.*0\.0\.4)'
+exclude: '(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2023\.5\.7|tqdm.*4\.65\.0|webencodings.*0\.5\.1|torch.*1\.10\.2.*|torchvision.*0\.11\.3.*|terminado.*0\.15\.0.*|urllib3.*1\.26\.11.*|imageio.*2\.20\.0.*|jsonschema.*4\.8\.0.*|qudida.*0\.0\.4)'
# pyzmq is Revised BSD https://github.com/zeromq/pyzmq/blob/main/examples/LICENSE
# debugpy is MIT https://github.com/microsoft/debugpy/blob/main/LICENSE
# certifi is MPL-2.0 https://github.com/certifi/python-certifi/blob/master/LICENSE
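The exclude list is a regular expression matched against each dependency, so bumping certifi means the pinned pattern has to move with it. A minimal sketch of the matching, assuming the license action compares a "name version" string against the pattern:

    import re

    exclude = r'certifi.*2023\.5\.7'
    # the action skips any dependency string the pattern matches
    print(bool(re.search(exclude, 'certifi 2023.5.7')))   # True  -> excluded from the license check
    print(bool(re.search(exclude, 'certifi 2022.12.7')))  # False -> the old pin no longer matches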
2 changes: 1 addition & 1 deletion deepchecks/nlp/checks/data_integrity/special_characters.py
@@ -143,7 +143,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
return CheckResult(
value=result_value,
display=[
-f'List of ignored special characters: {self.special_characters_whitelist}',
+f'List of ignored special characters: {list(self.special_characters_whitelist)}',
message,
display_table.iloc[:self.n_most_common]
]
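Wrapping the whitelist in list() makes each ignored character visible in the display. A minimal sketch, assuming the whitelist is an iterable of single characters (the sample value below is illustrative):

    whitelist = ' \n\t%$'  # hypothetical whitelist of ignored characters
    print(f'List of ignored special characters: {whitelist}')        # whitespace characters render invisibly
    print(f'List of ignored special characters: {list(whitelist)}')  # [' ', '\n', '\t', '%', '$'] -- every character shown escaped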
4 changes: 3 additions & 1 deletion deepchecks/nlp/checks/data_integrity/unknown_tokens.py
@@ -186,7 +186,9 @@ def create_pie_chart(self, all_unknown_words_counter, total_words):
fig.update_layout(title=f'Words containing Unknown Tokens - {self.tokenizer.name_or_path} Tokenizer<br>'
f'({format_percent(sum(percentages) / 100.)} of all words)',
title_x=0.5,
-legend_title='Words with Unknown Tokens')
+title_y=0.95,
+legend_title='Words with Unknown Tokens',
+margin=dict(l=0, r=0, t=100, b=0))

return fig

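title_y pins the title near the top of the figure and the explicit margin reserves room for it, which keeps the pie readable when the legend holds many words. A standalone sketch of the same layout call (sample data is made up):

    import plotly.graph_objects as go

    fig = go.Figure(data=[go.Pie(labels=['word-a', 'word-b', 'word-c'], values=[50, 30, 20])])
    fig.update_layout(
        title='Words containing Unknown Tokens',
        title_x=0.5,   # center the title horizontally
        title_y=0.95,  # pin it near the top edge
        legend_title='Words with Unknown Tokens',
        margin=dict(l=0, r=0, t=100, b=0),  # no side margins, 100px on top for the title
    )
    fig.show()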
8 changes: 5 additions & 3 deletions deepchecks/nlp/context.py
@@ -94,9 +94,11 @@ def __init__(self,
if set(train.get_original_text_indexes()) & set(test.get_original_text_indexes()):
train._original_text_index = np.asarray([f'train-{i}' for i in train.get_original_text_indexes()])
test._original_text_index = np.asarray([f'test-{i}' for i in test.get_original_text_indexes()])
-get_logger().warning('train and test datasets have common index - adding "train"/"test"'
-' prefixes. To avoid that provide datasets with no common indexes '
-'or pass the model object instead of the predictions.')
+# # This is commented out as currently text data indices are len(range(len(data)))
+# # TODO: Uncomment when text data indices are not len(range(len(data)))
+# get_logger().warning('train and test datasets have common index - adding "train"/"test"'
+# ' prefixes. To avoid that provide datasets with no common indexes '
+# 'or pass the model object instead of the predictions.')

for dataset, y_pred, y_proba in zip([train, test],
[y_pred_train, y_pred_test],
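The index rewrite still runs, but the warning is disabled: TextData indices are currently always range(len(data)), so the overlap (and the warning) would fire on every run. A sketch of what the prefixing does:

    import numpy as np

    train_idx = np.asarray([0, 1, 2])
    test_idx = np.asarray([0, 1])  # shares 0 and 1 with train

    if set(train_idx) & set(test_idx):
        train_idx = np.asarray([f'train-{i}' for i in train_idx])
        test_idx = np.asarray([f'test-{i}' for i in test_idx])

    print(train_idx)  # ['train-0' 'train-1' 'train-2']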
2 changes: 1 addition & 1 deletion deepchecks/nlp/utils/weak_segments.py
@@ -39,7 +39,7 @@ def get_relevant_data_table(text_data: TextData, data_type: str, columns: Union[

if n_top_features is not None and n_top_features < features.shape[1]:
_warn_n_top_columns(data_type, n_top_features)
-features = features.iloc[:, np.random.choice(features.shape[1], n_top_features)]
+features = features.iloc[:, np.random.choice(features.shape[1], n_top_features, replace=False)]

return features, cat_features

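Without replace=False, np.random.choice samples column indices with replacement, so the same feature column could be picked more than once and the sampled table would carry duplicate columns. A quick illustration:

    import numpy as np

    print(np.random.choice(5, 3))                 # with replacement: may repeat an index, e.g. [3 3 1]
    print(np.random.choice(5, 3, replace=False))  # without replacement: always 3 distinct indices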
7 changes: 5 additions & 2 deletions deepchecks/utils/abstracts/weak_segment_abstract.py
@@ -179,8 +179,10 @@ def _weak_segments_search(self, data: pd.DataFrame, score_per_sample: pd.Series,

weak_segments = pd.DataFrame(
columns=[score_title, 'Feature1', 'Feature1 Range', 'Feature2', 'Feature2 Range', '% of Data'])
-for i in range(min(len(feature_rank_for_search), self.n_top_features)):
-for j in range(i + 1, min(len(feature_rank_for_search), self.n_top_features)):
+n_features = min(len(feature_rank_for_search), self.n_top_features) if self.n_top_features is not None\
+else len(feature_rank_for_search)
+for i in range(n_features):
+for j in range(i + 1, n_features):
feature1, feature2 = feature_rank_for_search[[i, j]]
weak_segment_score, weak_segment_filter = self._find_weak_segment(data, [feature1, feature2],
score_per_sample, label_col,
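n_top_features=None is now treated as "search over every ranked feature"; previously min(len(...), None) raised a TypeError on Python 3. A minimal sketch of the guard:

    feature_rank_for_search = ['feat_a', 'feat_b', 'feat_c']  # illustrative names
    n_top_features = None

    n_features = (min(len(feature_rank_for_search), n_top_features)
                  if n_top_features is not None else len(feature_rank_for_search))
    print(n_features)  # 3 -- all ranked features enter the pairwise search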
@@ -289,6 +291,7 @@ def _format_partition_vec_for_display(self, partition_vec: np.array, feature_nam

def _generate_check_result_value(self, weak_segments_df, cat_features: List[str], avg_score: float):
"""Generate a uniform format check result value for the different WeakSegmentsPerformance checks."""
+pd.set_option('mode.chained_assignment', None)
weak_segments_output = weak_segments_df.copy()
for idx, segment in weak_segments_df.iterrows():
for feature in ['Feature1', 'Feature2']:
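Disabling pandas' chained-assignment mode suppresses SettingWithCopyWarning noise while the segment rows are rewritten for display. The kind of pattern that would otherwise warn (frame contents are illustrative):

    import pandas as pd

    pd.set_option('mode.chained_assignment', None)  # suppress SettingWithCopyWarning

    df = pd.DataFrame({'Feature1': ['a', 'b'], '% of Data': [40.0, 60.0]})
    subset = df[df['% of Data'] > 50]
    subset['Feature1'] = 'b (renamed)'  # writes to a slice -- would normally warn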
2 changes: 1 addition & 1 deletion requirements/nlp-requirements.txt
@@ -1,3 +1,3 @@
seqeval>=1.0.0
nltk>=3.4.0,<=3.6.7
-textblob==0.17.1
+textblob>=0.17.1
3 changes: 2 additions & 1 deletion spelling-allowlist.txt
@@ -148,4 +148,5 @@ tokenizer
nltk
Tokenize
spacy
-tokenizers
+tokenizers
+Uncomment
