Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Noam/bugfix/nlp fixes various #2501

Merged
merged 9 commits into from
May 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ jobs:
with:
requirements: 'requirements-all.txt'
fail: 'Copyleft,Other,Error'
exclude: '(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2022\.12\.7|tqdm.*4\.65\.0|webencodings.*0\.5\.1|torch.*1\.10\.2.*|torchvision.*0\.11\.3.*|terminado.*0\.15\.0.*|urllib3.*1\.26\.11.*|imageio.*2\.20\.0.*|jsonschema.*4\.8\.0.*|qudida.*0\.0\.4)'
exclude: '(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2023\.5\.7|tqdm.*4\.65\.0|webencodings.*0\.5\.1|torch.*1\.10\.2.*|torchvision.*0\.11\.3.*|terminado.*0\.15\.0.*|urllib3.*1\.26\.11.*|imageio.*2\.20\.0.*|jsonschema.*4\.8\.0.*|qudida.*0\.0\.4)'
# pyzmq is Revised BSD https://github.com/zeromq/pyzmq/blob/main/examples/LICENSE
# debugpy is MIT https://github.com/microsoft/debugpy/blob/main/LICENSE
# certifi is MPL-2.0 https://github.com/certifi/python-certifi/blob/master/LICENSE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
return CheckResult(
value=result_value,
display=[
f'List of ignored special characters: {self.special_characters_whitelist}',
f'List of ignored special characters: {list(self.special_characters_whitelist)}',
message,
display_table.iloc[:self.n_most_common]
]
Expand Down
4 changes: 3 additions & 1 deletion deepchecks/nlp/checks/data_integrity/unknown_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,9 @@ def create_pie_chart(self, all_unknown_words_counter, total_words):
fig.update_layout(title=f'Words containing Unknown Tokens - {self.tokenizer.name_or_path} Tokenizer<br>'
f'({format_percent(sum(percentages) / 100.)} of all words)',
title_x=0.5,
legend_title='Words with Unknown Tokens')
title_y=0.95,
legend_title='Words with Unknown Tokens',
margin=dict(l=0, r=0, t=100, b=0))

return fig

Expand Down
8 changes: 5 additions & 3 deletions deepchecks/nlp/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,11 @@ def __init__(self,
if set(train.get_original_text_indexes()) & set(test.get_original_text_indexes()):
train._original_text_index = np.asarray([f'train-{i}' for i in train.get_original_text_indexes()])
test._original_text_index = np.asarray([f'test-{i}' for i in test.get_original_text_indexes()])
get_logger().warning('train and test datasets have common index - adding "train"/"test"'
' prefixes. To avoid that provide datasets with no common indexes '
'or pass the model object instead of the predictions.')
# # This is commented out as currently text data indices are range(len(data))
# # TODO: Uncomment when text data indices are no longer simply range(len(data))
# get_logger().warning('train and test datasets have common index - adding "train"/"test"'
# ' prefixes. To avoid that provide datasets with no common indexes '
# 'or pass the model object instead of the predictions.')

for dataset, y_pred, y_proba in zip([train, test],
[y_pred_train, y_pred_test],
Expand Down
2 changes: 1 addition & 1 deletion deepchecks/nlp/utils/weak_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def get_relevant_data_table(text_data: TextData, data_type: str, columns: Union[

if n_top_features is not None and n_top_features < features.shape[1]:
noamzbr marked this conversation as resolved.
Show resolved Hide resolved
_warn_n_top_columns(data_type, n_top_features)
features = features.iloc[:, np.random.choice(features.shape[1], n_top_features)]
features = features.iloc[:, np.random.choice(features.shape[1], n_top_features, replace=False)]

return features, cat_features

Expand Down
7 changes: 5 additions & 2 deletions deepchecks/utils/abstracts/weak_segment_abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,10 @@ def _weak_segments_search(self, data: pd.DataFrame, score_per_sample: pd.Series,

weak_segments = pd.DataFrame(
columns=[score_title, 'Feature1', 'Feature1 Range', 'Feature2', 'Feature2 Range', '% of Data'])
for i in range(min(len(feature_rank_for_search), self.n_top_features)):
for j in range(i + 1, min(len(feature_rank_for_search), self.n_top_features)):
n_features = min(len(feature_rank_for_search), self.n_top_features) if self.n_top_features is not None\
else len(feature_rank_for_search)
for i in range(n_features):
for j in range(i + 1, n_features):
feature1, feature2 = feature_rank_for_search[[i, j]]
weak_segment_score, weak_segment_filter = self._find_weak_segment(data, [feature1, feature2],
score_per_sample, label_col,
Expand Down Expand Up @@ -289,6 +291,7 @@ def _format_partition_vec_for_display(self, partition_vec: np.array, feature_nam

def _generate_check_result_value(self, weak_segments_df, cat_features: List[str], avg_score: float):
"""Generate a uniform format check result value for the different WeakSegmentsPerformance checks."""
pd.set_option('mode.chained_assignment', None)
weak_segments_output = weak_segments_df.copy()
for idx, segment in weak_segments_df.iterrows():
for feature in ['Feature1', 'Feature2']:
Expand Down
2 changes: 1 addition & 1 deletion requirements/nlp-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
seqeval>=1.0.0
nltk>=3.4.0,<=3.6.7
textblob==0.17.1
textblob>=0.17.1
3 changes: 2 additions & 1 deletion spelling-allowlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -148,4 +148,5 @@ tokenizer
nltk
Tokenize
spacy
tokenizers
tokenizers
Uncomment