Noam/bugfix/nlp various fixes
* don't lock down version

* avoid warning that is now always raised

* fix special char print

* avoid various edge cases in weak segments

* make plot prettier when there are many words

* fix property sampling

* update certifi allowed license
noamzbr committed May 8, 2023
1 parent 0e011b0 commit f9bf86d
Showing 8 changed files with 19 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -128,7 +128,7 @@ jobs:
with:
requirements: 'requirements-all.txt'
fail: 'Copyleft,Other,Error'
-exclude: '(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2022\.12\.7|tqdm.*4\.65\.0|webencodings.*0\.5\.1|torch.*1\.10\.2.*|torchvision.*0\.11\.3.*|terminado.*0\.15\.0.*|urllib3.*1\.26\.11.*|imageio.*2\.20\.0.*|jsonschema.*4\.8\.0.*|qudida.*0\.0\.4)'
+exclude: '(pyzmq.*23\.2\.1|debugpy.*1\.6\.7|certifi.*2023\.5\.7|tqdm.*4\.65\.0|webencodings.*0\.5\.1|torch.*1\.10\.2.*|torchvision.*0\.11\.3.*|terminado.*0\.15\.0.*|urllib3.*1\.26\.11.*|imageio.*2\.20\.0.*|jsonschema.*4\.8\.0.*|qudida.*0\.0\.4)'
# pyzmq is Revised BSD https://github.com/zeromq/pyzmq/blob/main/examples/LICENSE
# debugpy is MIT https://github.com/microsoft/debugpy/blob/main/LICENSE
# certifi is MPL-2.0 https://github.com/certifi/python-certifi/blob/master/LICENSE
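The exclude list is a regular expression matched against each dependency, so bumping certifi means the pinned pattern has to move with it. A minimal sketch of the matching, assuming the license action compares a "name version" string against the pattern:

    import re

    exclude = r'certifi.*2023\.5\.7'
    # the action skips any dependency string the pattern matches
    print(bool(re.search(exclude, 'certifi 2023.5.7')))   # True  -> excluded from the license check
    print(bool(re.search(exclude, 'certifi 2022.12.7')))  # False -> the old pin no longer matches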
2 changes: 1 addition & 1 deletion deepchecks/nlp/checks/data_integrity/special_characters.py
@@ -143,7 +143,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
return CheckResult(
value=result_value,
display=[
-f'List of ignored special characters: {self.special_characters_whitelist}',
+f'List of ignored special characters: {list(self.special_characters_whitelist)}',
message,
display_table.iloc[:self.n_most_common]
]
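Wrapping the whitelist in list() makes each ignored character visible in the display. A minimal sketch, assuming the whitelist is an iterable of single characters (the sample value below is illustrative):

    whitelist = ' \n\t%$'  # hypothetical whitelist of ignored characters
    print(f'List of ignored special characters: {whitelist}')        # whitespace characters render invisibly
    print(f'List of ignored special characters: {list(whitelist)}')  # [' ', '\n', '\t', '%', '$'] -- every character shown escaped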
4 changes: 3 additions & 1 deletion deepchecks/nlp/checks/data_integrity/unknown_tokens.py
@@ -186,7 +186,9 @@ def create_pie_chart(self, all_unknown_words_counter, total_words):
fig.update_layout(title=f'Words containing Unknown Tokens - {self.tokenizer.name_or_path} Tokenizer<br>'
f'({format_percent(sum(percentages) / 100.)} of all words)',
title_x=0.5,
-legend_title='Words with Unknown Tokens')
+title_y=0.95,
+legend_title='Words with Unknown Tokens',
+margin=dict(l=0, r=0, t=100, b=0))

return fig

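title_y pins the title near the top of the figure and the explicit margin reserves room for it, which keeps the pie readable when the legend holds many words. A standalone sketch of the same layout call (sample data is made up):

    import plotly.graph_objects as go

    fig = go.Figure(data=[go.Pie(labels=['word-a', 'word-b', 'word-c'], values=[50, 30, 20])])
    fig.update_layout(
        title='Words containing Unknown Tokens',
        title_x=0.5,   # center the title horizontally
        title_y=0.95,  # pin it near the top edge
        legend_title='Words with Unknown Tokens',
        margin=dict(l=0, r=0, t=100, b=0),  # no side margins, 100px on top for the title
    )
    fig.show()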
8 changes: 5 additions & 3 deletions deepchecks/nlp/context.py
@@ -94,9 +94,11 @@ def __init__(self,
if set(train.get_original_text_indexes()) & set(test.get_original_text_indexes()):
train._original_text_index = np.asarray([f'train-{i}' for i in train.get_original_text_indexes()])
test._original_text_index = np.asarray([f'test-{i}' for i in test.get_original_text_indexes()])
-get_logger().warning('train and test datasets have common index - adding "train"/"test"'
-' prefixes. To avoid that provide datasets with no common indexes '
-'or pass the model object instead of the predictions.')
+# # This is commented out as currently text data indices are len(range(len(data)))
+# # TODO: Uncomment when text data indices are not len(range(len(data)))
+# get_logger().warning('train and test datasets have common index - adding "train"/"test"'
+# ' prefixes. To avoid that provide datasets with no common indexes '
+# 'or pass the model object instead of the predictions.')

for dataset, y_pred, y_proba in zip([train, test],
[y_pred_train, y_pred_test],
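The index rewrite still runs, but the warning is disabled: TextData indices are currently always range(len(data)), so the overlap (and the warning) would fire on every run. A sketch of what the prefixing does:

    import numpy as np

    train_idx = np.asarray([0, 1, 2])
    test_idx = np.asarray([0, 1])  # shares 0 and 1 with train

    if set(train_idx) & set(test_idx):
        train_idx = np.asarray([f'train-{i}' for i in train_idx])
        test_idx = np.asarray([f'test-{i}' for i in test_idx])

    print(train_idx)  # ['train-0' 'train-1' 'train-2']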
2 changes: 1 addition & 1 deletion deepchecks/nlp/utils/weak_segments.py
@@ -39,7 +39,7 @@ def get_relevant_data_table(text_data: TextData, data_type: str, columns: Union[

if n_top_features is not None and n_top_features < features.shape[1]:
_warn_n_top_columns(data_type, n_top_features)
-features = features.iloc[:, np.random.choice(features.shape[1], n_top_features)]
+features = features.iloc[:, np.random.choice(features.shape[1], n_top_features, replace=False)]

return features, cat_features

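Without replace=False, np.random.choice samples column indices with replacement, so the same feature column could be picked more than once and the sampled table would carry duplicate columns. A quick illustration:

    import numpy as np

    print(np.random.choice(5, 3))                 # with replacement: may repeat an index, e.g. [3 3 1]
    print(np.random.choice(5, 3, replace=False))  # without replacement: always 3 distinct indices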
7 changes: 5 additions & 2 deletions deepchecks/utils/abstracts/weak_segment_abstract.py
@@ -179,8 +179,10 @@ def _weak_segments_search(self, data: pd.DataFrame, score_per_sample: pd.Series,

weak_segments = pd.DataFrame(
columns=[score_title, 'Feature1', 'Feature1 Range', 'Feature2', 'Feature2 Range', '% of Data'])
-for i in range(min(len(feature_rank_for_search), self.n_top_features)):
-for j in range(i + 1, min(len(feature_rank_for_search), self.n_top_features)):
+n_features = min(len(feature_rank_for_search), self.n_top_features) if self.n_top_features is not None\
+else len(feature_rank_for_search)
+for i in range(n_features):
+for j in range(i + 1, n_features):
feature1, feature2 = feature_rank_for_search[[i, j]]
weak_segment_score, weak_segment_filter = self._find_weak_segment(data, [feature1, feature2],
score_per_sample, label_col,
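n_top_features=None is now treated as "search over every ranked feature"; previously min(len(...), None) raised a TypeError on Python 3. A minimal sketch of the guard:

    feature_rank_for_search = ['feat_a', 'feat_b', 'feat_c']  # illustrative names
    n_top_features = None

    n_features = (min(len(feature_rank_for_search), n_top_features)
                  if n_top_features is not None else len(feature_rank_for_search))
    print(n_features)  # 3 -- all ranked features enter the pairwise search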
@@ -289,6 +291,7 @@ def _format_partition_vec_for_display(self, partition_vec: np.array, feature_nam

def _generate_check_result_value(self, weak_segments_df, cat_features: List[str], avg_score: float):
"""Generate a uniform format check result value for the different WeakSegmentsPerformance checks."""
+pd.set_option('mode.chained_assignment', None)
weak_segments_output = weak_segments_df.copy()
for idx, segment in weak_segments_df.iterrows():
for feature in ['Feature1', 'Feature2']:
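Disabling pandas' chained-assignment mode suppresses SettingWithCopyWarning noise while the segment rows are rewritten for display. The kind of pattern that would otherwise warn (frame contents are illustrative):

    import pandas as pd

    pd.set_option('mode.chained_assignment', None)  # suppress SettingWithCopyWarning

    df = pd.DataFrame({'Feature1': ['a', 'b'], '% of Data': [40.0, 60.0]})
    subset = df[df['% of Data'] > 50]
    subset['Feature1'] = 'b (renamed)'  # writes to a slice -- would normally warn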
2 changes: 1 addition & 1 deletion requirements/nlp-requirements.txt
@@ -1,3 +1,3 @@
seqeval>=1.0.0
nltk>=3.4.0,<=3.6.7
-textblob==0.17.1
+textblob>=0.17.1
3 changes: 2 additions & 1 deletion spelling-allowlist.txt
@@ -148,4 +148,5 @@ tokenizer
nltk
Tokenize
spacy
-tokenizers
+tokenizers
+Uncomment
