You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Hi and thank you for creating the package!
I am exploring its applicability on a data set and I run into an error.
The data I am using comes from an ERP system, where users can enter whatever they want, so it sometimes contains erroneous values.
I found that when every word in the string being compared has a length of 1, I get an error from the MinHashSampler when trying to fit.
I reproduced the error on an artificial dataset; please see below:
If I remove the ['k i'] row it runs without errors.
The error occurs when the MinHashSampler is called but I am not sure exactly what the function does and how to correct that.
I suppose I could perform a check before calling the function (for example, computing the word lengths in each row and omitting the rows where every word has a length of 1), but I wanted to ask whether you have any suggestions or recipes.
Hi and thank you for creating the package!
I am exploring its applicability on a data set and I run into an error.
The data I am using comes from an ERP system, where users can enter whatever they want, so it sometimes contains erroneous values.
I found that when every word in the string being compared has a length of 1, I get an error from the MinHashSampler when trying to fit.
I reproduced the error on an artificial dataset; please see below:
If I remove the ['k i'] row it runs without errors.
The error occurs when the MinHashSampler is called but I am not sure exactly what the function does and how to correct that.
I suppose I could perform a check before calling the function (for example, computing the word lengths in each row and omitting the rows where every word has a length of 1), but I wanted to ask whether you have any suggestions or recipes.
Thank you very much in advance,
ValueError Traceback (most recent call last)
File ~/miniconda2/envs/py395dd/lib/python3.9/site-packages/deduplipy/deduplicator/deduplicator.py:136, in Deduplicator.fit(self, X, n_samples)
124 def fit(self, X: pd.DataFrame, n_samples: int = 10_000) -> 'Deduplicator':
125 """
126 Fit the deduplicator instance
127
(...)
134
135 """
--> 136 pairs_table = self._create_pairs_table(X, n_samples)
137 similarities = self._calculate_string_similarities(pairs_table)
138 self.myActiveLearner.fit(similarities)
File ~/miniconda2/envs/py395dd/lib/python3.9/site-packages/deduplipy/deduplicator/deduplicator.py:105, in Deduplicator._create_pairs_table(self, X, n_samples)
93 """
94 Create sample of pairs
95
(...)
102
103 """
104 n_samples_minhash = n_samples // 2
--> 105 minhash_pairs = MinHashSampler(self.col_names).sample(X, n_samples_minhash)
106 # the number of minhash samples can be (much) smaller than n_samples//2, in such case take more random pairs:
107 n_samples_naive = n_samples - len(minhash_pairs)
File ~/miniconda2/envs/py395dd/lib/python3.9/site-packages/deduplipy/sampling/minhash_sampling.py:128, in MinHashSampler.sample(self, X, n_samples, threshold)
114 def sample(self, X: pd.DataFrame, n_samples: int, threshold: float = 0.2) -> pd.DataFrame:
115 """
116 Method to draw sample of pairs of size `n_samples` from dataframe X. Note that `n_samples` cannot be returned if
117 the number of pairs above the threshold is too low.
(...)
126
127 """
--> 128 minhash_pairs = self._create_minhash_pairs(X, threshold)
130 stratified_sample = self._get_stratified_sample(minhash_pairs, n_samples)
132 non_stratified_sample = self._get_non_stratified_sample(minhash_pairs, stratified_sample, n_samples)
File ~/miniconda2/envs/py395dd/lib/python3.9/site-packages/deduplipy/sampling/minhash_sampling.py:49, in MinHashSampler._create_minhash_pairs(self, X, threshold)
47 minhash_pairs = pd.DataFrame()
48 for col in self.col_names:
---> 49 minhash_result = self.MinHasher.fit_predict(df, col)
51 # add other columns than the one used for minhashing
52 minhash_result = (minhash_result
53 .merge(df.drop(columns=[col]), left_on='row_number_1', right_on='row_number')
54 .drop(columns=['row_number']))
File ~/miniconda2/envs/py395dd/lib/python3.9/site-packages/pyminhash/pyminhash.py:154, in MinHash.fit_predict(self, df, col_name)
152 df_['row_number'] = np.arange(len(df_))
153 df_ = self.sparse_vectorize(df, col_name)
--> 154 df_ = self._create_minhash_signatures(df)
155 return self.create_pairs(df, col_name)
File ~/miniconda2/envs/py395dd/lib/python3.9/site-packages/pyminhash/pyminhash.py:88, in MinHash._create_minhash_signatures(self, df)
76 def _create_minhash_signatures(self, df: pd.DataFrame) -> pd.DataFrame:
77 """
78 Apply minhashing to the column `sparse_vector` in Pandas dataframe `df` in the new column `minhash_signature`.
79 In addition, one column (e.g.: 'hash_{0}') per hash table is created.
(...)
86
87 """
---> 88 df['minhash_signature'] = df['sparse_vector'].apply(self._create_minhash)
89 # the following involved way of creating 'hash_' columns prevents efficiency warnings
90 hash_df = df['minhash_signature'].apply(pd.Series)
File ~/miniconda2/envs/py395dd/lib/python3.9/site-packages/pandas/core/series.py:4433, in Series.apply(self, func, convert_dtype, args, **kwargs)
4323 def apply(
4324 self,
4325 func: AggFuncType,
(...)
4328 **kwargs,
4329 ) -> DataFrame | Series:
4330 """
4331 Invoke function on values of Series.
4332
(...)
4431 dtype: float64
4432 """
-> 4433 return SeriesApply(self, func, convert_dtype, args, kwargs).apply()
File ~/miniconda2/envs/py395dd/lib/python3.9/site-packages/pandas/core/apply.py:1082, in SeriesApply.apply(self)
1078 if isinstance(self.f, str):
1079 # if we are a string, try to dispatch
1080 return self.apply_str()
-> 1082 return self.apply_standard()
File ~/miniconda2/envs/py395dd/lib/python3.9/site-packages/pandas/core/apply.py:1137, in SeriesApply.apply_standard(self)
1131 values = obj.astype(object)._values
1132 # error: Argument 2 to "map_infer" has incompatible type
1133 # "Union[Callable[..., Any], str, List[Union[Callable[..., Any], str]],
1134 # Dict[Hashable, Union[Union[Callable[..., Any], str],
1135 # List[Union[Callable[..., Any], str]]]]]"; expected
1136 # "Callable[[Any], Any]"
-> 1137 mapped = lib.map_infer(
1138 values,
1139 f, # type: ignore[arg-type]
1140 convert=self.convert_dtype,
1141 )
1143 if len(mapped) and isinstance(mapped[0], ABCSeries):
1144 # GH#43986 Need to do list(mapped) in order to get treated as nested
1145 # See also GH#25959 regarding EA support
1146 return obj._constructor_expanddim(list(mapped), index=obj.index)
File ~/miniconda2/envs/py395dd/lib/python3.9/site-packages/pandas/_libs/lib.pyx:2870, in pandas._libs.lib.map_infer()
File ~/miniconda2/envs/py395dd/lib/python3.9/site-packages/pyminhash/pyminhash.py:73, in MinHash._create_minhash(self, doc)
71 hashes += self.b
72 hashes %= self.next_prime
---> 73 minhashes = hashes.min(axis=0)
74 return minhashes
File ~/miniconda2/envs/py395dd/lib/python3.9/site-packages/numpy/core/_methods.py:44, in _amin(a, axis, out, keepdims, initial, where)
42 def _amin(a, axis=None, out=None, keepdims=False,
43 initial=_NoValue, where=True):
---> 44 return umr_minimum(a, axis, None, out, keepdims, initial, where)
ValueError: zero-size array to reduction operation minimum which has no identity
The text was updated successfully, but these errors were encountered: