/
dataframes.py
271 lines (222 loc) · 8.85 KB
/
dataframes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Contain functions for handling dataframes in checks."""
import typing as t
import numpy as np
import pandas as pd
from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype, is_numeric_dtype
from deepchecks.core.errors import DeepchecksValueError
from deepchecks.utils.type_inference import infer_categorical_features
from deepchecks.utils.typing import Hashable
from deepchecks.utils.validation import ensure_hashable_or_mutable_sequence
__all__ = ['validate_columns_exist', 'select_from_dataframe', 'un_numpy', 'generalized_corrwith',
'floatify_dataframe', 'floatify_series', 'default_fill_na_per_column_type',
'is_float_column', 'default_fill_na_series',
'cast_categorical_to_object_dtype', 'hide_index_for_display']
def default_fill_na_per_column_type(df: pd.DataFrame, cat_features: t.Optional[t.Union[pd.Series, t.List]]) \
        -> pd.DataFrame:
    """Fill NaN values in each column of the dataframe according to the column type.

    Parameters
    ----------
    df : pd.DataFrame
        dataframe whose columns should have their NaN values filled
    cat_features : t.Optional[t.Union[pd.Series, t.List]]
        collection of categorical feature names; if None, they are inferred from the dataframe

    Returns
    -------
    pd.DataFrame
        dataframe containing only the columns for which a fill strategy exists
        (columns where ``default_fill_na_series`` returned None are dropped).
    """
    if cat_features is None:
        cat_features = infer_categorical_features(df)
    result = {}
    # Use a context manager instead of pd.set_option so the global
    # 'mode.chained_assignment' setting is restored once we are done,
    # rather than being silently disabled for the whole process.
    with pd.option_context('mode.chained_assignment', None):
        for col_name in df.columns:
            modified_col = default_fill_na_series(df[col_name], col_name in cat_features)
            if modified_col is not None:
                result[col_name] = modified_col
    return pd.DataFrame(result, index=df.index)
def default_fill_na_series(col: pd.Series, is_cat_column: t.Optional[bool] = None) -> t.Optional[pd.Series]:
    """Fill NaN values in a single series based on its type.

    Categorical columns are filled with the string 'None' (unless that value
    already appears in the data), numeric columns are cast to float64 (NaNs are
    kept as float NaN), and any other column falls back to its most common
    value. Returns None when no fill strategy applies.
    """
    if is_cat_column:
        as_object = col.astype('object')
        # Only safe to use 'None' as the filler if it is not already a real value.
        if 'None' not in as_object.dropna().unique():
            return as_object.fillna('None')
    if is_numeric_dtype(col):
        return col.astype('float64').fillna(np.nan)
    most_common = col.mode()
    if isinstance(most_common, pd.Series) and len(most_common) > 0:
        return col.fillna(most_common[0])
    return None
def floatify_dataframe(df: pd.DataFrame):
    """Return a dataframe where all the int columns are converted to floats.

    Parameters
    ----------
    df : pd.DataFrame
        dataframe to convert

    Returns
    -------
    pd.DataFrame
        the dataframe where all the int columns are converted to floats
    """
    # Build the target dtype mapping: integer columns become float, the rest
    # keep their current dtype so astype() leaves them untouched.
    dtype_dict = {
        col_name: 'float' if is_integer_dtype(dtype) else dtype
        for col_name, dtype in df.dtypes.items()
    }
    return df.astype(dtype_dict)
def floatify_series(ser: pd.Series):
    """Return a series where an integer dtype is converted to float.

    Parameters
    ----------
    ser : pd.Series
        series to convert

    Returns
    -------
    pd.Series
        the converted series (unchanged when the dtype is not integer)
    """
    if is_integer_dtype(ser):
        ser = ser.astype(float)
    return ser
def un_numpy(val):
    """Convert numpy value to native value.

    Parameters
    ----------
    val :
        The value to convert.

    Returns
    -------
    returns the numpy value in a native type.
    """
    # np.str_ is a subclass of np.generic, but np.isnan() rejects strings
    # with "TypeError: ufunc 'isnan' not supported for the input types...",
    # so strings must be handled before the generic branch below.
    if isinstance(val, np.str_):
        return val.item()
    if isinstance(val, np.generic):
        # NaN has no meaningful native scalar; map it to None.
        return None if np.isnan(val) else val.item()
    if isinstance(val, np.ndarray):
        return val.tolist()
    return val
def validate_columns_exist(
    df: pd.DataFrame,
    columns: t.Union[Hashable, t.List[Hashable]],
    raise_error: bool = True
) -> bool:
    """Validate given columns exist in dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        dataframe to inspect
    columns : t.Union[Hashable, t.List[Hashable]]
        Column names to check
    raise_error : bool, default: True
        whether to raise an error if some column is not present in the dataframe or not

    Returns
    -------
    bool
        True when every requested column is present, otherwise False
        (only reachable with raise_error=False).

    Raises
    ------
    DeepchecksValueError
        If some of the columns do not exist within provided dataframe.
        If receives empty list of 'columns'.
        If not all elements within 'columns' list are hashable.
    """
    empty_message = 'columns - expected to receive not empty list of hashable values!'
    columns = ensure_hashable_or_mutable_sequence(columns, message=empty_message)
    if len(columns) == 0:
        if raise_error:
            raise DeepchecksValueError(empty_message)
        return False
    missing = set(columns) - set(df.columns)
    if missing and raise_error:
        missing_repr = ','.join(map(str, missing))
        raise DeepchecksValueError(f'Given columns do not exist in dataset: {missing_repr}')
    return not missing
def select_from_dataframe(
    df: pd.DataFrame,
    columns: t.Union[Hashable, t.List[Hashable], None] = None,
    ignore_columns: t.Union[Hashable, t.List[Hashable], None] = None
) -> pd.DataFrame:
    """Filter DataFrame columns by given params.

    Parameters
    ----------
    df : pd.DataFrame
    columns : t.Union[Hashable, t.List[Hashable]] , default: None
        Column names to keep.
    ignore_columns : t.Union[Hashable, t.List[Hashable]] , default: None
        Column names to drop.

    Returns
    -------
    pandas.DataFrame
        returns horizontally filtered dataframe

    Raises
    ------
    DeepchecksValueError
        If some columns do not exist within provided dataframe;
        If 'columns' and 'ignore_columns' arguments are both not 'None'.
    """
    # The two filters are mutually exclusive - at most one may be given.
    if columns is not None and ignore_columns is not None:
        raise DeepchecksValueError(
            'Cannot receive both parameters "columns" and "ignore", '
            'only one must be used at most'
        )
    if columns is not None:
        keep = ensure_hashable_or_mutable_sequence(columns)
        validate_columns_exist(df, keep)
        return t.cast(pd.DataFrame, df[keep])
    if ignore_columns is not None:
        drop = ensure_hashable_or_mutable_sequence(ignore_columns)
        validate_columns_exist(df, drop)
        return df.drop(labels=drop, axis='columns')
    return df
def generalized_corrwith(x1: pd.DataFrame, x2: pd.DataFrame, method: t.Callable):
    """
    Compute pairwise correlation.

    Pairwise correlation is computed between columns of one DataFrame with columns of another DataFrame.
    Pandas' method corrwith only applies when both dataframes have the same column names,
    this generalized method applies to any two Dataframes with the same number of rows, regardless of the column names.

    Parameters
    ----------
    x1: DataFrame
        Left data frame to compute correlations.
    x2: Dataframe
        Right data frame to compute correlations.
    method: Callable
        Method of correlation. callable with input two 1d ndarrays and returning a float.

    Returns
    -------
    DataFrame
        Pairwise correlations, the index matches the columns of x1 and the columns match the columns of x2.
    """
    # For each column of x2, correlate it against every column of x1; apply()
    # stacks those per-column Series into the result frame.
    return x2.apply(lambda right_col: x1.corrwith(right_col, method=method))
def is_float_column(col: pd.Series) -> bool:
    """Check if a column must be a float - meaning does it contain fractions.

    Parameters
    ----------
    col : pd.Series
        The column to check.

    Returns
    -------
    bool
        True if the column is float, False otherwise.
    """
    # A float dtype alone is not enough: a float column whose values are all
    # whole numbers does not *need* to be float.
    if is_float_dtype(col):
        return (col != col.round()).any()
    return False
def cast_categorical_to_object_dtype(df: pd.DataFrame) -> pd.DataFrame:
    """Cast categorical columns to the object dtype."""
    # NOTE:
    # pandas have bug with groupby on category dtypes,
    # so until it fixed, change dtypes manually
    cat_cols = [name for name, dtype in df.dtypes.items() if dtype == 'category']
    if not cat_cols:
        return df
    return df.astype(dict.fromkeys(cat_cols, 'object'))
def hide_index_for_display(df: t.Union[pd.DataFrame, pd.io.formats.style.Styler]) -> pd.io.formats.style.Styler:
    """Hide the index of a dataframe for display."""
    if isinstance(df, pd.DataFrame):
        styler = df.style
    else:
        styler = df
    # pandas >= 1.4 replaced Styler.hide_index() with Styler.hide(axis=...);
    # fall back to the deprecated method on older versions.
    hide = getattr(styler, 'hide', None)
    if hide is not None:
        return hide(axis='index')
    return styler.hide_index()