-
Notifications
You must be signed in to change notification settings - Fork 4
/
utils.py
271 lines (221 loc) · 11.6 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
from __future__ import annotations
from pathlib import Path
from typing import Any
from unittest import mock
import numpy as np
import pandas as pd
import regex
from dsp_tools.commands.excel2json.models.input_error import DuplicatesInColumnProblem
from dsp_tools.commands.excel2json.models.input_error import InvalidSheetNameProblem
from dsp_tools.commands.excel2json.models.input_error import RequiredColumnMissingProblem
from dsp_tools.models.exceptions import InputError
languages = ["en", "de", "fr", "it", "rm"]
def read_and_clean_all_sheets(excelfile: str | Path) -> dict[str, pd.DataFrame]:
"""
This function reads an Excel file with all its sheets.
If there is a ValueError, it patches the openpyxl part that causes the error
and opens it with that patch.
It cleans the dataframes and then returns them in the form {sheet_name: dataframe}.
Args:
excelfile: path to the Excel file
Returns:
All sheets of the excel file, in the form of a dictionary {sheet_name: dataframe}
Raises:
InputError: If the sheets are not correctly named
"""
try:
df_dict = pd.read_excel(excelfile, sheet_name=None)
except ValueError:
# Pandas relies on openpyxl to parse XLSX files.
# A strange behavior of openpyxl prevents pandas from opening files with some formatting properties
# (unclear which formatting properties exactly).
# Apparently, the excel2json test files have one of the unsupported formatting properties.
# Credits: https://stackoverflow.com/a/70537454/14414188
with mock.patch("openpyxl.styles.fonts.Font.family.max", new=100):
df_dict = pd.read_excel(excelfile, sheet_name=None)
try:
return {name.strip(""): clean_data_frame(df) for name, df in df_dict.items()}
except AttributeError:
msg = InvalidSheetNameProblem(str(excelfile), list(df_dict.keys())).execute_error_protocol()
raise InputError(msg) from None
def clean_data_frame(df: pd.DataFrame) -> pd.DataFrame:
"""
This function takes a pd.DataFrame and removes:
- Leading and trailing spaces from the column names
- Leading and trailing spaces from each cell and any characters in the cells that are not part of any known
language, for example, linebreaks and replaces it with a pd.NA.
- Removes all rows that are empty in all columns
Args:
df: The pd.DataFrame that is to be cleaned
Returns:
pd.DataFrame which has the above-mentioned removed
"""
# Remove leading and trailing blanks in column names and make them lower case
df = df.rename(columns=lambda x: x.strip().lower())
# Remove the values of all cells that do not at least contain one character of any known language and removes
# leading and trailing spaces.
df = df.map(
lambda x: str(x).strip() if pd.notna(x) and regex.search(r"[\w\p{L}]", str(x), flags=regex.U) else pd.NA
)
# drop all the rows that are entirely empty
df = df.dropna(axis=0, how="all")
return df
def check_contains_required_columns(
df: pd.DataFrame, required_columns: set[str]
) -> None | RequiredColumnMissingProblem:
"""
This function checks if all the columns from the set are in the pd.DataFrame.
Additional columns to the ones in the set are allowed.
Args:
df: pd.DataFrame that is checked
required_columns: set of column names
Returns:
An object if there is a problem else None.
"""
if not required_columns.issubset(set(df.columns)):
required = list(required_columns.difference(set(df.columns)))
return RequiredColumnMissingProblem(columns=required)
return None
def check_column_for_duplicate(df: pd.DataFrame, to_check_column: str) -> None | DuplicatesInColumnProblem:
"""
This function checks if a specified column contains duplicate values.
Empty cells (pd.NA) also count as duplicates.
Args:
df: pd.DataFrame that is checked for duplicates
to_check_column: Name of the column that must not contain duplicates
Returns:
If there are problems it returns an object that stores the relevant user information.
"""
if df[to_check_column].duplicated().any():
duplicate_values = df[to_check_column][df[to_check_column].duplicated()].tolist()
return DuplicatesInColumnProblem(
column=to_check_column,
duplicate_values=duplicate_values,
)
else:
return None
def check_required_values(df: pd.DataFrame, required_values_columns: list[str]) -> dict[str, pd.Series[bool]]:
"""
If there are any empty cells in the column, it adds the column name and a boolean pd.Series to the dictionary.
If there are no empty cells, then it is not included in the dictionary.
If no column has any empty cells, then it returns an empty dictionary.
Args:
df: pd.DataFrame that is checked
required_values_columns: a list of column names that may not contain empty cells
Returns:
a dictionary with the column names as key and pd.Series as values if there are any empty cells
"""
# It checks if any of the values in a specified column are empty. If they are, they are added to the dictionary
# with the column name as key and a boolean series as value that contain true for every pd.NA
# If all the columns are filled, then it returns an empty dictionary.
return {col: df[col].isnull() for col in required_values_columns if df[col].isnull().any()}
def turn_bool_array_into_index_numbers(series: pd.Series[bool], true_remains: bool = True) -> list[int]:
"""
This function takes a pd.Series containing boolean values.
By default, this method extracts the index numbers of the True values.
If the index numbers of the False values are required, the parameter "true_remains" should be turned to False.
Args:
series: pd.Series, which only contains True and False values
true_remains: True if the index numbers of True are required, likewise with False
Returns:
A list of index numbers
"""
# If the False are required, we need to invert the array.
if not true_remains:
series = ~series
return list(series[series].index)
def get_wrong_row_numbers(
wrong_row_dict: dict[str, pd.Series[bool]], true_remains: bool = True
) -> dict[str, list[int]]:
"""
From the boolean pd.Series the index numbers of the True values are extracted.
The resulting list is the new value of the dictionary.
This new dictionary is taken and to each index number 2 is added, so that it corresponds to the Excel row number.
Args:
wrong_row_dict: The dictionary which contains column names and a boolean pd.Series
true_remains: If True then the index of True is taken, if False then the index of False values is taken
Returns:
Dictionary with the column name as key and the row number as a list.
"""
wrong_row_int_dict = {
k: turn_bool_array_into_index_numbers(series=v, true_remains=true_remains) for k, v in wrong_row_dict.items()
}
return {k: [x + 2 for x in v] for k, v in wrong_row_int_dict.items()}
def get_labels(df_row: pd.Series[Any]) -> dict[str, str]:
"""
This function takes a pd.Series which has "label_[language tag]" in the index.
If the value of the index is not pd.NA, the language tag and the value are added to a dictionary.
If it is empty, it is omitted from the dictionary.
Args:
df_row: a pd.Series (usually a row of a pd.DataFrame) from which the content of the columns containing the
label is extracted
Returns:
A dictionary with the language tag and the content of the cell
"""
return {lang: df_row[f"label_{lang}"] for lang in languages if not pd.isna(df_row[f"label_{lang}"])}
def get_comments(df_row: pd.Series[Any]) -> dict[str, str] | None:
"""
This function takes a pd.Series which has "comment_[language tag]" in the index.
If the value of the index is not pd.NA, the language tag and the value are added to a dictionary.
If it is empty, it is omitted from the dictionary.
Args:
df_row: a pd.Series (usually a row of a pd.DataFrame) from which the content of the columns containing the
comment is extracted
Returns:
A dictionary with the language tag and the content of the cell
"""
comments = {lang: df_row[f"comment_{lang}"] for lang in languages if not pd.isna(df_row[f"comment_{lang}"])}
return comments or None
def find_one_full_cell_in_cols(df: pd.DataFrame, required_columns: list[str]) -> pd.Series[bool] | None:
"""
This function takes a pd.DataFrame and a list of column names where at least one cell must have a value per row.
A pd.Series with boolean values is returned, True if any rows do not have a value in at least one column
Args:
df: The pd.DataFrame which should be checked
required_columns: A list of column names where at least one cell per row must have a value
Returns:
None if there is no problem or a pd.Series if there is a problem in a row
"""
# The series has True if the cell is empty
# In order to combine more than two arrays, we need to reduce the arrays, which takes a tuple
result_arrays = tuple(df[col].isnull() for col in required_columns)
# If all are True logical_and returns True otherwise False
combined_array = np.logical_and.reduce(result_arrays)
# if any of the values are True, it is turned into a pd.Series
return pd.Series(combined_array) if any(combined_array) else None
def col_must_or_not_empty_based_on_other_col(
df: pd.DataFrame,
substring_list: list[str],
substring_colname: str,
check_empty_colname: str,
must_have_value: bool,
) -> pd.Series[bool] | None:
"""
It is presumed that the column "substring_colname" has no empty cells.
Based on the string content of the individual rows, which is specified in the "substring_list",
the cell in the column "check_empty_colname" is checked whether it is empty or not.
The "substring_list" contains the different possibilities regarding the content of the cell.
If the parameter "must_have_value" is True, then the cell in the "check_empty_colname" column must not be empty.
If the parameter is set to False, then it must be empty.
Args:
df: The pd.DataFrame which is checked
substring_list: A list of possible information that could be in the column "substring_colname"
substring_colname: The name of the column that may contain any of the sub-strings
check_empty_colname: The name of the column which is checked if it is empty or not
must_have_value: True if the "check_empty_colname" should have a value or the reverse.
Returns:
None if all rows are correctly filled or empty.
A series which contains True values for the rows, where it does
not comply with the specifications.
"""
na_series = df[check_empty_colname].isna()
# If the cells have to be empty, we need to reverse the series
if not must_have_value:
na_series = ~na_series
# This returns True if it finds the substring in the cell, they are joined in a RegEx "|" which denotes "or".
# If it does not match any of the sub-strings, then the RegEx returns False,
# which means that the value in the column "check_empty_colname" is not relevant.
substring_array = df[substring_colname].str.contains("|".join(substring_list), na=False, regex=True)
# If both are True logical_and returns True otherwise False
combined_array = np.logical_and(na_series, substring_array)
return pd.Series(combined_array) if any(combined_array) else None