From aba9aefa8af8bf9086a102118b2f69d5856a84f2 Mon Sep 17 00:00:00 2001 From: Nora-Olivia-Ammann <103038637+Nora-Olivia-Ammann@users.noreply.github.com> Date: Thu, 1 Feb 2024 12:16:30 +0100 Subject: [PATCH] chore: remove dead code with vulture (#790) --- src/dsp_tools/commands/excel2json/utils.py | 46 ------------------- .../commands/fast_xmlupload/process_files.py | 34 -------------- .../commands/excel2json/test_utils.py | 9 ---- 3 files changed, 89 deletions(-) diff --git a/src/dsp_tools/commands/excel2json/utils.py b/src/dsp_tools/commands/excel2json/utils.py index 82e09e76a..7e27441d7 100644 --- a/src/dsp_tools/commands/excel2json/utils.py +++ b/src/dsp_tools/commands/excel2json/utils.py @@ -1,7 +1,6 @@ from __future__ import annotations from pathlib import Path -from typing import Any from unittest import mock import numpy as np @@ -18,33 +17,6 @@ languages = ["en", "de", "fr", "it", "rm"] -def read_and_clean_excel_file(excelfile: str, sheetname: str | int = 0) -> pd.DataFrame: - """ - This function reads an Excel file, if there is a ValueError then it patches the openpyxl part that creates the - error and opens it with that patch. - It cleans and then returns the pd.DataFrame. - - Args: - excelfile: The name of the Excel file - sheetname: The name or index (zero-based) of the Excel sheet, by default it reads the first - - Returns: - A pd.DataFrame - """ - try: - read_df: pd.DataFrame = pd.read_excel(excelfile, sheet_name=sheetname) - except ValueError: - # Pandas relies on openpyxl to parse XLSX files. - # A strange behavior of openpyxl prevents pandas from opening files with some formatting properties - # (unclear which formatting properties exactly). - # Apparently, the excel2json test files have one of the unsupported formatting properties. - # Credits: https://stackoverflow.com/a/70537454/14414188 - with mock.patch("openpyxl.styles.fonts.Font.family.max", new=100): - read_df = pd.read_excel(excelfile, sheet_name=sheetname) - read_df = clean_data_frame(df=read_df) - return read_df - - def read_and_clean_all_sheets(excelfile: str | Path) -> dict[str, pd.DataFrame]: """ This function reads an Excel file with all its sheets. @@ -204,24 +176,6 @@ def get_wrong_row_numbers(wrong_row_dict: dict[str, pd.Series], true_remains: bo return {k: [x + 2 for x in v] for k, v in wrong_row_dict.items()} -def update_dict_if_not_value_none(additional_dict: dict[Any, Any], to_update_dict: dict[Any, Any]) -> dict[Any, Any]: - """ - This function takes two dictionaries. - The "to_update_dict" should be updated with the information from the "additional_dict" - only if the value of a particular key is not None or pd.NA. - - Args: - additional_dict: The dictionary which contains information that may be transferred - to_update_dict: The dictionary to which the new information should be transferred - - Returns: - The "to_update_dict" which the additional information - """ - additional_dict = {k: v for k, v in additional_dict.items() if v is not None and v is not pd.NA} - to_update_dict.update(additional_dict) - return to_update_dict - - def get_labels(df_row: pd.Series) -> dict[str, str]: """ This function takes a pd.Series which has "label_[language tag]" in the index. diff --git a/src/dsp_tools/commands/fast_xmlupload/process_files.py b/src/dsp_tools/commands/fast_xmlupload/process_files.py index a45edb366..59d7bb1c0 100644 --- a/src/dsp_tools/commands/fast_xmlupload/process_files.py +++ b/src/dsp_tools/commands/fast_xmlupload/process_files.py @@ -766,40 +766,6 @@ def handle_interruption( sys.exit(1) -def double_check_unprocessed_files( - all_files: list[Path], - processed_files: list[Path], - unprocessed_files: list[Path], -) -> None: - """ - Checks if the files in 'unprocessed_files.txt' are consistent with the files in 'processed_files.txt'. - - Args: - all_files: list of all paths in the tags of the XML file - processed_files: the paths from 'processed_files.txt' - unprocessed_files: the paths from 'unprocessed_files.txt' (or all_files if there is no such file) - - Raises: - UserError: if there is a file 'unprocessed_files.txt', but no file 'processed_files.txt' - UserError: if the files 'unprocessed_files.txt' and 'processed_files.txt' are inconsistent - """ - unprocessed_files_txt_exists = sorted(unprocessed_files) != sorted(all_files) - if unprocessed_files_txt_exists and not processed_files: - logger.error("There is a file 'unprocessed_files.txt', but no file 'processed_files.txt'") - raise UserError("There is a file 'unprocessed_files.txt', but no file 'processed_files.txt'") - - if processed_files and sorted(unprocessed_files) == sorted(all_files): - logger.error("There is a file 'processed_files.txt', but no file 'unprocessed_files.txt'") - raise UserError("There is a file 'processed_files.txt', but no file 'unprocessed_files.txt'") - - if unprocessed_files_txt_exists: - # there is a 'unprocessed_files.txt' file. check it for consistency - unprocessed_files_from_processed_files = [x for x in all_files if x not in processed_files] - if sorted(unprocessed_files_from_processed_files) != sorted(unprocessed_files): - logger.error("The files 'unprocessed_files.txt' and 'processed_files.txt' are inconsistent") - raise UserError("The files 'unprocessed_files.txt' and 'processed_files.txt' are inconsistent") - - def process_files( input_dir: str, output_dir: str, diff --git a/test/unittests/commands/excel2json/test_utils.py b/test/unittests/commands/excel2json/test_utils.py index e39746ef3..21f34cacb 100644 --- a/test/unittests/commands/excel2json/test_utils.py +++ b/test/unittests/commands/excel2json/test_utils.py @@ -85,15 +85,6 @@ def test_get_wrong_row_numbers(self) -> None: returned_dict = utl.get_wrong_row_numbers(wrong_row_dict=original_dict, true_remains=True) self.assertDictEqual(expected_dict, returned_dict) - def test_update_dict_if_not_value_none(self) -> None: - original_dict = {0: 0} - original_update_dict = {1: 1, 2: 2, 3: None, 4: pd.NA, 5: "5"} - expected_dict = {0: 0, 1: 1, 2: 2, 5: "5"} - returned_dict = utl.update_dict_if_not_value_none( - additional_dict=original_update_dict, to_update_dict=original_dict - ) - self.assertDictEqual(expected_dict, returned_dict) - def test_find_one_full_cell_in_cols(self) -> None: required_cols = ["label_en", "label_de", "label_fr", "label_it", "label_rm"] original_df = pd.DataFrame(