
Commit

Merge pull request #76 from dayyass/develop
release v0.1.4
dayyass committed Oct 10, 2021
2 parents fe805a3 + 56cf117 commit c9d6876
Showing 16 changed files with 161 additions and 110 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -19,7 +19,7 @@ dist
*.egg-info/

data/train.csv
data/valid.csv
data/test.csv

models/*
!models/README.md
2 changes: 1 addition & 1 deletion Dockerfile
@@ -5,7 +5,7 @@ WORKDIR /workdir

COPY config.yaml ./
COPY hyperparams.py ./
COPY data/train.csv data/valid.csv data/
COPY data/train.csv data/test.csv data/

RUN pip install --upgrade pip && \
pip install --no-cache-dir text-classification-baseline
20 changes: 20 additions & 0 deletions Makefile
@@ -0,0 +1,20 @@
all:
	python -m text_clf --path_to_config config.yaml
load_data:
	python data/load_20newsgroups.py
coverage:
	coverage run -m unittest discover && coverage report -m
docker_build:
	docker image build -t text-classification-baseline .
docker_run:
	docker container run -it text-classification-baseline
pypi_packages:
	pip install --upgrade build twine
pypi_build:
	python -m build
pypi_twine:
	python -m twine upload --repository testpypi dist/*
pypi_clean:
	rm -rf dist text_classification_baseline.egg-info
clean:
	rm -rf models/model*
6 changes: 3 additions & 3 deletions README.md
@@ -30,7 +30,7 @@ text-clf-train --path_to_config config.yaml
```python3
import text_clf

text_clf.train(path_to_config="config.yaml")
model, target_names_mapping = text_clf.train(path_to_config="config.yaml")
```

**NOTE**: more about config file [here](https://github.com/dayyass/text-classification-baseline/tree/main#config).
@@ -55,7 +55,7 @@ text-clf-train --path_to_config config.yaml
```python3
import text_clf

text_clf.train(path_to_config="config.yaml")
model, target_names_mapping = text_clf.train(path_to_config="config.yaml")
```

Default **config.yaml**:
@@ -66,7 +66,7 @@ path_to_save_folder: models
# data
data:
train_data_path: data/train.csv
valid_data_path: data/valid.csv
test_data_path: data/test.csv
sep: ','
text_column: text
target_column: target_name_short
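
The README examples above now show `text_clf.train` returning the fitted model together with the target names mapping. A minimal usage sketch of those return values, assuming the returned object is the fitted scikit-learn Pipeline (tf-idf + logreg) described in this release and that its predictions are the integer class ids that `target_names_mapping` maps back to class names; the sample text and variable names are illustrative only:

```python3
import text_clf

model, target_names_mapping = text_clf.train(path_to_config="config.yaml")

# train() returns (None, None) if it caught an exception, so guard before predicting
if model is not None:
    labels = model.predict(["an example document to classify"])  # raw text in, class ids out
    print([target_names_mapping[label] for label in labels])
```
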
2 changes: 1 addition & 1 deletion config.yaml
@@ -4,7 +4,7 @@ path_to_save_folder: models
# data
data:
train_data_path: data/train.csv
valid_data_path: data/valid.csv
test_data_path: data/test.csv
sep: ','
text_column: text
target_column: target_name_short
2 changes: 1 addition & 1 deletion data/README.md
@@ -4,5 +4,5 @@ Folder for storing datasets.

To download [**the 20 newsgroups text dataset**](https://scikit-learn.org/stable/datasets/real_world.html#newsgroups-dataset) run the following command:
```
python fetch_20newsgroups.py
python load_20newsgroups.py
```
19 changes: 9 additions & 10 deletions data/load_20newsgroups.py
@@ -6,12 +6,13 @@


def make_df_from_bunch(bunch: Bunch) -> pd.DataFrame:
"""
Make pd.DataFrame from 20newsgroups bunch.
"""Make pd.DataFrame from 20newsgroups bunch.
Args:
bunch (Bunch): 20newsgroups bunch.
:param Bunch bunch: 20newsgroups bunch.
:return: 20newsgroups DataFrame.
:rtype: pd.DataFrame
Returns:
pd.DataFrame: 20newsgroups DataFrame.
"""

df = pd.DataFrame(
@@ -27,20 +28,18 @@ def make_df_from_bunch(bunch: Bunch) -> pd.DataFrame:


def load_20newsgroups() -> None:
"""
Load 20newsgroups dataset.
"""
"""Load 20newsgroups dataset."""

train_bunch = fetch_20newsgroups(subset="train")
test_bunch = fetch_20newsgroups(subset="test")

df_train = make_df_from_bunch(train_bunch)
df_valid = make_df_from_bunch(test_bunch)
df_test = make_df_from_bunch(test_bunch)

os.makedirs("data", exist_ok=True)

df_train.to_csv("data/train.csv", index=False)
df_valid.to_csv("data/valid.csv", index=False)
df_test.to_csv("data/test.csv", index=False)


if __name__ == "__main__":
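
With this change the loader writes `data/train.csv` and `data/test.csv` (previously `data/valid.csv`). A quick sanity check of the generated files, assuming the default column names from `config.yaml` (`text` and `target_name_short`); run `python data/load_20newsgroups.py` first:

```python3
import pandas as pd

# inspect the test split produced by data/load_20newsgroups.py
df_test = pd.read_csv("data/test.csv")
print(df_test.shape)
print(df_test[["text", "target_name_short"]].head())  # columns referenced by the default config
```
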
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = text-classification-baseline
version = 0.1.3
version = 0.1.4
author = Dani El-Ayyass
author_email = dayyass@yandex.ru
description = TF-IDF + LogReg baseline for text classification
20 changes: 5 additions & 15 deletions tests/test_pipeline.py
@@ -9,15 +9,11 @@


class TestUsage(unittest.TestCase):
"""
Class for testing pipeline.
"""
"""Class for testing pipeline."""

@classmethod
def setUpClass(cls) -> None:
"""
SetUp tests with config and data.
"""
"""SetUp tests with config and data."""

path_to_config = "config.yaml"

@@ -28,16 +24,12 @@ def setUpClass(cls) -> None:
load_20newsgroups()

def test_train(self) -> None:
"""
Testing train function.
"""
"""Testing train function."""

train(path_to_config="config.yaml")

def test_train_grid_search(self) -> None:
"""
Testing train function with grid_search.
"""
"""Testing train function with grid_search."""

with open("config.yaml", mode="r") as fp:
config = yaml.safe_load(fp)
@@ -56,9 +48,7 @@ def test_train_grid_search(self) -> None:

@classmethod
def tearDownClass(cls) -> None:
"""
TearDown after tests.
"""
"""TearDown after tests."""

os.remove("config_grid_search.yaml")

2 changes: 1 addition & 1 deletion text_clf/__init__.py
@@ -1,4 +1,4 @@
from .__main__ import train

__version__ = "0.1.3"
__version__ = "0.1.4"
__all__ = ["train"]
31 changes: 21 additions & 10 deletions text_clf/__main__.py
@@ -1,15 +1,22 @@
import traceback
from typing import Dict, Tuple

from sklearn.pipeline import Pipeline

from .config import get_config
from .train import _train
from .utils import close_logger, get_argparse, get_logger


def train(path_to_config: str) -> None:
"""
Function to train baseline model with exception handler.
def train(path_to_config: str) -> Tuple[Pipeline, Dict[int, str]]:
"""Function to train baseline model with exception handler.
:param str path_to_config: path to config.
Args:
path_to_config (str): Path to config.
Returns:
Tuple[Pipeline, Dict[int, str]]:
Model pipeline (tf-idf + logreg) and target names mapping. Both None if any exception occurred.
"""

# load config
@@ -19,30 +26,34 @@
logger = get_logger(path_to_logfile=config["path_to_save_logfile"])

try:
_train(
pipe, target_names_mapping = _train(
config=config,
logger=logger,
)

except: # noqa
close_logger(logger)

print(traceback.format_exc())

pipe, target_names_mapping = None, None # type: ignore

return pipe, target_names_mapping


def main() -> int:
"""
Main function to train baseline model.
"""Main function to train baseline model.
:return: exit code.
:rtype: int
Returns:
int: Exit code.
"""

# argument parser
parser = get_argparse()
args = parser.parse_args()

# train
train(path_to_config=args.path_to_config)
_ = train(path_to_config=args.path_to_config)

return 0

24 changes: 14 additions & 10 deletions text_clf/config.py
@@ -10,12 +10,13 @@


def get_config(path_to_config: str) -> Dict[str, Any]:
"""
Get config.
"""Get config.
Args:
path_to_config (str): Path to config.
:param str path_to_config: path to config.
:return: config.
:rtype: Dict[str, Any]
Returns:
Dict[str, Any]: Config.
"""

with open(path_to_config, mode="r") as fp:
@@ -55,11 +56,14 @@ def load_default_config(
path_to_save_folder: str = ".",
filename: str = "config.yaml",
) -> None:
"""
Function to load default config.
"""Function to load default config.
Args:
path_to_save_folder (str, optional): Path to save folder. Defaults to ".".
filename (str, optional): Filename. Defaults to "config.yaml".
:param str path_to_save_folder: path to save folder (default: '.').
:param str filename: filename (default: 'config.yaml').
Raises:
FileExistsError: Raise error if config file already exists.
"""

# get logger
@@ -84,7 +88,7 @@
"# data",
"data:",
" train_data_path: data/train.csv",
" valid_data_path: data/valid.csv",
" test_data_path: data/test.csv",
" sep: ','",
" text_column: text",
" target_column: target_name_short",
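
The rewritten docstring for `load_default_config` now documents that a `FileExistsError` is raised when the target config file already exists. A small sketch of that contract, using the signature from the diff above; the filename here is hypothetical, chosen to avoid clobbering the repository's own `config.yaml`:

```python3
from text_clf.config import load_default_config

# write a fresh default config to the current folder
load_default_config(path_to_save_folder=".", filename="default_config.yaml")

# a second call with the same target should raise FileExistsError,
# as the updated docstring documents
```
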
21 changes: 11 additions & 10 deletions text_clf/data.py
@@ -6,12 +6,13 @@
def load_data(
config: Dict[str, Any]
) -> Tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
"""
Load data.
"""Load data.
Args:
config (Dict[str, Any]): Config.
:param Dict[str, Any] config: config.
:return: X_train, X_valid, y_train, y_valid.
:rtype: Tuple[pd.Series, pd.Series, pd.Series, pd.Series]
Returns:
Tuple[pd.Series, pd.Series, pd.Series, pd.Series]: X_train, X_test, y_train, y_test.
"""

text_column = config["data"]["text_column"]
@@ -26,15 +27,15 @@ def load_data(
usecols=usecols,
)

df_valid = pd.read_csv(
config["data"]["valid_data_path"],
df_test = pd.read_csv(
config["data"]["test_data_path"],
sep=sep,
usecols=usecols,
)

X_train = df_train[text_column]
X_valid = df_valid[text_column]
X_test = df_test[text_column]
y_train = df_train[target_column]
y_valid = df_valid[target_column]
y_test = df_test[target_column]

return X_train, X_valid, y_train, y_valid
return X_train, X_test, y_train, y_test
12 changes: 6 additions & 6 deletions text_clf/save.py
@@ -11,16 +11,16 @@ def save_model(
target_names_mapping: Dict[int, str],
config: Dict[str, Any],
) -> None:
"""
Save:
- model pipeline (tf-idf + model)
"""Save:
- model pipeline (tf-idf + logreg)
- target names mapping
- config
- hyper-parameters grid (from config)
:param Pipeline pipe: model pipeline (tf-idf + model).
:param Dict[int, str] target_names_mapping: name for each class.
:param Dict[str, Any] config: config.
Args:
pipe (Pipeline): Model pipeline (tf-idf + logreg).
target_names_mapping (Dict[int, str]): Name for each class.
config (Dict[str, Any]): Config.
"""

# save pipe
