
Commit

Merge pull request #76 from dayyass/develop
release v0.1.4
dayyass committed Oct 10, 2021
2 parents fe805a3 + 56cf117 commit c9d6876
Showing 16 changed files with 161 additions and 110 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -19,7 +19,7 @@ dist
*.egg-info/

data/train.csv
data/valid.csv
data/test.csv

models/*
!models/README.md
2 changes: 1 addition & 1 deletion Dockerfile
@@ -5,7 +5,7 @@ WORKDIR /workdir

COPY config.yaml ./
COPY hyperparams.py ./
COPY data/train.csv data/valid.csv data/
COPY data/train.csv data/test.csv data/

RUN pip install --upgrade pip && \
pip install --no-cache-dir text-classification-baseline
20 changes: 20 additions & 0 deletions Makefile
@@ -0,0 +1,20 @@
all:
	python -m text_clf --path_to_config config.yaml
load_data:
	python data/load_20newsgroups.py
coverage:
	coverage run -m unittest discover && coverage report -m
docker_build:
	docker image build -t text-classification-baseline .
docker_run:
	docker container run -it text-classification-baseline
pypi_packages:
	pip install --upgrade build twine
pypi_build:
	python -m build
pypi_twine:
	python -m twine upload --repository testpypi dist/*
pypi_clean:
	rm -rf dist text_classification_baseline.egg-info
clean:
	rm -rf models/model*
6 changes: 3 additions & 3 deletions README.md
@@ -30,7 +30,7 @@ text-clf-train --path_to_config config.yaml
```python3
import text_clf

text_clf.train(path_to_config="config.yaml")
model, target_names_mapping = text_clf.train(path_to_config="config.yaml")
```

**NOTE**: more about config file [here](https://github.com/dayyass/text-classification-baseline/tree/main#config).
@@ -55,7 +55,7 @@ text-clf-train --path_to_config config.yaml
```python3
import text_clf

text_clf.train(path_to_config="config.yaml")
model, target_names_mapping = text_clf.train(path_to_config="config.yaml")
```

Default **config.yaml**:
@@ -66,7 +66,7 @@ path_to_save_folder: models
# data
data:
train_data_path: data/train.csv
valid_data_path: data/valid.csv
test_data_path: data/test.csv
sep: ','
text_column: text
target_column: target_name_short
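
The README examples above now show `text_clf.train` returning the fitted model together with the target names mapping. A minimal usage sketch of those return values, assuming the returned object is the fitted scikit-learn Pipeline (tf-idf + logreg) described in this release and that its predictions are the integer class ids that `target_names_mapping` maps back to class names; the sample text and variable names are illustrative only:

```python3
import text_clf

model, target_names_mapping = text_clf.train(path_to_config="config.yaml")

# train() returns (None, None) if it caught an exception, so guard before predicting
if model is not None:
    labels = model.predict(["an example document to classify"])  # raw text in, class ids out
    print([target_names_mapping[label] for label in labels])
```
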
2 changes: 1 addition & 1 deletion config.yaml
@@ -4,7 +4,7 @@ path_to_save_folder: models
# data
data:
train_data_path: data/train.csv
valid_data_path: data/valid.csv
test_data_path: data/test.csv
sep: ','
text_column: text
target_column: target_name_short
2 changes: 1 addition & 1 deletion data/README.md
@@ -4,5 +4,5 @@ Folder for storing datasets.

To download [**the 20 newsgroups text dataset**](https://scikit-learn.org/stable/datasets/real_world.html#newsgroups-dataset) run the following command:
```
python fetch_20newsgroups.py
python load_20newsgroups.py
```
19 changes: 9 additions & 10 deletions data/load_20newsgroups.py
@@ -6,12 +6,13 @@


def make_df_from_bunch(bunch: Bunch) -> pd.DataFrame:
"""
Make pd.DataFrame from 20newsgroups bunch.
"""Make pd.DataFrame from 20newsgroups bunch.
Args:
bunch (Bunch): 20newsgroups bunch.
:param Bunch bunch: 20newsgroups bunch.
:return: 20newsgroups DataFrame.
:rtype: pd.DataFrame
Returns:
pd.DataFrame: 20newsgroups DataFrame.
"""

df = pd.DataFrame(
@@ -27,20 +28,18 @@ def make_df_from_bunch(bunch: Bunch) -> pd.DataFrame:


def load_20newsgroups() -> None:
"""
Load 20newsgroups dataset.
"""
"""Load 20newsgroups dataset."""

train_bunch = fetch_20newsgroups(subset="train")
test_bunch = fetch_20newsgroups(subset="test")

df_train = make_df_from_bunch(train_bunch)
df_valid = make_df_from_bunch(test_bunch)
df_test = make_df_from_bunch(test_bunch)

os.makedirs("data", exist_ok=True)

df_train.to_csv("data/train.csv", index=False)
df_valid.to_csv("data/valid.csv", index=False)
df_test.to_csv("data/test.csv", index=False)


if __name__ == "__main__":
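
With this change the loader writes `data/train.csv` and `data/test.csv` (previously `data/valid.csv`). A quick sanity check of the generated files, assuming the default column names from `config.yaml` (`text` and `target_name_short`); run `python data/load_20newsgroups.py` first:

```python3
import pandas as pd

# inspect the test split produced by data/load_20newsgroups.py
df_test = pd.read_csv("data/test.csv")
print(df_test.shape)
print(df_test[["text", "target_name_short"]].head())  # columns referenced by the default config
```
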
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = text-classification-baseline
version = 0.1.3
version = 0.1.4
author = Dani El-Ayyass
author_email = dayyass@yandex.ru
description = TF-IDF + LogReg baseline for text classification
20 changes: 5 additions & 15 deletions tests/test_pipeline.py
@@ -9,15 +9,11 @@


class TestUsage(unittest.TestCase):
"""
Class for testing pipeline.
"""
"""Class for testing pipeline."""

@classmethod
def setUpClass(cls) -> None:
"""
SetUp tests with config and data.
"""
"""SetUp tests with config and data."""

path_to_config = "config.yaml"

@@ -28,16 +24,12 @@ def setUpClass(cls) -> None:
load_20newsgroups()

def test_train(self) -> None:
"""
Testing train function.
"""
"""Testing train function."""

train(path_to_config="config.yaml")

def test_train_grid_search(self) -> None:
"""
Testing train function with grid_search.
"""
"""Testing train function with grid_search."""

with open("config.yaml", mode="r") as fp:
config = yaml.safe_load(fp)
@@ -56,9 +48,7 @@ def test_train_grid_search(self) -> None:

@classmethod
def tearDownClass(cls) -> None:
"""
TearDown after tests.
"""
"""TearDown after tests."""

os.remove("config_grid_search.yaml")

2 changes: 1 addition & 1 deletion text_clf/__init__.py
@@ -1,4 +1,4 @@
from .__main__ import train

__version__ = "0.1.3"
__version__ = "0.1.4"
__all__ = ["train"]
31 changes: 21 additions & 10 deletions text_clf/__main__.py
@@ -1,15 +1,22 @@
import traceback
from typing import Dict, Tuple

from sklearn.pipeline import Pipeline

from .config import get_config
from .train import _train
from .utils import close_logger, get_argparse, get_logger


def train(path_to_config: str) -> None:
"""
Function to train baseline model with exception handler.
def train(path_to_config: str) -> Tuple[Pipeline, Dict[int, str]]:
"""Function to train baseline model with exception handler.
:param str path_to_config: path to config.
Args:
path_to_config (str): Path to config.
Returns:
Tuple[Pipeline, Dict[int, str]]:
Model pipeline (tf-idf + logreg) and target names mapping. Both None if any exception occurred.
"""

# load config
@@ -19,30 +26,34 @@
logger = get_logger(path_to_logfile=config["path_to_save_logfile"])

try:
_train(
pipe, target_names_mapping = _train(
config=config,
logger=logger,
)

except: # noqa
close_logger(logger)

print(traceback.format_exc())

pipe, target_names_mapping = None, None # type: ignore

return pipe, target_names_mapping


def main() -> int:
"""
Main function to train baseline model.
"""Main function to train baseline model.
:return: exit code.
:rtype: int
Returns:
int: Exit code.
"""

# argument parser
parser = get_argparse()
args = parser.parse_args()

# train
train(path_to_config=args.path_to_config)
_ = train(path_to_config=args.path_to_config)

return 0

24 changes: 14 additions & 10 deletions text_clf/config.py
@@ -10,12 +10,13 @@


def get_config(path_to_config: str) -> Dict[str, Any]:
"""
Get config.
"""Get config.
Args:
path_to_config (str): Path to config.
:param str path_to_config: path to config.
:return: config.
:rtype: Dict[str, Any]
Returns:
Dict[str, Any]: Config.
"""

with open(path_to_config, mode="r") as fp:
@@ -55,11 +56,14 @@ def load_default_config(
path_to_save_folder: str = ".",
filename: str = "config.yaml",
) -> None:
"""
Function to load default config.
"""Function to load default config.
Args:
path_to_save_folder (str, optional): Path to save folder. Defaults to ".".
filename (str, optional): Filename. Defaults to "config.yaml".
:param str path_to_save_folder: path to save folder (default: '.').
:param str filename: filename (default: 'config.yaml').
Raises:
FileExistsError: Raise error if config file already exists.
"""

# get logger
@@ -84,7 +88,7 @@
"# data",
"data:",
" train_data_path: data/train.csv",
" valid_data_path: data/valid.csv",
" test_data_path: data/test.csv",
" sep: ','",
" text_column: text",
" target_column: target_name_short",
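
The rewritten docstring for `load_default_config` now documents that a `FileExistsError` is raised when the target config file already exists. A small sketch of that contract, using the signature from the diff above; the filename here is hypothetical, chosen to avoid clobbering the repository's own `config.yaml`:

```python3
from text_clf.config import load_default_config

# write a fresh default config to the current folder
load_default_config(path_to_save_folder=".", filename="default_config.yaml")

# a second call with the same target should raise FileExistsError,
# as the updated docstring documents
```
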
21 changes: 11 additions & 10 deletions text_clf/data.py
@@ -6,12 +6,13 @@
def load_data(
config: Dict[str, Any]
) -> Tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
"""
Load data.
"""Load data.
Args:
config (Dict[str, Any]): Config.
:param Dict[str, Any] config: config.
:return: X_train, X_valid, y_train, y_valid.
:rtype: Tuple[pd.Series, pd.Series, pd.Series, pd.Series]
Returns:
Tuple[pd.Series, pd.Series, pd.Series, pd.Series]: X_train, X_test, y_train, y_test.
"""

text_column = config["data"]["text_column"]
@@ -26,15 +27,15 @@ def load_data(
usecols=usecols,
)

df_valid = pd.read_csv(
config["data"]["valid_data_path"],
df_test = pd.read_csv(
config["data"]["test_data_path"],
sep=sep,
usecols=usecols,
)

X_train = df_train[text_column]
X_valid = df_valid[text_column]
X_test = df_test[text_column]
y_train = df_train[target_column]
y_valid = df_valid[target_column]
y_test = df_test[target_column]

return X_train, X_valid, y_train, y_valid
return X_train, X_test, y_train, y_test
12 changes: 6 additions & 6 deletions text_clf/save.py
@@ -11,16 +11,16 @@ def save_model(
target_names_mapping: Dict[int, str],
config: Dict[str, Any],
) -> None:
"""
Save:
- model pipeline (tf-idf + model)
"""Save:
- model pipeline (tf-idf + logreg)
- target names mapping
- config
- hyper-parameters grid (from config)
:param Pipeline pipe: model pipeline (tf-idf + model).
:param Dict[int, str] target_names_mapping: name for each class.
:param Dict[str, Any] config: config.
Args:
pipe (Pipeline): Model pipeline (tf-idf + logreg).
target_names_mapping (Dict[int, str]): Name for each class.
config (Dict[str, Any]): Config.
"""

# save pipe
